# In this code we regularize patent objects

## First, we make sure that the acquired patent objects are complete, meaning:
   i - They all have their closest patent objects filled in.
   ii - For those that have closest patent objects filled in, the distance metrics must also be available.


### Define functions to load and save pickle files

In [5]:
# Define functions to load and save pickle files
import dill as pickle

def load(path):
    """Open ``path`` in binary mode and return the object unpickled from it.

    NOTE: unpickling can execute arbitrary code, so only load trusted files.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)
def save(path, file):
    """Pickle the object ``file`` into ``path``.

    The target is opened in ``"wb"`` mode, so an existing file is overwritten.
    """
    with open(path, "wb") as handle:
        pickle.dump(file, handle)

### Load id embeddings

In [None]:
import os

# Load ID embedding pairs
# The path is assembled with os.path.join so it uses the host OS separator
# instead of a hard-coded backslash.
id_emb_path = os.path.join(
    '00 Python data',
    '01 CLS embeddings',
    'CheckedPatents_CLSonly_2024.10.24_05_22.pkl',
)
id_embedding_pairs = load(id_emb_path)
print(f"Size of ID-Embedding pairs is: {len(id_embedding_pairs)}")







Size of ID-Embedding pairs is: 1859021


Size of ID-Embedding pairs is: 1859021


### Load field dicts and regularize

In [9]:
# Load field dict
import os 

# Folder holding the pickled field dictionaries; built with os.path.join so
# the separator matches the host OS instead of a hard-coded backslash.
field_dict_folder = os.path.join('00 Python data', '02 Field dictionaries')
field_dict_paths = [os.path.join(field_dict_folder, o) for o in os.listdir(field_dict_folder)]

print(f"The number of field dictionaries is: {len(field_dict_paths)}")

# Merged result: new_dict[field][year] -> {'size': ..., 'patents': ...}
new_dict = {}

# Regularize field dict
for path in field_dict_paths:
    field_dict = load(path)

    # Each loaded dictionary is walked as field -> year -> quasi patents
    for field, value in field_dict.items():
        for year, quasi_patents in value.items():

            size_of_quasi_patents = len(quasi_patents)
            print(f"Size of field dict {path} is: {size_of_quasi_patents}")

            # Create the field level and the year level the first time they are seen
            entry = new_dict.setdefault(field, {}).setdefault(year, {})

            # Both keys are tested explicitly. Writing
            # `'patents' and 'size' in entry` would only test 'size', because
            # a non-empty string literal is always truthy.
            if 'patents' in entry and 'size' in entry:
                # Already recorded from an earlier file: only verify the size agrees
                assert size_of_quasi_patents == entry['size'], f"Size of quasi-patents in new dict is {entry['size']} and in old dict is {size_of_quasi_patents}. They should be the same."
            else:
                # First occurrence of this field/year: record size and patents
                entry['size'] = size_of_quasi_patents
                entry['patents'] = quasi_patents
                
                

The number of field dictionaries is: 27
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_36.pkl is: 148
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_36.pkl is: 84
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_37.pkl is: 148
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_37.pkl is: 112
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_37.pkl is: 81
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_37.pkl is: 92
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_37.pkl is: 104
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_38.pkl is: 148
Size of field dict 00 Python data\02 Field dictionaries\Field dict 

In [10]:
# Show the year keys stored under each field of the merged dictionary
for field, years_for_field in new_dict.items():
    year_labels = list(years_for_field)
    print(f"Years for field {field} are: {year_labels}")

Years for field A45C are: ['1995', '2016']
Years for field G01G are: ['1994', '1997', '1995', '1996']
Years for field G07B are: ['1995', '1996']
Years for field G06Q are: ['1996', '2010', '2015', '2014', '2005', '2013', '2012', '2011', '2016', '2018', '2008', '2007', '2019', '2020', '2021', '2006', '2009', '2017']
Years for field G06F are: ['2008', '2012', '2011', '2013', '2016', '2005', '2009', '2010', '2006', '2014', '2007', '2015', '2003', '2018', '2019', '2017', '2020', '2021', '2022', '1995', '1999', '2000', '2001', '2004', '2002']
Years for field H01M are: ['1997', '1994']
Years for field C07C are: ['2006']
Years for field G01N are: ['1993', '2016']
Years for field C08F are: ['1996']
Years for field B01D are: ['2011']
Years for field C01G are: ['1994']
Years for field C08G are: ['2000']
Years for field B29C are: ['1997']
Years for field C08K are: ['1993']
Years for field C22B are: ['1992']
Years for field C07F are: ['1993']
Years for field C23C are: ['1994']
Years for field G01R 

In [14]:
# Collect the paths of the acquired patent objects
# The folder path is assembled with os.path.join so it uses the host OS
# separator instead of a hard-coded backslash.
patent_folder = os.path.join("00 Python data", "Drive", "03 Patents with pairs (group checked)")
# Keep only the files whose name ends in "before.pkl"
acquired_patent_paths = [os.path.join(patent_folder, o) for o in os.listdir(patent_folder) if o.endswith("before.pkl")]
print(f"Number of acquired patent objects is: {len(acquired_patent_paths)}")


print(acquired_patent_paths)

Number of acquired patent objects is: 189
['00 Python data\\Drive\\03 Patents with pairs (group checked)\\Adometry, Inc._2014-05-05_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\AKAMAI TECHNOLOGIES, INC._1999-05-31_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Alexa Internet_1999-06-09_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\America Online, Inc._2006-03-31_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Android Corporation_2005-07-30_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Anvato, Inc._2016-07-07_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Apigee Corporation_2016-11-09_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Apple Computer Inc._1997-08-05_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Applied Semantics, Inc._2003-04-22_before.pkl', '

Number of acquired patent objects is: 189
['00 Python data\\Drive\\03 Patents with pairs (group checked)\\Adometry, Inc._2014-05-05_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\AKAMAI TECHNOLOGIES, INC._1999-05-31_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Alexa Internet_1999-06-09_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\America Online, Inc._2006-03-31_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Android Corporation_2005-07-30_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Anvato, Inc._2016-07-07_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Apigee Corporation_2016-11-09_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Apple Computer Inc._1997-08-05_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Applied Semantics, Inc._2003-04-22_before.pkl', '

In [None]:
# Two counters, both starting at zero.
# NOTE(review): the names suggest tallies of patent objects whose closest
# patents are available ("a") vs. not available ("na") — confirm against the
# code that increments them.

counter_closest_a = 0
counter_closest_na = 0
