# In this code we regularize patent objects

## First, we make sure that the acquired patent object are complete, meaning:
   i - They all have closest patent objects filled.
   ii - Those who have closest patent object filled, their distance metrics should also be available.


### Define functions to load and save pickle files

In [5]:
# Define functions to load and save pickle files
import dill as pickle

def load(path):
    with open(path, mode = "rb") as f:
        file = pickle.load(f)
    return file
def save(path, file):
    with open(path, mode = "wb") as f:
        pickle.dump(file, f)

### Load id embeddings

In [None]:
import os

# Load ID embedding pairs
id_emb_path = r'00 Python data\01 CLS embeddings\CheckedPatents_CLSonly_2024.10.24_05_22.pkl'
id_embedding_pairs = load(id_emb_path)
print(f"Size of ID-Embedding pairs is: {len(id_embedding_pairs)}")







Size of ID-Embedding pairs is: 1859021


Size of ID-Embedding pairs is: 1859021


## Field dictionary regularization

### Load field dicts and regularize

In [9]:
# Load field dict
import os 

field_dict_folder = r'00 Python data\02 Field dictionaries'
field_dict_paths = [os.path.join(field_dict_folder, o) for o in os.listdir(field_dict_folder)]

print(f"The number of field dictionaries is: {len(field_dict_paths)}")

new_dict = {}

# Regularize field dict
for path in field_dict_paths:
    field_dict = load(path)
    
    # unpack the dictionary
    for field, value in field_dict.items():
        for year, quasi_patents in value.items():
            
            print(f"Size of field dict {path} is: {len(quasi_patents)}")

            # Check if the field exists in the new dictionary
            if field not in new_dict:
                new_dict[field] = {}

            # Check if the year exists in the new dictionary
            if year not in new_dict[field]:
                new_dict[field][year] = {}

            if 'patents' and 'size' in new_dict[field][year]:

                size_of_quasi_patents_old = len(quasi_patents)

                assert size_of_quasi_patents_old == new_dict[field][year]['size'], f"Size of quasi-patents in new dict is {new_dict[field][year]['size']} and in old dict is {len(quasi_patents)}. They should be the same."
            

            else:

                # Set size
                new_dict[field][year]['size'] = len(field_dict[field][year])

                # Set patents
                new_dict[field][year]['patents'] = quasi_patents
                
                

The number of field dictionaries is: 27
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_36.pkl is: 148
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_36.pkl is: 84
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_37.pkl is: 148
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_37.pkl is: 112
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_37.pkl is: 81
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_37.pkl is: 92
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_37.pkl is: 104
Size of field dict 00 Python data\02 Field dictionaries\Field dict - quasi patents_2024.10.21_23_38.pkl is: 148
Size of field dict 00 Python data\02 Field dictionaries\Field dict 

### Checks for field dictionary

In [15]:
total_size = 0

for field, value in new_dict.items():
    for year, quasi_patents in value.items():
        if 'patents' and 'size' in quasi_patents:

            assert len(quasi_patents['patents']) == quasi_patents['size'], f"Size of quasi-patents in new dict is {quasi_patents['size']} and in old dict is {len(quasi_patents['patents'])}. They should be the same."

            unique_ids = set()

            ids = [patent.patent_id for patent in quasi_patents['patents']]
            for id in ids:
                unique_ids.add(id)
            assert len(unique_ids) == quasi_patents['size'], f"Size of quasi-patents in new dict is {quasi_patents['size']} and in old dict is {len(unique_ids)}. They should be the same."

            total_size += quasi_patents['size']

print("Total size of the field dict is: ", total_size)


Total size of the field dict is:  1362908


### Save field dict

In [11]:
save(r"C:\Users\amusali\Desktop\uc3m PhD\05 Analysis\01 Main\00 Python data\02 Field dictionaries\Combined Field Dict.pkl", new_dict)

## Before patents regularization

In [14]:
import api.find_similar_patents





Number of acquired patent objects is: 189
['00 Python data\\Drive\\03 Patents with pairs (group checked)\\Adometry, Inc._2014-05-05_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\AKAMAI TECHNOLOGIES, INC._1999-05-31_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Alexa Internet_1999-06-09_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\America Online, Inc._2006-03-31_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Android Corporation_2005-07-30_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Anvato, Inc._2016-07-07_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Apigee Corporation_2016-11-09_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Apple Computer Inc._1997-08-05_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Applied Semantics, Inc._2003-04-22_before.pkl', '

Number of acquired patent objects is: 189
['00 Python data\\Drive\\03 Patents with pairs (group checked)\\Adometry, Inc._2014-05-05_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\AKAMAI TECHNOLOGIES, INC._1999-05-31_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Alexa Internet_1999-06-09_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\America Online, Inc._2006-03-31_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Android Corporation_2005-07-30_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Anvato, Inc._2016-07-07_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Apigee Corporation_2016-11-09_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Apple Computer Inc._1997-08-05_before.pkl', '00 Python data\\Drive\\03 Patents with pairs (group checked)\\Applied Semantics, Inc._2003-04-22_before.pkl', '

In [17]:
rr = load(r'C:\Users\amusali\Desktop\uc3m PhD\05 Analysis\01 Main\00 Python data\Drive\03 Patents with pairs (group checked)\Adometry, Inc._2014-05-05_before.pkl')


In [18]:
print(rr)

[Patent(patent_id='8533825', abstract='Embodiments disclosed herein provide a practical solution for click fraud detection. One embodiment of a method may comprise constructing representations of entities via a graph network framework. The representations, graphs or vector spaces, may capture information pertaining to clicks by botnets/click farms. To detect click fraud, each representation may be analyzed in the context of clustering, resulting in large data sets with respect to time, frequency, or gap between clicks. Highly accurate and highly scalable heuristics may be developed/applied to identify IP addresses that indicate potential collusion. One embodiment of a system having a computer program product implementing such a click fraud detection method may operate to receive a client file containing clicks gathered at the client side, construct representations of entities utilizing the graph framework described herein, perform clustering on the representations thus constructed, ide