In [13]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import numpy as np

In [14]:
def get_nan_info(dataframe):
    # Step 1: Get columns with NaN values
    nan_columns = dataframe.columns[dataframe.isna().any()].tolist()

    # Step 2: Calculate percentage of NaN values in each column
    nan_percentage = (dataframe.isna().sum() / len(dataframe)) * 100

    # Step 3: Combine the results into a DataFrame
    nan_info = pd.DataFrame({'NaN Count': dataframe.isna().sum(), 'NaN Percentage': nan_percentage})
    nan_info = nan_info.loc[nan_info['NaN Count'] > 0]  # Filter out columns with no NaNs

    return nan_info

In [15]:
mag = pd.read_csv('/Users/arup/Documents/ISB/mag_final.csv')
mag

Unnamed: 0,entity_id,processed_title
0,566794768,29p-aps-61 dy fe_ co_x _2の低温x線回折
1,2367711586,improv program evalu review techniqu base part...
2,2090358667,gorbachev gener secretari becom social democrat
3,2827751863,speed control dc motor
4,596046335,shakespear 's histori
...,...,...
16249,2062546659,coastal fring habitat threaten global warm
16250,2617488891,brasiguaio uma identidad na fronteira brasil/p...
16251,2772711505,activ torqu vector wheel drive fsae electr car
16252,2490723897,technic progress evolut


In [16]:
patent = pd.read_csv('/Users/arup/Documents/ISB/patent_final.csv')
patent

Unnamed: 0,patent_id,patent_title,uspc_mainclass_title,uspc_subclass_title,wipo_sector_title,wipo_field_title,processed_abstract
0,3969244,Method of adsorbing heavy metals,,,Chemistry,"Macromolecular chemistry, polymers",particul polycondens thiourea thiosemicarbazid...
1,5386430,Excimer laser processing method and apparatus,COHERENT LIGHT GENERATORS,Excimer or exciplex,Electrical engineering,Audio-visual technology,excim laser ablat process form via hole resin ...
2,6710441,"POWER SEMICONDUCTOR SWITCHING DEVICES, POWER C...","ACTIVE SOLID-STATE DEVICES (E.G., TRANSISTORS,...","All contacts on same surface (e.g., lateral st...",Electrical engineering,Semiconductors,power semiconductor switch devic power convert...
3,5441784,Paper base wallcoverings,STOCK MATERIAL OR MISCELLANEOUS ARTICLES,WALL AND SHELF COVERING,Mechanical engineering,Textile and paper machines,invent provid paper base wallcov option decor ...
4,6579624,Functional film having optical and electrical ...,STOCK MATERIAL OR MISCELLANEOUS ARTICLES,THREE DIMENSION IMITATION OR 'TREATED' NATURAL...,Electrical engineering,Audio-visual technology,function film includ transit layer first const...
...,...,...,...,...,...,...,...
39310,10121211,Waste analysis system and method,,,Electrical engineering,IT methods for management,embodi present invent provid techniqu identifi...
39311,6911447,Melanocortin receptor ligands,ORGANIC COMPOUNDS -- PART OF THE CLASS 532-570...,Quinoline or isoquinoline (including hydrogena...,Chemistry,Biotechnology,disclos mc-3/mc-4 receptor ligand ligand follo...
39312,10240186,"Devices, systems, and methods for magnetic sep...",,,Instruments,Analysis of biological materials,method microfluid devic instrument magnet sepa...
39313,5974763,Cell-inside-a-cell honeycomb material,STOCK MATERIAL OR MISCELLANEOUS ARTICLES,Honeycomb type cells extend perpendicularly to...,Mechanical engineering,Textile and paper machines,honeycomb insul panel describ wherein cell pan...


In [17]:
# Load pre-trained sentence embeddings model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Preprocess academic titles
academic_embeddings = model.encode(mag['processed_title'])

# Batch size for processing
batch_size = 1000

# Initialize a list to store matched pairs
matched_pairs = []

# Get total number of batches
total_batches = len(patent) // batch_size + (1 if len(patent) % batch_size != 0 else 0)

In [18]:
# Iterate over batches of patent abstracts
for batch_idx, i in enumerate(range(0, len(patent), batch_size)):
    # Get batch of patent abstracts and corresponding IDs
    batch_patent_abstracts = patent['processed_abstract'][i:i+batch_size].tolist()
    batch_patent_ids = patent['patent_id'][i:i+batch_size].tolist()

    # Encode batch of patent abstracts
    batch_patent_embeddings = model.encode(batch_patent_abstracts)

    # Compute cosine similarity between batch of patent abstracts and academic titles
    similarities = cosine_similarity(batch_patent_embeddings, academic_embeddings)

    # Find indices with maximum similarity scores
    max_similarity_indices = np.argmax(similarities, axis=1)

    # Iterate over matched indices
    for j, idx in enumerate(max_similarity_indices):
        # Check if maximum similarity score is greater than threshold
        if similarities[j, idx] > 0.75:
            matched_pairs.append({'entity_id': mag.loc[idx, 'entity_id'], 
                                  'patent_id': batch_patent_ids[j],
                                  'academic_title': mag.loc[idx, 'processed_title'],
                                  'patent_abstract': batch_patent_abstracts[j]})
        else:
            matched_pairs.append({'entity_id': None, 
                                  'patent_id': batch_patent_ids[j],
                                  'academic_title': None,
                                  'patent_abstract': batch_patent_abstracts[j]})

    # Update progress
    tqdm.write(f"Processed batch {batch_idx + 1}/{total_batches}")

Processed batch 1/40
Processed batch 2/40
Processed batch 3/40
Processed batch 4/40
Processed batch 5/40
Processed batch 6/40
Processed batch 7/40
Processed batch 8/40
Processed batch 9/40
Processed batch 10/40
Processed batch 11/40
Processed batch 12/40
Processed batch 13/40
Processed batch 14/40
Processed batch 15/40
Processed batch 16/40
Processed batch 17/40
Processed batch 18/40
Processed batch 19/40
Processed batch 20/40
Processed batch 21/40
Processed batch 22/40
Processed batch 23/40
Processed batch 24/40
Processed batch 25/40
Processed batch 26/40
Processed batch 27/40
Processed batch 28/40
Processed batch 29/40
Processed batch 30/40
Processed batch 31/40
Processed batch 32/40
Processed batch 33/40
Processed batch 34/40
Processed batch 35/40
Processed batch 36/40
Processed batch 37/40
Processed batch 38/40
Processed batch 39/40
Processed batch 40/40


In [19]:
# Create DataFrame from matched pairs
matched_pairs_df = pd.DataFrame(matched_pairs)
matched_pairs_df

Unnamed: 0,entity_id,patent_id,academic_title,patent_abstract
0,,3969244,,particul polycondens thiourea thiosemicarbazid...
1,,5386430,,excim laser ablat process form via hole resin ...
2,,6710441,,power semiconductor switch devic power convert...
3,,5441784,,invent provid paper base wallcov option decor ...
4,,6579624,,function film includ transit layer first const...
...,...,...,...,...
39310,,10121211,,embodi present invent provid techniqu identifi...
39311,,6911447,,disclos mc-3/mc-4 receptor ligand ligand follo...
39312,,10240186,,method microfluid devic instrument magnet sepa...
39313,,5974763,,honeycomb insul panel describ wherein cell pan...


In [20]:
get_nan_info(matched_pairs_df)

Unnamed: 0,NaN Count,NaN Percentage
entity_id,39272,99.890627
academic_title,39272,99.890627


In [21]:
# Create a new DataFrame with non-null values
matched_pairs_df = matched_pairs_df.dropna()
matched_pairs_df

Unnamed: 0,entity_id,patent_id,academic_title,patent_abstract
693,2882301000.0,11050289,power suppli control circuit power suppli cont...,provid power suppli circuit power suppli devic...
1525,1983066000.0,5703076,ration design potent bioavail nonpeptid cyclic...,urea-contain hydroxyethylamin compound effect ...
2032,2923412000.0,7416789,semiconductor integr circuit devic electron devic,substrat semiconductor integr circuit compon i...
2175,1519656000.0,6165753,polypeptid nucleic acid molecul encod use,disclos nucleic acid molecul encod novel cycli...
2421,2849586000.0,10386681,liquid crystal align composit liquid crystal a...,liquid crystal display devic contain pair subs...
3949,1983066000.0,5510349,ration design potent bioavail nonpeptid cyclic...,urea-contain hydroxyethylamin compound effect ...
4575,1983066000.0,5510349,ration design potent bioavail nonpeptid cyclic...,urea-contain hydroxyethylamin compound effect ...
5036,1983066000.0,5622949,ration design potent bioavail nonpeptid cyclic...,urea-contain hydroxyethylamin peptid compound ...
5113,1983066000.0,5510349,ration design potent bioavail nonpeptid cyclic...,urea-contain hydroxyethylamin compound effect ...
6492,39789850.0,6447716,core type weld electrod,nickel-bas alloy use weld weld method weld ele...


In [22]:
matched_pairs_df.reset_index(inplace=True, drop=True)
matched_pairs_df

Unnamed: 0,entity_id,patent_id,academic_title,patent_abstract
0,2882301000.0,11050289,power suppli control circuit power suppli cont...,provid power suppli circuit power suppli devic...
1,1983066000.0,5703076,ration design potent bioavail nonpeptid cyclic...,urea-contain hydroxyethylamin compound effect ...
2,2923412000.0,7416789,semiconductor integr circuit devic electron devic,substrat semiconductor integr circuit compon i...
3,1519656000.0,6165753,polypeptid nucleic acid molecul encod use,disclos nucleic acid molecul encod novel cycli...
4,2849586000.0,10386681,liquid crystal align composit liquid crystal a...,liquid crystal display devic contain pair subs...
5,1983066000.0,5510349,ration design potent bioavail nonpeptid cyclic...,urea-contain hydroxyethylamin compound effect ...
6,1983066000.0,5510349,ration design potent bioavail nonpeptid cyclic...,urea-contain hydroxyethylamin compound effect ...
7,1983066000.0,5622949,ration design potent bioavail nonpeptid cyclic...,urea-contain hydroxyethylamin peptid compound ...
8,1983066000.0,5510349,ration design potent bioavail nonpeptid cyclic...,urea-contain hydroxyethylamin compound effect ...
9,39789850.0,6447716,core type weld electrod,nickel-bas alloy use weld weld method weld ele...


In [23]:
# Group by entity_id and aggregate other columns
aggregated_df = matched_pairs_df.groupby('entity_id').agg({
    'patent_id': lambda x: ', '.join(map(str, x)),
    'academic_title': 'first',  # Take the first value since they are related
    'patent_abstract': lambda x: ', '.join(map(str, x))
}).reset_index()

aggregated_df

Unnamed: 0,entity_id,patent_id,academic_title,patent_abstract
0,39789850.0,6447716,core type weld electrod,nickel-bas alloy use weld weld method weld ele...
1,1519656000.0,"6165753, 7390891",polypeptid nucleic acid molecul encod use,disclos nucleic acid molecul encod novel cycli...
2,1862620000.0,11342508,organ light emit materi organ light emit devic...,present specif relat organ light emit diod
3,1983066000.0,"5703076, 5510349, 5510349, 5622949, 5510349, 5...",ration design potent bioavail nonpeptid cyclic...,urea-contain hydroxyethylamin compound effect ...
4,2308314000.0,6644020,simul admiss exhaust process diesel engin,invent relat method regener particul filter ex...
5,2595577000.0,11367866,lithium ion batteri posit materi posit electro...,porou carbon particl posit electrod activ mate...
6,2612999000.0,10163324,assembl batteri monitor apparatu assembl batte...,remot batteri monitor configur base upon data ...
7,2758782000.0,6958367,coat composit compris starch,coat composit compris
8,2789938000.0,10966918,topic skin formul compris plant extract,disclos method treat skin topic skin composit ...
9,2813057000.0,10802363,manufactur method devic organ thin film transi...,array substrat manufactur method thereof displ...


In [24]:
# Save the aggregated DataFrame to a new CSV file
aggregated_df.to_csv('aggregated_csv_file.csv', index=False)