In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import numpy as np

In [2]:
def get_nan_info(dataframe):
    # Step 1: Get columns with NaN values
    nan_columns = dataframe.columns[dataframe.isna().any()].tolist()

    # Step 2: Calculate percentage of NaN values in each column
    nan_percentage = (dataframe.isna().sum() / len(dataframe)) * 100

    # Step 3: Combine the results into a DataFrame
    nan_info = pd.DataFrame({'NaN Count': dataframe.isna().sum(), 'NaN Percentage': nan_percentage})
    nan_info = nan_info.loc[nan_info['NaN Count'] > 0]  # Filter out columns with no NaNs

    return nan_info

## MAG

In [3]:
mag = pd.read_csv('/Users/arup/Documents/ISB/Datasets/MAG/Authors_disambiguated.csv')
mag

Unnamed: 0,entity_id,class,rank,foaf_name,paperCount,paperFamilyCount,citationCount
0,2707667312,Author,21013,Ben-Yuan Gu,1,1,2
1,2488541030,Author,21045,Jia Jian-guo,1,1,1
2,2735542882,Author,19066,Von Thurnher Wolfgang Loy,2,2,4
3,2141993737,Author,19430,Sheena M. Harris,1,1,33
4,2708286586,Author,21075,H. Ayeb,1,1,0
...,...,...,...,...,...,...,...
151350,2615733564,Author,19211,Jack Robinson,3,3,68
151351,2664127077,Author,21075,Yun Chan Huh,1,1,0
151352,2101324225,Author,21075,Jiayi Wu,1,1,0
151353,2119086371,Author,21075,Suguna Nantha Gopan,1,1,0


In [4]:
columns_to_drop = ['class', 'rank', 'paperCount', 'paperFamilyCount', 'citationCount']
mag.drop(columns=columns_to_drop, inplace=True)

mag

Unnamed: 0,entity_id,foaf_name
0,2707667312,Ben-Yuan Gu
1,2488541030,Jia Jian-guo
2,2735542882,Von Thurnher Wolfgang Loy
3,2141993737,Sheena M. Harris
4,2708286586,H. Ayeb
...,...,...
151350,2615733564,Jack Robinson
151351,2664127077,Yun Chan Huh
151352,2101324225,Jiayi Wu
151353,2119086371,Suguna Nantha Gopan


In [5]:
get_nan_info(mag)

Unnamed: 0,NaN Count,NaN Percentage


## Patent

In [6]:
patent = pd.read_csv('/Users/arup/Documents/ISB/Datasets/Patent/g_inventor_disambiguated.csv')
patent

Unnamed: 0,patent_id,inventor_sequence,inventor_id,disambig_inventor_name_first,disambig_inventor_name_last,male_flag,attribution_status,location_id
0,10254864,0,fl:ta_ln:kim-142,Tae-sang,Kim,1.0,1,7638d794-16c8-11ed-9b5f-1234bde3cd05
1,10164599,1,fl:yi_ln:wu-195,Yipin,Wu,1.0,1,8801128f-16c8-11ed-9b5f-1234bde3cd05
2,7884107,15,fl:cr_ln:gibeau-1,Craig R.,Gibeau,1.0,1,05dc2e20-16c8-11ed-9b5f-1234bde3cd05
3,9953797,1,fl:ma_ln:frontera-1,Mark Alan,Frontera,1.0,1,0736273d-16c8-11ed-9b5f-1234bde3cd05
4,9513572,3,fl:th_ln:anthony-4,Thomas,Anthony,1.0,1,a26e22db-16c8-11ed-9b5f-1234bde3cd05
...,...,...,...,...,...,...,...,...
20422,7673969,6,fl:ed_ln:moynihan-2,Edward R.,Moynihan,1.0,1,0837cef2-16c8-11ed-9b5f-1234bde3cd05
20423,5549079,5,fl:mi_ln:meuller-1,Michael L.,Meuller,1.0,1,f3f13ace-16c7-11ed-9b5f-1234bde3cd05
20424,9794514,0,fl:jo_ln:yoakum-1,John H.,Yoakum,1.0,1,6d380d52-16c8-11ed-9b5f-1234bde3cd05
20425,5401742,3,fl:to_ln:kokubu-8,Tomokuni,Kokubu,,99,8a6de2aa-16c8-11ed-9b5f-1234bde3cd05


In [7]:
columns_to_drop = ['inventor_sequence', 'inventor_id', 'male_flag', 'attribution_status', 'location_id']
patent.drop(columns=columns_to_drop, inplace=True)

patent

Unnamed: 0,patent_id,disambig_inventor_name_first,disambig_inventor_name_last
0,10254864,Tae-sang,Kim
1,10164599,Yipin,Wu
2,7884107,Craig R.,Gibeau
3,9953797,Mark Alan,Frontera
4,9513572,Thomas,Anthony
...,...,...,...
20422,7673969,Edward R.,Moynihan
20423,5549079,Michael L.,Meuller
20424,9794514,John H.,Yoakum
20425,5401742,Tomokuni,Kokubu


In [8]:
get_nan_info(patent)

Unnamed: 0,NaN Count,NaN Percentage
disambig_inventor_name_first,1,0.004895
disambig_inventor_name_last,1,0.004895


In [9]:
# Drop rows with any NaN values
patent.dropna(inplace=True)

# Reset index
patent.reset_index(drop=True, inplace=True)

In [10]:
get_nan_info(patent)

Unnamed: 0,NaN Count,NaN Percentage


In [11]:
patent

Unnamed: 0,patent_id,disambig_inventor_name_first,disambig_inventor_name_last
0,10254864,Tae-sang,Kim
1,10164599,Yipin,Wu
2,7884107,Craig R.,Gibeau
3,9953797,Mark Alan,Frontera
4,9513572,Thomas,Anthony
...,...,...,...
20420,7673969,Edward R.,Moynihan
20421,5549079,Michael L.,Meuller
20422,9794514,John H.,Yoakum
20423,5401742,Tomokuni,Kokubu


In [12]:
# Combine the two columns into a new column 'combined_column'
patent['name'] = patent['disambig_inventor_name_first'] + ' ' + patent['disambig_inventor_name_last']

# Display the updated DataFrame
patent

Unnamed: 0,patent_id,disambig_inventor_name_first,disambig_inventor_name_last,name
0,10254864,Tae-sang,Kim,Tae-sang Kim
1,10164599,Yipin,Wu,Yipin Wu
2,7884107,Craig R.,Gibeau,Craig R. Gibeau
3,9953797,Mark Alan,Frontera,Mark Alan Frontera
4,9513572,Thomas,Anthony,Thomas Anthony
...,...,...,...,...
20420,7673969,Edward R.,Moynihan,Edward R. Moynihan
20421,5549079,Michael L.,Meuller,Michael L. Meuller
20422,9794514,John H.,Yoakum,John H. Yoakum
20423,5401742,Tomokuni,Kokubu,Tomokuni Kokubu


In [13]:
# Load pre-trained sentence embeddings model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Preprocess academic titles with progress bar
mag_name = []
batch_size = 1000

# Get total number of batches
total_batches = len(mag) // batch_size + (1 if len(mag) % batch_size != 0 else 0)

with tqdm(total=total_batches, desc="Encoding academic titles") as pbar:
    for i in range(0, len(mag), batch_size):
        batch_mag = mag['foaf_name'][i:i+batch_size].tolist()
        mag_name.extend(model.encode(batch_mag))
        pbar.update(1)

# Initialize a list to store matched pairs
matched_pairs = []

Encoding academic titles: 100%|██████████| 152/152 [01:42<00:00,  1.48it/s]


In [14]:
# Iterate over batches of patent abstracts
for batch_idx, i in enumerate(range(0, len(patent), batch_size)):
    # Get batch of patent abstracts and corresponding IDs
    patent_name = patent['name'][i:i+batch_size].tolist()
    batch_patent_ids = patent['patent_id'][i:i+batch_size].tolist()

    # Encode batch of patent abstracts
    batch_patent_name = model.encode(patent_name)

    # Compute cosine similarity between batch of patent abstracts and academic titles
    similarities = cosine_similarity(batch_patent_name, mag_name)

    # Find indices with maximum similarity scores
    max_similarity_indices = np.argmax(similarities, axis=1)

    # Iterate over matched indices
    for j, idx in enumerate(max_similarity_indices):
        # Check if maximum similarity score is greater than threshold
        if similarities[j, idx] > 0.99:
            matched_pairs.append({'entity_id': mag.loc[idx, 'entity_id'], 
                                  'patent_id': batch_patent_ids[j],
                                  'mag_name': mag.loc[idx, 'foaf_name'],
                                  'patent_name': patent_name[j]})
        else:
            matched_pairs.append({'entity_id': None, 
                                  'patent_id': batch_patent_ids[j],
                                  'mag_name': None,
                                  'patent_name': patent_name[j]})

    # Update progress
    tqdm.write(f"Processed batch {batch_idx + 1}/{total_batches}")

Processed batch 1/152
Processed batch 2/152
Processed batch 3/152
Processed batch 4/152
Processed batch 5/152
Processed batch 6/152
Processed batch 7/152
Processed batch 8/152
Processed batch 9/152
Processed batch 10/152
Processed batch 11/152
Processed batch 12/152
Processed batch 13/152
Processed batch 14/152
Processed batch 15/152
Processed batch 16/152
Processed batch 17/152
Processed batch 18/152
Processed batch 19/152
Processed batch 20/152
Processed batch 21/152


In [15]:
# Create DataFrame from matched pairs
matched_pairs_df = pd.DataFrame(matched_pairs)
matched_pairs_df

Unnamed: 0,entity_id,patent_id,mag_name,patent_name
0,,10254864,,Tae-sang Kim
1,,10164599,,Yipin Wu
2,,7884107,,Craig R. Gibeau
3,,9953797,,Mark Alan Frontera
4,,9513572,,Thomas Anthony
...,...,...,...,...
20420,,7673969,,Edward R. Moynihan
20421,,5549079,,Michael L. Meuller
20422,,9794514,,John H. Yoakum
20423,,5401742,,Tomokuni Kokubu


In [16]:
get_nan_info(matched_pairs_df)

Unnamed: 0,NaN Count,NaN Percentage
entity_id,19964,97.742962
mag_name,19964,97.742962


In [17]:
# Create a new DataFrame with non-null values
matched_pairs_df = matched_pairs_df.dropna()
matched_pairs_df

Unnamed: 0,entity_id,patent_id,mag_name,patent_name
71,2.563750e+08,8132893,Kia Silverbrook,Kia Silverbrook
121,2.974377e+09,5360642,James M. O'Connor,James M. O'Connor
143,2.934740e+09,9965088,Yong Wu,Yong Wu
317,2.643532e+09,6282294,Michael J. O'Leary,Michael J. O'Leary
369,2.566232e+09,9613067,Daniel T. Chang,Daniel T. Chang
...,...,...,...,...
20100,2.471526e+09,11361422,Jing Xiao,Jing Xiao
20230,2.304129e+09,10687352,Tao Luo,Tao Luo
20275,2.116065e+09,7959101,Stephen Whitehead,Stephen Whitehead
20307,2.944809e+09,8163578,Heng Liu,Heng Liu


In [18]:
matched_pairs_df.reset_index(inplace=True, drop=True)
matched_pairs_df

Unnamed: 0,entity_id,patent_id,mag_name,patent_name
0,2.563750e+08,8132893,Kia Silverbrook,Kia Silverbrook
1,2.974377e+09,5360642,James M. O'Connor,James M. O'Connor
2,2.934740e+09,9965088,Yong Wu,Yong Wu
3,2.643532e+09,6282294,Michael J. O'Leary,Michael J. O'Leary
4,2.566232e+09,9613067,Daniel T. Chang,Daniel T. Chang
...,...,...,...,...
456,2.471526e+09,11361422,Jing Xiao,Jing Xiao
457,2.304129e+09,10687352,Tao Luo,Tao Luo
458,2.116065e+09,7959101,Stephen Whitehead,Stephen Whitehead
459,2.944809e+09,8163578,Heng Liu,Heng Liu


In [19]:
# Group by entity_id and aggregate other columns
aggregated_df = matched_pairs_df.groupby('entity_id').agg({
    'patent_id': lambda x: ', '.join(map(str, x)),
    'mag_name': 'first',  # Take the first value since they are related
    'patent_name': lambda x: ', '.join(map(str, x))
}).reset_index()

aggregated_df

Unnamed: 0,entity_id,patent_id,mag_name,patent_name
0,2.563750e+08,"8132893, 7973966, 7857536, 7557829, 6471331, 7...",Kia Silverbrook,"Kia Silverbrook, Kia Silverbrook, Kia Silverbr..."
1,2.011661e+09,5629082,Werner Schubert,Werner Schubert
2,2.020207e+09,7907360,UttHeng Kan,UttHeng Kan
3,2.040397e+09,9153581,Ming Zhu,Ming Zhu
4,2.099235e+09,4864623,Floris L. van Nes,Floris L. Van Nes
...,...,...,...,...
401,3.032134e+09,8271538,Lei He,Lei He
402,3.032396e+09,6032895,Li Zhao,Li Zhao
403,3.032932e+09,11244158,Liang Li,Liang Li
404,3.034338e+09,9166819,Yang Liu,Yang Liu


In [20]:
# Save the aggregated DataFrame to a new CSV file
aggregated_df.to_csv('/Users/arup/Documents/ISB/aggregated_name_csv_file.csv', index=False)