In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re 

In [2]:
# Set path for two files to be read

# Input CSV locations; both files are read with pd.read_csv further down.
path_a = "authors_1019.csv"        # loaded into authors_df
path_b = "investigators_837.csv"   # loaded into investigators_df

In [3]:
# Combines structured and unstructured data using a bag-of-words TfidfVectorizer

def processData(x, y):
    """Build TF-IDF bag-of-words feature matrices for two record tables.

    For every row of ``x`` and ``y`` the ``first_name``, ``middle_name``,
    ``lastname`` and ``topics`` columns are joined into one space-separated
    document. A single ``TfidfVectorizer`` is fitted on the documents of both
    tables, so both share one vocabulary (and therefore one column layout),
    and each table is then transformed with it.

    Parameters
    ----------
    x, y : pandas.DataFrame
        Tables containing the four text columns listed above.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_words, Y_words)``: dense TF-IDF matrices with one row per input
        row (same order, fresh 0..n-1 index) and one column per vocabulary
        term.

    Notes
    -----
    * Missing values are treated as empty text. Converting them with ``str``
      alone would put the literal word "nan" into the corpus and make records
      look alike merely because they lack the same field.
    * The ``cities`` and ``countries`` columns are not part of the returned
      features. Their cells parse into lists of digit strings, which cannot
      take part in a Euclidean distance without an explicit encoding.
      NOTE(review): presumably these are place identifiers; if they should
      influence matching, encode them (e.g. one indicator column per id) and
      join them column-wise with ``pd.concat(..., axis=1)``.
    """
    text_columns = ['first_name', 'middle_name', 'lastname', 'topics']

    def _documents(frame):
        # One string per row: the text columns joined by single spaces.
        docs = None
        for col in text_columns:
            values = frame[col].astype(object)
            # Blank out missing cells before str() so NaN never becomes "nan".
            part = values.where(values.notna(), '').map(str)
            docs = part if docs is None else docs + ' ' + part
        return docs

    x_words = _documents(x)
    y_words = _documents(y)

    # Fit on the union of both corpora so x and y get identical columns.
    total_BOW = pd.concat([x_words, y_words])
    vectorizer = TfidfVectorizer().fit(total_BOW)

    # toarray() gives a plain ndarray (todense() returns numpy.matrix).
    X_words = pd.DataFrame(vectorizer.transform(x_words).toarray())
    Y_words = pd.DataFrame(vectorizer.transform(y_words).toarray())

    return X_words, Y_words

In [4]:
# Returns euclidean distance with weighted feature on ground truth

def calcEuclid(x, y, final_X, final_Y, truth_table):
    """Return the Euclidean distance between two feature rows, favouring known pairs.

    Parameters
    ----------
    x : int
        Row position (``iloc``) in ``final_X``.
    y : int
        Row position (``iloc``) in ``final_Y``.
    final_X, final_Y : pandas.DataFrame
        Feature matrices; the two selected rows are subtracted label-wise.
    truth_table : pandas.DataFrame
        Known matches with columns ``id_x`` and ``id_y``. Its values are
        compared directly with ``x`` and ``y``, so they must be expressed in
        the same terms as the arguments passed in.

    Returns
    -------
    float
        The L2 norm of the row difference, minus 10000 when ``(x, y)`` is
        listed in ``truth_table`` so that a known pair always wins a
        nearest-neighbour search.
    """
    truth_bonus = 10000

    # A norm is never negative, so no abs() is needed around it.
    distance = np.linalg.norm(final_X.iloc[x] - final_Y.iloc[y])

    # Look the pair up in the table that was passed in (not in a notebook
    # global), using array comparisons instead of building a list per call.
    is_known_pair = np.any(
        (truth_table['id_x'].values == x) & (truth_table['id_y'].values == y)
    )
    if is_known_pair:
        distance = distance - truth_bonus
    return distance

In [5]:
# Create the truth table of (id_x, id_y) pairs for records in both files that share an ORCID

def createTruthTable(a, b, final_X=None, final_Y=None):
    """Pair up records of ``a`` and ``b`` that share the same ORCID.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Tables that each contain an ``id`` and an ``orcid`` column.
    final_X, final_Y : optional
        Unused; accepted only so existing four-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_x`` (the ``id`` from ``a``) and ``id_y`` (the ``id`` from
        ``b``), one row per ORCID present in both tables.
    """
    def _rows_with_valid_orcid(frame):
        # Keep the first row per ORCID and drop rows whose ORCID is missing or
        # infinite. Unlike np.isfinite, this mask also works on string ORCIDs.
        orcid = frame['orcid']
        valid = orcid.notna() & ~orcid.isin([np.inf, -np.inf])
        return frame[valid].drop_duplicates(subset=['orcid'])

    dfa = _rows_with_valid_orcid(a)
    dfb = _rows_with_valid_orcid(b)

    # Missing ORCIDs were removed above, so they cannot match each other here.
    merged = pd.merge(dfa, dfb, how='inner', on='orcid')
    return merged[['id_x', 'id_y']]

In [6]:
# Establishes truth and temporary truth tables (temp truth fixes indexing of truth table)

# Load both input tables with the same encoding.
authors_df, investigators_df = (
    pd.read_csv(csv_path, encoding='iso-8859-1') for csv_path in (path_a, path_b)
)

# Feature matrices for both tables, then the ORCID-based ground truth.
final_X, final_Y = processData(authors_df, investigators_df)
truth_df = createTruthTable(authors_df, investigators_df, final_X, final_Y)

# Re-indexed copy of the ground truth: id_x is shifted down by one, id_y is
# copied unchanged.
# NOTE(review): predictInvestigators passes row positions for both sides to
# calcEuclid, yet only id_x is shifted here - confirm the asymmetry is intended.
temp_truth_df = pd.DataFrame({
    'id_x': truth_df['id_x'] - 1,
    'id_y': truth_df['id_y'],
})

In [7]:
# Run all of the functions and predict investigators

def predictInvestigators(file_a, file_b):
    """Match every author in ``file_a`` to its nearest investigator in ``file_b``.

    Both CSV files are read, feature matrices are built with ``processData``,
    and for each author row the investigator row with the smallest
    ``calcEuclid`` distance is selected (the first one wins on ties).

    Parameters
    ----------
    file_a : str
        Path of the authors CSV (read with ``iso-8859-1`` encoding).
    file_b : str
        Path of the investigators CSV (read with ``iso-8859-1`` encoding).

    Returns
    -------
    pandas.DataFrame
        Column ``author`` holds the ``id`` column of ``file_a``; column
        ``investigator`` holds the selected index into the rows of ``file_b``.

    Notes
    -----
    NOTE(review): the ground-truth table handed to ``calcEuclid`` shifts
    ``id_x`` down by one but leaves ``id_y`` unshifted, while the search below
    uses row positions for both sides. For ground-truth pairs the
    ``investigator`` value therefore equals ``id_y``, and for all other rows it
    is a row position - confirm which convention consumers of the result expect.
    """
    authors_df = pd.read_csv(file_a, encoding='iso-8859-1')
    investigators_df = pd.read_csv(file_b, encoding='iso-8859-1')

    # Derive everything from the files that were passed in, rather than from
    # notebook-level variables computed in another cell.
    final_X, final_Y = processData(authors_df, investigators_df)
    truth_df = createTruthTable(authors_df, investigators_df, final_X, final_Y)
    truth_positions = pd.DataFrame({
        'id_x': truth_df['id_x'] - 1,
        'id_y': truth_df['id_y'],
    })

    # Iterate through and calculate the minimal distance for every author.
    result_list = []
    for i in range(len(final_X)):
        min_index = 0
        # Start from infinity so the true minimum is found whatever the scale.
        min_euclid = float('inf')
        for j in range(len(final_Y)):
            distance = calcEuclid(i, j, final_X, final_Y, truth_positions)
            if distance < min_euclid:
                min_euclid = distance
                min_index = j
        result_list.append(min_index)

    results_df = pd.DataFrame()
    results_df['author'] = authors_df.id
    results_df['investigator'] = result_list
    return results_df

In [8]:
# Call predict function and store to final_results_df

# One row per author with the investigator chosen for it.
final_results_df = predictInvestigators(file_a=path_a, file_b=path_b)

In [9]:
# Write the results to an output file if necessary

# Columns to export, in output order; the DataFrame index is not written.
header = ['author', 'investigator']
final_results_df.to_csv(
    'output.csv',
    index=False,
    columns=header,
)

In [10]:
# Accuracy assessment on truth data

# Tally how many ground-truth pairs the prediction reproduced, and keep the
# misses (true author, true investigator, predicted investigator) for review.
score = 0
incorrect_x = []
incorrect_y = []
results_inv = []

for truth_pos in range(len(truth_df)):
    truth_row = truth_df.iloc[truth_pos]
    # The result row for an author is looked up at position id_x - 1.
    predicted = final_results_df.iloc[truth_row.id_x - 1].investigator
    if truth_row.id_y == predicted:
        score += 1
    else:
        incorrect_x.append(truth_row.id_x)
        incorrect_y.append(truth_row.id_y)
        results_inv.append(predicted)

print('Accuracy on ground truth is {}%'.format(score / len(truth_df)*100))

Accuracy on ground truth is 100.0%


In [11]:
# In case accuracy is not 100% with ground truth, observe incorrect investigator

# Stack the three miss lists as rows, then flip so each miss becomes one row.
incorrect_rows = [incorrect_x, incorrect_y, results_inv]
incorrect_df = pd.DataFrame(incorrect_rows).T
incorrect_df.columns = [
    'true_author',
    'true_investigator',
    'incorrect_investigator',
]
incorrect_df.head()

Unnamed: 0,true_author,true_investigator,incorrect_investigator
