In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re 

In [2]:
#Combines structured and unstrucutred data using BoW CountVectorizer 

def process_data(x, y):
    # Process Words
    x_words = x['first_name'].map(str) + ' ' + x['middle_name'].map(str)  + ' ' + x['lastname'].map(str) + ' ' + x['topics'].map(str)
    y_words = y['first_name'].map(str) + ' ' + y['middle_name'].map(str)  + ' ' + y['lastname'].map(str) + ' ' + y['topics'].map(str)
    
    #Total corpus is combining x and y
    total_BOW = pd.concat([x_words, y_words], sort=False)
    
    vectorizer = TfidfVectorizer()
    total_vec = vectorizer.fit(total_BOW)
    X = total_vec.transform(x_words).todense()
    X_words = pd.DataFrame(X)

    Y = total_vec.transform(y_words).todense()
    Y_words = pd.DataFrame(Y)
    
    #Process numbers
    x_num = pd.DataFrame()
    y_num = pd.DataFrame()
    

    x_num['cities'] =  [re.findall("(\d+)",i) for i in x['cities'].map(str)]
    x_num['countries'] =  [re.findall("(\d+)",i) for i in x['countries'].map(str)]
    x_num = x_num.fillna(0, inplace=True)

    y_num['cities'] =  [re.findall("(\d+)",x) for x in y['cities'].map(str)]
    y_num['countries'] =  [re.findall("(\d+)",x) for x in y['countries'].map(str)]
    y_num = y_num.fillna(0, inplace=True)
    
    return X_words.append(x_num), Y_words.append(y_num)
   # final_Y = pd.DataFrame(Y_words.append(y_num))
   # return final_X, final_Y

In [3]:
#returns euclidean distance with weighted feature if orcid are the same

def calc_euclid(x, y, final_X, final_Y, truth_table):
    distance = abs(np.linalg.norm(final_X.iloc[x] - final_Y.iloc[y]))
    if x in truth_table.id_x.tolist() and y == temp_truth_df.id_y[temp_truth_df[temp_truth_df['id_x']==x].index.values.astype(int)[0]]:
        distance = distance - 10000
    return distance

In [4]:
# THis function creates truth table and includes euclid distance calculated

def create_truth_table(a, b, final_X, final_Y):
    dfa = a.drop_duplicates(subset=['orcid'])
    dfb = b.drop_duplicates(subset=['orcid'])
    dfa = dfa[np.isfinite(dfa['orcid'])]
    dfb = dfb[np.isfinite(dfb['orcid'])]
    new_df = pd.merge(dfa, dfb, how='inner', on='orcid')
    #new_df.drop(new_df.index[0], inplace=True)
    truth_df = new_df[['id_x','id_y']]
    
    #add this code in later to check euclid distance of truth table
    #truth_df['euclid'] = [calc_euclid(truth_df.id_x.iloc[i], truth_df.id_y.iloc[i], final_X, final_Y) for i in range(len(truth_df))]
    return truth_df

In [5]:
# Predict will run all of the functions in order and write a file to output
#predict(file_a, file_b):  
authors_df = pd.read_csv('authors_1019.csv', encoding='iso-8859-1')
investigators_df = pd.read_csv('investigators_837.csv', encoding='iso-8859-1')

final_X, final_Y = process_data(authors_df,investigators_df)

truth_df = create_truth_table(authors_df, investigators_df, final_X, final_Y)

results_df = pd.DataFrame()
results_df['author'] = authors_df.id - 1
#results_df.index = np.arange(0, len(results_df))

#Repair Index
# final_X.index += 1 
# final_Y.index += 1
# results_df.index += 1
# truth_df.index += 1
#export_csv = results_df.to_csv (r'ucb_output.csv', index = None, header=True)


In [6]:
results_df.head()

Unnamed: 0,author
0,0
1,1
2,2
3,3
4,4


In [7]:
# Create temporary truth table with correct x indexing
temp_truth_df = pd.DataFrame()
temp_truth_df['id_x'] = truth_df['id_x'] - 1 
temp_truth_df['id_y'] = truth_df['id_y'] 


In [8]:
truth_df.head()

Unnamed: 0,id_x,id_y
0,6,11
1,13,14
2,18,17
3,19,19
4,33,30


In [9]:
print(calc_euclid(0, 1, final_X, final_Y, truth_df))

1.39968985732117


In [10]:
result_list = []
euclid_list = []

for i in range(len(final_X)):
    min_index = 0
    min_euclid = 1000
    for j in range(len(final_Y)):
        x = calc_euclid(i, j, final_X, final_Y, temp_truth_df)
        if x < min_euclid:
            min_euclid = x
            min_index = j
            #print('i = ' + str(i) + ' j = ' + str(j) + ' min_euclid = ' + str(min_euclid) + ' min_index = ' + str(min_index))
    result_list.append(min_index)
        
  #  euclid_list.append(min_euclid)

In [11]:
results_df['investigator'] = result_list


In [12]:
results_df.author += 1
results_df.head(20)

Unnamed: 0,author,investigator
0,1,460
1,2,704
2,3,570
3,4,234
4,5,360
5,6,11
6,7,467
7,8,245
8,9,386
9,10,262


In [13]:
#accuracy assesment on truth data
score = 0
incorrect_x = []
incorrect_y = []
results_inv = []

for x in range(len(truth_df)):
    index_y = 0
    index_y = truth_df.iloc[x].id_x - 1
    if truth_df.iloc[x].id_y == results_df.iloc[index_y].investigator:
        score +=1
    elif truth_df.iloc[x].id_y != results_df.iloc[index_y].investigator:
        incorrect_x.append(truth_df.iloc[x].id_x)
        incorrect_y.append(truth_df.iloc[x].id_y)
        results_inv.append(results_df.iloc[index_y].investigator)

print('Accuracy on ground truth is ' + str(score / len(truth_df)*100) + '%')

Accuracy on ground truth is 100.0%


In [14]:
incorrect_df = pd.DataFrame([incorrect_x, incorrect_y, results_inv]).transpose()
incorrect_df.columns = ['true_author','true_investigator', 'incorrect_investigator']
incorrect_df.head()

Unnamed: 0,true_author,true_investigator,incorrect_investigator


In [15]:
predict('authors_1019.csv','investigators_837.csv')

NameError: name 'predict' is not defined