# RecordLinkage Linking

> Uses the `recordlinkage` (RL) library to approximately match records across a range of fields, using several comparison methods.

In [1]:
import json
import pandas
import time
import os
import recordlinkage
import csv
import core_constants as cc
import functions as fx
from functools import reduce

#not currently using jellyfish
import jellyfish as jf

## Set up the DataFrames

In [2]:
## Directory passed to fx.mergeSourceFiles, and the separator passed to
## fx.createNewID.
outputDir = '..//scrapedData//'
field_agg = "_"

## Load the source file dict
## (the context manager closes the handle instead of leaving it to the GC)
with open('..//config//sourceFiles.json', "r") as source_config:
    sourceFiles = json.load(source_config)

## Load the id config
with open('..//config//idConfigLink.json', "r") as id_config:
    idConfig = json.load(id_config)

In [3]:
## Run fx.mergeSourceFiles once per source listed in sourceFiles and collect
## the results in `dfs`. Each result is also bound to a module-level variable
## named after its source key, so later cells can refer to it by name.
dfs = []
for source_name in sourceFiles:
    merged = fx.mergeSourceFiles(source_name, outputDir, sourceFiles)
    vars()[source_name] = merged
    dfs.append(merged)

In [4]:
## Call fx.createNewID for each source with that source's idConfig entry.
## NOTE(review): presumably this adds an 'ID' field to the records in place
## (the return value is discarded and later cells read 'ID') -- confirm in fx.
for source_key, source_records in (('sports247', sports247), ('rivals', rivals)):
    fx.createNewID(idConfig[source_key], source_records, field_agg)

['playerName', 'school', 'year']
['playerName', 'school', 'year']


In [5]:
## Plain DataFrames built straight from each source's records.
df_sports247, df_rivals = map(pandas.DataFrame, (sports247, rivals))

In [6]:
def _frame_keyed_by_id(records, index_name):
    """Build a DataFrame indexed by 'ID', keeping 'ID' as the first column.

    The index is renamed to `index_name`, so the same identifier is available
    both as the index (under the source-specific name) and as column 'ID'.
    """
    frame = pandas.DataFrame(records).set_index('ID')
    frame.index.name = index_name
    frame.insert(0, 'ID', frame.index)
    return frame


df_sports247 = _frame_keyed_by_id(sports247, '247_ID')
df_rivals = _frame_keyed_by_id(rivals, 'rivals_ID')

## Import the Pickle file

In [7]:
## Load the feature table; the linking loop further down reads its
## '247_ID', 'rivals', 'ID' and 'sum' columns.
## NOTE(review): unpickling can execute arbitrary code -- only load
## final_review.pkl when it comes from a trusted run of this project.
features = pandas.read_pickle("final_review.pkl")


In [None]:
## This is absolutely necessary if you want to access the IDs through a dictionary.  
## Without it the 247_ID and rivals_ID won't carry forward
#features.reset_index(inplace=True)
#features.head()
#features = features.to_dict('records')

In [10]:
## Move the '247_ID' index back into a regular column (in place) so that it
## is available as a key once the rows are turned into dicts.
df_sports247.reset_index(inplace=True)

In [11]:
## One dict per row (column name -> value), iterated by the linking loop.
sportDict = df_sports247.to_dict('records')

In [12]:
def _link_record(record, features, df_rivals):
    """Return a copy of `record` extended with match confidence and rivals fields.

    Looks up the rows of `features` whose '247_ID' equals the record's
    '247_ID'. 'confidence' is 1 when the largest 'ID' value among those rows
    is 1, otherwise the largest 'sum' value. The first 'rivals' value of
    those rows is then looked up in `df_rivals` (by its 'ID' column) and each
    column of the matching row is copied in under an 'r_' prefix.

    When a step fails, an 'error' entry names the step that failed:
    'No feature found with that ID' if the feature lookup raised (the rivals
    lookup is then skipped, so a rivals ID left over from another record can
    never be used), or 'No rivals match found' if there is no usable rivals
    row.

    NOTE(review): when several feature rows match, the rivals ID comes from
    the first row while the confidence is the maximum over all rows, so the
    two may describe different candidate pairs -- confirm this is intended.
    """
    linked = dict(record)

    try:
        matches = features.loc[features['247_ID'] == record['247_ID']]
        rivals_ids = matches['rivals'].values
        if matches['ID'].max() == 1:
            linked['confidence'] = 1
        else:
            linked['confidence'] = matches['sum'].max()
    except KeyError:
        # A required column/key is missing; report it and stop here rather
        # than letting the rivals step overwrite the more specific message.
        linked['error'] = 'No feature found with that ID'
        return linked

    try:
        # IndexError: no feature rows matched, so there is no rivals ID.
        rivals_id = rivals_ids[0]
        rivals_row = df_rivals.loc[df_rivals['ID'] == rivals_id].to_dict()
        # to_dict() yields {column: {index label: value}}; df_rivals is
        # indexed by the same identifier, so the label is the rivals ID.
        # KeyError: no df_rivals row carries that ID.
        for column, by_label in rivals_row.items():
            linked['r_' + column] = by_label[rivals_id]
    except (KeyError, IndexError):
        linked['error'] = 'No rivals match found'
    return linked


## One output row per 247 record, in the same order as sportDict.
finalSet = [_link_record(record, features, df_rivals) for record in sportDict]


In [13]:
## Turn the linked records into a DataFrame; a key that is absent from a
## record (e.g. 'error' or the 'r_' fields) becomes NaN in that row.
df_finalSet = pandas.DataFrame(finalSet)

In [14]:
## Drop the last five characters of every ID (e.g. the "_2012" in
## "paulthurston_nebraska_2012"), then write the review file.
## NOTE(review): re-running this cell trims another five characters.
id_column = df_finalSet['ID']
df_finalSet['ID'] = id_column.str[:-5]
df_finalSet.to_csv('review.csv')

In [15]:
## Directory that is prefixed to each summarized file name when it is opened.
summarizedDir = '..//summarizedData//'

## Load the summarized file dict
## (the context manager closes the handle instead of leaving it to the GC)
with open('..//config//summarizedFiles.json', "r") as summarized_config:
    summarizedFiles = json.load(summarized_config)

In [16]:
## Load the first file listed for each summarized data set into a DataFrame.
## Each frame is collected in `dfs` (which is reset here) and also bound to a
## module-level variable named after its key.
dfs = []
keys = ['allConf','allAmerican','nflData','ncaa']
for key in keys:
    print(key)
    # The context manager closes each file even if parsing fails.
    with open(summarizedDir + summarizedFiles[key][0], "r", encoding="utf-8") as summary_file:
        vars()[key] = pandas.DataFrame(json.load(summary_file))
    dfs.append(vars()[key])
    

allConf
allAmerican
nflData
ncaa


In [17]:
## Put the linked table first so it is the left-most frame when `dfs` is merged.
## NOTE(review): re-running this cell inserts df_finalSet a second time.
dfs.insert(0, df_finalSet)

In [18]:
def _left_join_on_id(left, right):
    """Left-join `right` onto `left` using the shared 'ID' column."""
    return pandas.merge(left, right, how="left", on='ID')


## Fold every frame in `dfs` onto the first one, left to right, so each row
## of the first frame is kept and gains the columns of the later frames.
df_final = reduce(_left_join_on_id, dfs)
df_final.head()

Unnamed: 0,247_ID,ID,school,year,playerName,highSchool,city,state,position,height,...,draft_pick,team_y,pos,all_pros_first_team,pro_bowls,years_as_primary_starter,g,draft_year,ncaa_gamesPlayed,ncaa_gamesStarted
0,paulthurston_nebraska_2012,paulthurston_nebraska,nebraska,2012,paulthurston,Arvada West,Arvada,CO,OT,77,...,,,,,,,,,,
1,gregmcmullen_nebraska_2012,gregmcmullen_nebraska,nebraska,2012,gregmcmullen,Archbishop Hoban,Akron,OH,SDE,77,...,,,,,,,,,,
2,jordanwesterkamp_nebraska_2012,jordanwesterkamp_nebraska,nebraska,2012,jordanwesterkamp,Montini Catholic,Lombard,IL,WR,73,...,,,,,,,,,,
3,imanicross_nebraska_2012,imanicross_nebraska,nebraska,2012,imanicross,North Hall,Gainesville,GA,RB,73,...,,,,,,,,,,
4,tommyarmstrong_nebraska_2012,tommyarmstrong_nebraska,nebraska,2012,tommyarmstrong,Steele,Cibolo,TX,DUAL,73,...,,,,,,,,,,


In [19]:
## Write the merged table to disk. Handing the path to to_csv lets pandas
## write rows straight to the file and manage newline handling itself,
## instead of first building the whole CSV as one string and pushing it
## through a text-mode handle (which can double carriage returns on Windows).
df_final.to_csv("..//linkedPlayers.csv", encoding="utf-8")

In [20]:
## Print every frame in the merge list for a visual check.
print(dfs)

[                               247_ID                         ID    school  \
0          paulthurston_nebraska_2012      paulthurston_nebraska  nebraska   
1          gregmcmullen_nebraska_2012      gregmcmullen_nebraska  nebraska   
2      jordanwesterkamp_nebraska_2012  jordanwesterkamp_nebraska  nebraska   
3            imanicross_nebraska_2012        imanicross_nebraska  nebraska   
4        tommyarmstrong_nebraska_2012    tommyarmstrong_nebraska  nebraska   
...                               ...                        ...       ...   
50918         brettmedforth_troy_2013         brettmedforth_troy      troy   
50919            cardelllue_troy_2013            cardelllue_troy      troy   
50920             zachmoore_troy_2013             zachmoore_troy      troy   
50921           sethroberts_troy_2013           sethroberts_troy      troy   
50922          jalendaniels_troy_2013          jalendaniels_troy      troy   

       year        playerName        highSchool         city s