# RecordLinkage Linking - 247 & Rivals

> Leveraging the recordlinkage (RL) library to determine approximate matches over a range of fields using various string-comparison methods.  This notebook specifically focuses on 247 & Rivals first.

In [1]:
import json
import pandas
import time
import os
import recordlinkage
import csv
import core_constants as cc
import functions as fx

#not currently using jellyfish
import jellyfish as jf

### I don't like having all of this here - should push to functions

In [2]:
# Directory holding the scraped source files; passed to fx.mergeSourceFiles.
outputDir = '..//scrapedData//'
# Passed to fx.createNewID as its third argument.
# NOTE(review): presumably the separator placed between the ID fields
# (generated IDs look like name_school_year) - confirm in functions.py.
field_agg = "_"

## Load the source file dict
# Context managers close the config files as soon as they are parsed instead
# of leaving the handles open until garbage collection.
with open('..//config//sourceFiles.json', "r") as source_files_fp:
    sourceFiles = json.load(source_files_fp)

## Load the id config
with open('..//config//idConfigLink.json', "r") as id_config_fp:
    idConfig = json.load(id_config_fp)

## Load and Merge Source Files Then Create a List of Dicts for each Dataset
> This was originally set up for all of the keys in the sourceFiles.json config.  Since this notebook is currently only going to serve 247 & Rivals, I've hardcoded the keys to fetch.

In [3]:
# Only these two datasets are merged in this notebook.
dataset_keys = ['sports247', 'rivals']
dfs = []
for key in dataset_keys:
    merged = fx.mergeSourceFiles(key, outputDir, sourceFiles)
    # Publish each merged dataset under a module-level name equal to its key
    # (`sports247`, `rivals`); later cells refer to those names directly.
    globals()[key] = merged
    dfs.append(merged)

## Create New IDs
> This isn't elegant and I'd love to basically preprocess these collections prior to this notebook

In [4]:
# Run the ID builder over each dataset with that dataset's own field config.
# NOTE(review): createNewID appears to add an 'ID' entry to every record in
# place (the next cell indexes on 'ID') - confirm in functions.py.
for dataset_name, records in (('sports247', sports247), ('rivals', rivals)):
    fx.createNewID(idConfig[dataset_name], records, field_agg)

['playerName', 'school', 'year']
['playerName', 'school', 'year']


## Create Dataframes
> I might move these to their own pickles so I don't constantly have to recreate these each time

In [5]:
# Turn each dataset into a DataFrame, then key it by the generated 'ID'.
df_sports247 = pandas.DataFrame(sports247)
df_sports247 = df_sports247.set_index('ID')

df_rivals = pandas.DataFrame(rivals)
df_rivals = df_rivals.set_index('ID')

## Rename the indexes to something readable and predictable
> Otherwise these become 'index' and 'level_0', etc when you start to merge these.

In [6]:
# Give each index a source-specific name, and copy the index back into an
# 'ID' column: set_index('ID') removed that column, but the comparison step
# still compares on 'ID'.
for frame, index_label in ((df_sports247, '247_ID'), (df_rivals, 'rivals_ID')):
    frame.index.name = index_label
    frame['ID'] = frame.index

## Create Blockers
> I swear this isn't working.  And honestly I guess I'm ok with it not working since I'm doing string operations later in this script.  This usually takes 20+ minutes - so it is doing something...

In [7]:
# Only pair records that agree on both blocking keys.
# NOTE(review): newer recordlinkage releases replace BlockIndex with
# recordlinkage.Index().block(...) - confirm against the installed version.
blocking_keys = ['school', 'year']
indexer = recordlinkage.BlockIndex(on=blocking_keys)
candidate_links = indexer.index(df_sports247, df_rivals)

## Define Features & Generate Comparison Vector Set

> These are all pretty straightforward.  The toughest one to assess is position - since the services don't always categorize players in the same way or have the same abbreviation for a single position.  Since this is often only a 2 or 3 letter string, I decided to do an exact match.

In [8]:
c = recordlinkage.Compare()

# One entry per feature: (Compare method name, column, extra keyword args).
# Entries are registered in this order, and each feature is labelled with its
# column name. 'school' and 'year' are left out on purpose: they are the
# blocking keys, so every candidate pair already agrees on them.
comparison_specs = [
    ('exact', 'ID', {}),
    ('string', 'playerName', {'method': 'damerau_levenshtein'}),
    ('string', 'city', {}),
    ('exact', 'state', {}),
    ('string', 'highSchool', {}),
    ('exact', 'position', {}),
]
for how, column, extra in comparison_specs:
    getattr(c, how)(column, column, label=column, **extra)

# Score every candidate pair on every registered comparison.
features = c.compute(candidate_links, df_sports247, df_rivals)

## Create Sum
> Final value will be between 0 and 1.  The sum represents a confidence level for the fuzzy match: it is the average of the five field scores. Since an exact match on ID dictates a 100% confident match, I've removed that column from the sum.

In [9]:
# Average the five field scores into one 'sum' column. The 'ID' feature is
# deliberately not part of the average.
score_columns = ['playerName', 'city', 'state', 'highSchool', 'position']
running_total = features[score_columns[0]]
for column in score_columns[1:]:
    running_total = running_total + features[column]
features['sum'] = running_total / len(score_columns)

## Save to Pickle File

In [10]:
# Save the scored candidate pairs to features.pkl in the current working
# directory.
features.to_pickle("features.pkl")

## Optional: Visualize the Feature dataset

In [None]:
# Optional: summary statistics for the feature columns.
features.describe()

In [None]:
# Optional: box plot of the five field scores that make up 'sum'.
features.boxplot(column=['playerName', 'city', 'state', 'highSchool', 'position'])

In [11]:
# Preview the first rows; the index is the (247_ID, rivals_ID) pair.
features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,playerName,city,state,highSchool,position,sum
247_ID,rivals_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
paulthurston_nebraska_2012,tommyarmstrong_nebraska_2012,0,0.285714,0.0,0,0.090909,0,0.075325
paulthurston_nebraska_2012,alonzomoore_nebraska_2012,0,0.166667,0.0,0,0.125,0,0.058333
paulthurston_nebraska_2012,leroyalexander_nebraska_2012,0,0.0,0.166667,0,0.090909,0,0.051515
paulthurston_nebraska_2012,jaredafalava_nebraska_2012,0,0.083333,0.166667,0,0.090909,0,0.068182
paulthurston_nebraska_2012,mohammedseisay_nebraska_2012,0,0.071429,0.0,0,0.066667,0,0.027619


In [13]:
# Minimum averaged score ('sum') a candidate needs to be accepted when it is
# not an exact ID match.
MATCH_THRESHOLD = .6943

# NOTE(review): `count` is never used in the visible cells; kept in case a
# later cell relies on it.
count = 0
filteredList = []   # one accepted rivals candidate per 247 record
noMatch = []        # best candidate of each 247 record that was rejected

# Copy both index levels into columns so each selected row carries its IDs.
features['247_ID'] = features.index.get_level_values(0)
features['rivals'] = features.index.get_level_values(1)

# Pick at most one rivals candidate for every 247 record.
for idx, data in features.groupby(level=0):
    # An exact ID match is accepted outright. It has to be looked up before
    # ranking by score: ranking first would discard it whenever a different
    # candidate happens to have a higher 'sum'.
    exact = data[data['ID'] == 1]
    if not exact.empty:
        filteredList.append(exact.loc[exact['sum'].idxmax()])
        continue

    # Otherwise fall back to the highest-scoring fuzzy candidate and accept
    # it only if it clears the threshold.
    best = data.loc[data['sum'].idxmax()]
    if best['sum'] > MATCH_THRESHOLD:
        filteredList.append(best)
    else:
        noMatch.append(best)

In [14]:
# Build the final table directly from the accepted rows (one row per Series).
# DataFrame.append is deprecated since pandas 1.4 and removed in 2.0; the
# constructor works on every pandas version and still yields an empty frame
# when nothing was accepted.
dfFinal = pandas.DataFrame(filteredList)

In [15]:
# Write the accepted matches to the current working directory twice: once as
# CSV and once as a pickle of the same DataFrame.
dfFinal.to_csv('final_review.csv')  
dfFinal.to_pickle('final_review.pkl')