# Fuzzy Matching with RL - 247 & Rivals

> Leveraging the RL (recordlinkage) library to determine approximate matches over a range of fields using various string-comparison methods.  This specifically focuses on 247 & Rivals first.

In [None]:
import json
import pandas
import time
import os
import recordlinkage
import csv
import core_constants as cc
import functions as fx
import sqlite3 as sql

#not currently using jellyfish
import jellyfish as jf

## Load and Merge Source Files Then Create a List of Dicts for each Dataset
> This was originally set up for all of the keys in the sourcefiles.json config.  Since this file is currently only going to serve 247 & Rivals, I've hardcoded the keys to fetch

In [None]:
# Load one DataFrame per recruiting service and publish it as a notebook-level
# variable named after the service (`sports247`, `rivals`).  The dict value is
# the KeyDataset number used to filter SourcedPlayers.
dataset_keys = {'sports247': 1, 'rivals': 2}
for key, value in dataset_keys.items():
    SQL = """SELECT IDYR, College, Year, PlayerName, HighSchool, City, State, Position FROM SourcedPlayers WHERE KeyDataset = {}""".format(value)
    frame = fx.connDBAndReturnDF(SQL)
    frame = frame.set_index('IDYR')
    # Give each index a service-specific name (e.g. 'rivals_IDYR').
    frame.index.name = key + '_IDYR'
    # set_index() dropped the IDYR column; restore it as a regular column so
    # it can be compared as a field later on.
    frame['IDYR'] = frame.index.get_level_values(0)
    # At notebook top level this creates/overwrites the variable named `key`.
    vars()[key] = frame

In [None]:
# Display the 247 dataset loaded above as a quick sanity check.
sports247

Unnamed: 0_level_0,ID,College,Year,PlayerName,HighSchool,City,State,Position,IDYR
sports247_IDYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
paulthurston_nebraska_2012,paulthurston_nebraska,nebraska,2012,paulthurston,Arvada West,Arvada,CO,OT,paulthurston_nebraska_2012
gregmcmullen_nebraska_2012,gregmcmullen_nebraska,nebraska,2012,gregmcmullen,Archbishop Hoban,Akron,OH,SDE,gregmcmullen_nebraska_2012
jordanwesterkamp_nebraska_2012,jordanwesterkamp_nebraska,nebraska,2012,jordanwesterkamp,Montini Catholic,Lombard,IL,WR,jordanwesterkamp_nebraska_2012
imanicross_nebraska_2012,imanicross_nebraska,nebraska,2012,imanicross,North Hall,Gainesville,GA,RB,imanicross_nebraska_2012
tommyarmstrong_nebraska_2012,tommyarmstrong_nebraska,nebraska,2012,tommyarmstrong,Steele,Cibolo,TX,DUAL,tommyarmstrong_nebraska_2012
...,...,...,...,...,...,...,...,...,...
brettmedforth_troy_2013,brettmedforth_troy,troy,2013,brettmedforth,Sylacauga Sch,Sylacauga,AL,OG,brettmedforth_troy_2013
cardelllue_troy_2013,cardelllue_troy,troy,2013,cardelllue,Hebron,Carrollton,TX,CB,cardelllue_troy_2013
zachmoore_troy_2013,zachmoore_troy,troy,2013,zachmoore,Coffee,Douglas,GA,ILB,zachmoore_troy_2013
sethroberts_troy_2013,sethroberts_troy,troy,2013,sethroberts,Ariton Sch,Ariton,AL,WDE,sethroberts_troy_2013


## Create Blockers
> I swear this isn't working.  And honestly I guess I'm ok with it not working, since I'm doing string operations later in this script.  This used to take longer than 20 minutes but has been fixed.

In [None]:
# Block on College and Year, then build the candidate pairs between the
# 247 and Rivals datasets.
block_columns = ['College', 'Year']
indexer = recordlinkage.BlockIndex(on=block_columns)
candidate_links = indexer.index(sports247, rivals)

## Define Features & Generate Comparison Vector Set

> These are all pretty straightforward.  The toughest one to assess is position - since the services don't always categorize players in the same way or have the same abbreviation for a single position.  Since this is often only a 2 or 3 letter string, I decided to do an exact match.

In [None]:
# Build the comparison definition: one feature column per compared field.
c = recordlinkage.Compare()

# Exact comparison on the full ID; used later as a "certain match" flag.
c.exact('IDYR', 'IDYR', label='IDYR')
# Fuzzy comparisons; PlayerName explicitly uses Damerau-Levenshtein, the
# others use the library's default string method.
c.string('PlayerName', 'PlayerName', method='damerau_levenshtein', label='PlayerName')
c.string('City', 'City', label='City')
c.exact('State', 'State', label='State')
c.string('HighSchool', 'HighSchool', label='HighSchool')
# Position is compared exactly (see the note above this cell).
c.exact('Position', 'Position', label='Position')

try:
    features = c.compute(candidate_links, sports247, rivals)
except KeyError as e:
    # Still print the offending key, but re-raise: swallowing the error would
    # leave `features` undefined and make every later cell fail with a
    # misleading NameError instead of the real cause.
    print(e)
    raise

## Create Sum
> Final value will be between 0 and 1.  Sum represents a %confidence% level during fuzzy matching. Since an exact match on ID dictates a 100% confident match, I've removed that column from the sum.

In [None]:
# Average the five field scores (IDYR is deliberately excluded) into a single
# confidence value.  The columns are added left to right in the listed order.
score_columns = ['PlayerName', 'City', 'State', 'HighSchool', 'Position']
features['Sum'] = sum(features[name] for name in score_columns) / 5

In [None]:
# Preview the first few comparison vectors, including the new Sum column.
features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,IDYR,PlayerName,City,State,HighSchool,Position,Sum
sports247_IDYR,rivals_IDYR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
paulthurston_nebraska_2012,tommyarmstrong_nebraska_2012,0,0.285714,0.0,0,0.090909,0,0.075325
paulthurston_nebraska_2012,alonzomoore_nebraska_2012,0,0.166667,0.0,0,0.125,0,0.058333
paulthurston_nebraska_2012,leroyalexander_nebraska_2012,0,0.0,0.166667,0,0.090909,0,0.051515
paulthurston_nebraska_2012,jaredafalava_nebraska_2012,0,0.083333,0.166667,0,0.090909,0,0.068182
paulthurston_nebraska_2012,mohammedseisay_nebraska_2012,0,0.071429,0.0,0,0.066667,0,0.027619


## Filter by Threshold

> Agreed upon confidence level for 247 & Rivals is .6943.  This takes longer than expected.  

In [None]:
# Pick at most one Rivals candidate per 247 record and split the results into
# accepted links (`filteredList`) and rejected best-candidates (`noMatch`).
count = 0  # not used in this cell; kept in case another cell reads it
filteredList = []
noMatch = []

# Minimum averaged score for accepting a non-exact pair (see the note above).
CONFIDENCE_THRESHOLD = .6943

# Copy both index levels into columns so they survive as plain fields when
# each selected row is later turned into a record.
features['sports247_IDYR'] = features.index.get_level_values(0)
features['rivals_IDYR'] = features.index.get_level_values(1)

for idx, group in features.groupby(level=0):
    # An exact IDYR match is treated as a certain match, so it must win even
    # when another candidate happens to have a higher averaged score.
    # Choosing by Sum first could silently discard the exact match.
    exact_matches = group[group['IDYR'] == 1]
    pool = group if exact_matches.empty else exact_matches
    data = pool.loc[pool['Sum'].idxmax()]

    if data['IDYR'] == 1 or data['Sum'] > CONFIDENCE_THRESHOLD:
        filteredList.append(data)
    else:
        noMatch.append(data)

## Convert Series to Dataframe
> The output of the above is a list of pandas Series (one per matched record) and needs to be converted to a DataFrame to be accessible.

In [None]:
# Build the result frame directly from the list of row Series (one row per
# accepted link).  DataFrame.append was deprecated in pandas 1.4 and removed
# in 2.0; the constructor gives the same rows and also handles an empty list.
dfFinal = pandas.DataFrame(filteredList)

In [None]:
# One dict per accepted link, keyed by column name.
dict_dfFinal = dfFinal.to_dict(orient='records')

In [None]:
# Inspect the first record to confirm the keys used by the insert below.
dict_dfFinal[0]

{'IDYR': 1,
 'PlayerName': 1.0,
 'City': 0.30000000000000004,
 'State': 0,
 'HighSchool': 1.0,
 'Position': 0,
 'Sum': 0.45999999999999996,
 'sports247_IDYR': 'aaironsavage_auburn_2005',
 'rivals_IDYR': 'aaironsavage_auburn_2005'}

In [None]:
# Persist every accepted link into the RecordLinks table.
query = '''INSERT INTO RecordLinks(MasterID, TargetID, KeyDataSet, KeyLinkType, LinkConfidence)
        VALUES (?,?,?,?,?)'''

# One parameter tuple per link: 247 ID, Rivals ID, KeyDataSet, KeyLinkType,
# confidence.
# NOTE(review): 2 presumably is the Rivals dataset key (matches dataset_keys)
# and 3 the link type for this fuzzy match -- confirm against the schema.
rows = [
    (record['sports247_IDYR'], record['rivals_IDYR'], 2, 3, record['Sum'])
    for record in dict_dfFinal
]

# Use a single connection for the whole batch.  Opening one per record leaked
# every connection except the last, and the trailing close() failed with a
# NameError when there were no records to insert.
conn = sql.connect(cc.databaseName)
try:
    c = conn.cursor()
    c.executemany(query, rows)
    # Single commit: either all links are stored or none are.
    conn.commit()
finally:
    conn.close()

SyntaxError: 'return' outside function (<ipython-input-23-31ec0bbe9b19>, line 14)