# Fuzzy Matching with RL - 247 & Rivals

> Leveraging the RL library to determine approximate matching over a range of fields using various string methods methods.  This specifically focuses on 247 & Rivals first.

In [1]:
import json
import pandas
import time
import os
import recordlinkage
import csv
import core_constants as cc
import functions as fx
import sqlite3 as sql

#not currently using jellyfish
import jellyfish as jf

## Load and Merge Source Files Then Create a List of Dicts for each Dataset
> This was originally set up for all of the keys in the sourcefiles.json config.  Since this file is currently only going to serve 247 & Rivals, I've hardcoded the keys to fetch

In [2]:
dfFinal = fx.doFuzzyMatching('NFL', 'Sports247')
dfFinal.head()

Unnamed: 0,sourceID,targetID,ID,PlayerName,Position,sum
"(aaroncurry_wakeforest, brandonterry_wakeforest_2010)",aaroncurry_wakeforest,brandonterry_wakeforest_2010,0.73913,0.5,0.0,0.413043
"(aaronlynch_southflorida, aaronharris_southflorida_2006)",aaronlynch_southflorida,aaronharris_southflorida_2006,0.75,0.454545,0.0,0.401515
"(aaronmerz_california, aaronrodgers_california_2003)",aaronmerz_california,aaronrodgers_california_2003,0.782609,0.583333,0.0,0.455314
"(aaronripkowski_oklahoma, aaronfranklin_oklahoma_2010)",aaronripkowski_oklahoma,aaronfranklin_oklahoma_2010,0.652174,0.428571,0.0,0.360248
"(abdulhodge_iowa, akrumwadley_iowa_2013)",abdulhodge_iowa,akrumwadley_iowa_2013,0.5625,0.363636,0.0,0.308712


In [3]:
conn = sql.connect(cc.databaseName) 
          
sql_query = pandas.read_sql_query ('''
                               SELECT
                               *
                               FROM SourcedPlayers
                               WHERE KeyDataSet = 1
                               ''', conn)

df_247 = pandas.DataFrame(sql_query, columns = ['IDYR', 'College', 'Year'])
df_247.set_index('IDYR', append=False, inplace=True)
sql_query = pandas.read_sql_query ('''
                               SELECT
                               *
                               FROM SourcedPlayers
                               WHERE KeyDataSet = 3
                               ''', conn)

df_NFL = pandas.DataFrame(sql_query, columns = ['ID', 'College', 'Year'])
df_NFL.set_index('ID', append=False, inplace=True)

#dfFinal.reset_index(drop=True, inplace=True)
#dfFinal.set_index(['sourceID', 'targetID'], append=False, inplace=True)

dfFinal.head()

Unnamed: 0,sourceID,targetID,ID,PlayerName,Position,sum
"(aaroncurry_wakeforest, brandonterry_wakeforest_2010)",aaroncurry_wakeforest,brandonterry_wakeforest_2010,0.73913,0.5,0.0,0.413043
"(aaronlynch_southflorida, aaronharris_southflorida_2006)",aaronlynch_southflorida,aaronharris_southflorida_2006,0.75,0.454545,0.0,0.401515
"(aaronmerz_california, aaronrodgers_california_2003)",aaronmerz_california,aaronrodgers_california_2003,0.782609,0.583333,0.0,0.455314
"(aaronripkowski_oklahoma, aaronfranklin_oklahoma_2010)",aaronripkowski_oklahoma,aaronfranklin_oklahoma_2010,0.652174,0.428571,0.0,0.360248
"(abdulhodge_iowa, akrumwadley_iowa_2013)",abdulhodge_iowa,akrumwadley_iowa_2013,0.5625,0.363636,0.0,0.308712


In [4]:
miFinal = pandas.MultiIndex.from_frame(dfFinal)
miFinal

MultiIndex([(         'aaroncurry_wakeforest', ...),
            (       'aaronlynch_southflorida', ...),
            (          'aaronmerz_california', ...),
            (       'aaronripkowski_oklahoma', ...),
            (               'abdulhodge_iowa', ...),
            (      'adamjennings_fresnostate', ...),
            (     'adamkieft_centralmichigan', ...),
            (               'adamseward_unlv', ...),
            (             'adamsnyder_oregon', ...),
            (            'adamterry_syracuse', ...),
            ...
            (       'williereid_floridastate', ...),
            (   'willmontgomery_virginiatech', ...),
            (       'willsutton_arizonastate', ...),
            (           'willsvitek_stanford', ...),
            (  'willwhitticker_michiganstate', ...),
            ('xaviercrawford_centralmichigan', ...),
            (         'zachdiles_kansasstate', ...),
            (       'zachmiller_arizonastate', ...),
            (       'zachstrie

In [5]:
recordlinkage.write_annotation_file(
    "annotation_nfl.json",
    miFinal[0:1100],
    df_NFL,
    df_247,
    dataset_a_name="NFL",
    dataset_b_name="Master"
)

In [None]:
dfFinal.head()

## Create Blockers
> I swear this isn't working.  And honestly I guess I'm ok with it not working since I'm doing string operations later on this script.  This used to take longer than 20 minutes but has been fixed.

In [None]:
indexer = recordlinkage.BlockIndex(on=['College', 'Year'])
candidate_links = indexer.index(sports247, rivals)

## Define Features & Generate Comparison Vector Set

> These are all pretty straightforward.  The toughest one to assess is position - since the services don't always categorize players in the same way or have the same abbreviation for a single position.  Since this is often only a 2 or 3 letter string, I decided to do an exact match.

In [None]:
c = recordlinkage.Compare()

c.exact('IDYR', 'IDYR', label='IDYR')
c.string('PlayerName', 'PlayerName', method='damerau_levenshtein', label='PlayerName')
c.string('City', 'City', label='City')
c.exact('State', 'State', label='State')
c.string('HighSchool', 'HighSchool', label='HighSchool')
c.exact('Position', 'Position', label='Position')

try:
    features = c.compute(candidate_links, sports247, rivals)
except KeyError as e:
    print(e)

## Create Sum
> Final value will be between 0 and 1.  Sum represents a %confidence% level during fuzzy matching. Since an exact match on ID dictates a 100% confident match, I've removed that column from the sum.

In [None]:
features['Sum'] = (features['PlayerName'] + features['City'] + features['State'] + features['HighSchool']  + features['Position'])/5

In [None]:
features.head()

## Filter by Threshold

> Agreed upon confidence level for 247 & Rivals is .6943.  This takes longer than expected.  

In [None]:
count = 0
filteredList = []
noMatch = []

features['sports247_IDYR'] = features.index.get_level_values(0)
features['rivals_IDYR'] = features.index.get_level_values(1)

for idx, data in features.groupby(level=0):
    data = data.loc[data['Sum'].idxmax()]
    if (data['IDYR'] == 1):
        filteredList.append(data)
    elif (data['IDYR'] != 1 and data['Sum'] > .6943):
        filteredList.append(data)
    else:
        noMatch.append(data)

## Convert Series to Dataframe
> output of the above is a Pandas Series and needs to be pushed to a dataframe to be accessible

In [None]:
dfFinal = pandas.DataFrame()
dfFinal = dfFinal.append(filteredList)
dfFinal.sort_values(by='SUM')

In [None]:
dict_dfFinal = dfFinal.to_dict('records')

In [None]:
dict_dfFinal[0]

In [None]:
for record in dict_dfFinal:
    Values = [record['sports247_IDYR'], record['rivals_IDYR'], 2, 3, record['Sum']]
    query = '''INSERT INTO RecordLinks(MasterID, TargetID, KeyDataSet, KeyLinkType, LinkConfidence)
        VALUES (?,?,?,?,?)'''
    
    conn = sql.connect(cc.databaseName)
    c = conn.cursor()
    
    c.execute(query, Values)
    conn.commit()
    
conn.close()