# All Conference Transfers

In [1]:
import json
import core_constants as cc
import functions as fx
import pandas as pd
import sqlite3 as sql
import recordlinkage

## Build out DataFrames

In [2]:
# Load the Transfers247 table, keyed by its IDYR column; the index is
# relabelled '247_IDYR' so it is recognisable once paired with other datasets.
SQL = '''SELECT * from Transfers247'''
df_247 = (
    fx.connDBAndReturnDF(SQL)
    .set_index('IDYR')
    .rename_axis('247_IDYR')
)
df_247

Unnamed: 0_level_0,ID,PlayerName,Year,StandardizedPosition,KeyPositionGroup
247_IDYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
paulthurston_nebraska_2012,paulthurston_nebraska,paulthurston,2012,OL,1
gregmcmullen_nebraska_2012,gregmcmullen_nebraska,gregmcmullen,2012,DE,2
jordanwesterkamp_nebraska_2012,jordanwesterkamp_nebraska,jordanwesterkamp,2012,WR,1
imanicross_nebraska_2012,imanicross_nebraska,imanicross,2012,RB,1
tommyarmstrong_nebraska_2012,tommyarmstrong_nebraska,tommyarmstrong,2012,QB,1
...,...,...,...,...,...
dondrellharris_troy_2013,dondrellharris_troy,dondrellharris,2013,QB,1
jordanchunn_troy_2013,jordanchunn_troy,jordanchunn,2013,FB,1
bryanslater_troy_2013,bryanslater_troy,bryanslater,2013,DE,2
clarkquisenberry_troy_2013,clarkquisenberry_troy,clarkquisenberry,2013,WR,1


In [15]:
# Load the unlinked All-Conference rows (ID, PlayerName, Year), keyed by ID;
# the index is relabelled 'AllConf_ID'.
SQL = '''SELECT ID, PlayerName, Year from UnlinkedAllConference'''
df_ac = (
    fx.connDBAndReturnDF(SQL)
    .set_index('ID')
    .rename_axis('AllConf_ID')
)
df_ac

Unnamed: 0_level_0,PlayerName,Year
AllConf_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
bennysnelljr_kentucky,bennysnelljr,2017
martinasrankin_missst,martinasrankin,2017
haydenhurst_southcarolina,haydenhurst,2017
montezsweat_missst,montezsweat,2017
jefferysimmons_missst,jefferysimmons,2017
...,...,...
warddobbs_wyoming,warddobbs,2008
dmay_wyoming,dmay,2016
austinconway_wyoming,austinconway,2019
johnhoyland_wyoming,johnhoyland,2020


### NOTES

> You don't have position data, so you are doing the best you can here. Blocking on an exact name match isn't great, but changing it means changing how you collect from Wikipedia, because you'll need to fetch Position data so you can derive the position group and block on that.

In [6]:
# Candidate-pair generator: block on PlayerName, so only records whose
# PlayerName values are identical in both frames are paired up.
# A single Index with a block rule replaces the earlier dead assignment that
# was immediately overwritten by the legacy `BlockIndex` class.
indexer = recordlinkage.Index()
indexer.block('PlayerName');  # trailing ';' keeps the cell from echoing the indexer repr

In [7]:
# Ask the indexer for the candidate record pairs between df_ac and df_247.
pairs = indexer.index(df_ac, df_247)

In [8]:
# Configure the per-pair comparisons. `sumFields` collects the label of every
# comparison that should contribute to the averaged score computed later.
sumFields = []
c = recordlinkage.Compare()

# String similarity of the two PlayerName values (Damerau-Levenshtein).
c.string('PlayerName', 'PlayerName', method='damerau_levenshtein', label='PlayerName')
sumFields.append('PlayerName')

# TODO: position comparisons stay disabled until the All-Conference data
# carries position information; re-enable them once it does.
#c.exact('StandardizedPosition', 'StandardizedPosition', label='StandardizedPosition')
#sumFields.append('StandardizedPosition')
#c.exact('KeyPositionGroup', 'KeyPositionGroup', label='KeyPositionGroup')
#sumFields.append('KeyPositionGroup')

# Project-specific Year comparison from the `functions` module (already
# imported as `fx` at the top of the notebook, so no extra import is needed).
# NOTE(review): its exact scoring rule lives in functions.py - confirm there.
c.add(fx.YearNFL('Year', 'Year', label='Year'))
sumFields.append('Year')

In [10]:
# Run the comparisons configured on `c` over the candidate pairs.
features = c.compute(pairs, df_ac, df_247)

In [11]:
# Overall score per pair = plain average of the compared fields.
# The row-wise sum is taken directly from the frame rather than accumulated in
# a variable called `sum`, which would shadow the builtin for the rest of the
# kernel session. skipna=False keeps a missing field score propagating as NaN.
features['Sum'] = features[sumFields].sum(axis=1, skipna=False) / len(sumFields)

# Keep only pairs whose Year comparison scored exactly 1.0. The explicit copy
# makes the result an independent frame, so columns can be added to it later
# without pandas raising SettingWithCopyWarning.
features = features[features['Year'] == 1.0].copy()

features

Unnamed: 0_level_0,Unnamed: 1_level_0,PlayerName,Year,Sum
AllConf_ID,247_IDYR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
martinasrankin_missst,martinasrankin_mississippistate_2015,1.0,1.0,1.0
montezsweat_missst,montezsweat_michiganstate_2014,1.0,1.0,1.0
jrreed_georgia,jrreed_tulsa_2014,1.0,1.0,1.0
fredross_missst,fredross_mississippistate_2013,1.0,1.0,1.0
landondickerson_alabama,landondickerson_floridastate_2016,1.0,1.0,1.0
...,...,...,...,...
mikewhite_westernkentucky,mikewhite_southflorida_2013,1.0,1.0,1.0
keithbrown_westernkentucky,keithbrown_louisville_2012,1.0,1.0,1.0
bricengarner_westernmichigan,bricengarner_pittsburgh_2016,1.0,1.0,1.0
keithmixon_westernmichigan,keithmixon_mississippistate_2015,1.0,1.0,1.0


In [12]:
# Minimum averaged score a candidate pair must exceed to be kept.
MIN_LINK_SCORE = .94

# Work on a fresh frame instead of inserting columns into `features` itself:
# the cell can then be re-run without "column already exists" errors, and any
# ID columns left over from an earlier run are dropped first.
candidates = features.drop(columns=['sourceID', 'targetID'], errors='ignore')
candidates.insert(0, 'sourceID', candidates.index.get_level_values(0))
candidates.insert(1, 'targetID', candidates.index.get_level_values(1))

# For each source record (index level 0) pick the best-scoring candidate;
# idxmax returns the first row on ties, and groups come back sorted by key.
best_pairs = candidates.groupby(level=0)['Sum'].idxmax().tolist()
best_rows = candidates.loc[best_pairs]

# Selecting rows directly (rather than DataFrame.append, which is deprecated
# and removed in pandas 2.0) also keeps column dtypes and index level names.
dfFinal = best_rows[best_rows['Sum'] > MIN_LINK_SCORE]
dfFinal.to_csv("resultsAllConferenceTransfer.csv")

In [None]:
# Show dfFinal, the frame that was written to resultsAllConferenceTransfer.csv.
dfFinal

In [16]:
# Number of candidate pairs exported for manual annotation.
MAX_ANNOTATION_PAIRS = 200

# Build the pair index from the two ID columns only. MultiIndex.from_frame
# creates one level per column, so passing the whole frame would also turn the
# score columns into index levels instead of yielding (source, target) pairs.
fuzzyMI = pd.MultiIndex.from_frame(dfFinal[['sourceID', 'targetID']])
recordlinkage.write_annotation_file(
    "../Annotations/Annotations/annotation_ACTransfers.json",
    fuzzyMI[0:MAX_ANNOTATION_PAIRS],
    df_ac,
    df_247,
    dataset_a_name="All Conference",
    dataset_b_name="Master"
)

In [17]:
# Read the manually annotated result file and collect the pairs marked as links.
# Errors are deliberately NOT swallowed here: if reading fails, the notebook
# must stop rather than continue with a missing or stale `annotation_dict`,
# because the next step writes these pairs to the database.
annotation = recordlinkage.read_annotation_file("..//Annotations//Results//result_ACTransfers.json")

confirmed_links = annotation.links
# NOTE(review): `links` appears to be None when nothing was annotated as a
# match - confirm against the recordlinkage version in use.
if confirmed_links is None:
    annotation_dict = []
else:
    # Flatten the MultiIndex into an index of (source, target) tuples.
    annotation_dict = confirmed_links.to_flat_index()

In [19]:
# Persist every annotated link as a row in RecordLinks.

#MAKE SURE YOU UPDATE THE THIRD VALUE TO THE CORRECT KEYDATASET!!
KEY_DATASET = 4
KEY_LINK_TYPE = 1
LINK_CONFIDENCE = 1
TRANSFER = 1

query = '''INSERT INTO RecordLinks(MasterID, TargetID, KeyDataSet, KeyLinkType, LinkConfidence, Transfer)
        VALUES (?,?,?,?,?,?)'''

# record[0] / record[1] are the two members of each annotated pair.
link_rows = [
    (record[0], record[1], KEY_DATASET, KEY_LINK_TYPE, LINK_CONFIDENCE, TRANSFER)
    for record in annotation_dict
]

# One connection and one transaction for the whole batch: either every link is
# stored or none is, and the connection is closed even if the insert fails or
# there is nothing to insert.
# NOTE(review): re-running this cell inserts the same rows again unless the
# RecordLinks table enforces uniqueness - verify the schema.
conn = sql.connect(cc.databaseName)
try:
    conn.executemany(query, link_rows)
    conn.commit()
finally:
    conn.close()