# All Conference Transfers

In [1]:
import json
import core_constants as cc
import functions as fx
import pandas as pd
import sqlite3 as sql
import recordlinkage

## Build out DataFrames

In [2]:
# Pull every row of the Transfers247 table and key the frame by its IDYR
# column, relabelling the index so it is distinguishable after linkage.
SQL = "SELECT * from Transfers247"
df_247 = (
    fx.connDBAndReturnDF(SQL)
    .set_index('IDYR')
    .rename_axis('247_IDYR')
)
df_247

Unnamed: 0_level_0,ID,PlayerName,Year,StandardizedPosition,KeyPositionGroup
247_IDYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
paulthurston_nebraska_2012,paulthurston_nebraska,paulthurston,2012,OL,1
gregmcmullen_nebraska_2012,gregmcmullen_nebraska,gregmcmullen,2012,DE,2
jordanwesterkamp_nebraska_2012,jordanwesterkamp_nebraska,jordanwesterkamp,2012,WR,1
imanicross_nebraska_2012,imanicross_nebraska,imanicross,2012,RB,1
tommyarmstrong_nebraska_2012,tommyarmstrong_nebraska,tommyarmstrong,2012,QB,1
...,...,...,...,...,...
dondrellharris_troy_2013,dondrellharris_troy,dondrellharris,2013,QB,1
jordanchunn_troy_2013,jordanchunn_troy,jordanchunn,2013,FB,1
bryanslater_troy_2013,bryanslater_troy,bryanslater,2013,DE,2
clarkquisenberry_troy_2013,clarkquisenberry_troy,clarkquisenberry,2013,WR,1


In [3]:
# Pull the unlinked All-Conference players and key the frame by player ID,
# relabelling the index so it is distinguishable after linkage.
SQL = "SELECT ID, PlayerName, Year, StandardizedPosition, KeyPositionGroup from UnlinkedAllConference"
df_ac = (
    fx.connDBAndReturnDF(SQL)
    .set_index('ID')
    .rename_axis('AllConf_ID')
)
df_ac

Unnamed: 0_level_0,PlayerName,Year,StandardizedPosition,KeyPositionGroup
AllConf_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
joeyashcroft_airforce,joeyashcroft,2002,K,3
anthonyschlegal_airforce,anthonyschlegal,2002,LB,2
wescrawley_airforce,wescrawley,2002,DB,2
marchellograddy_airforce,marchellograddy,2003,LB,2
drewfowler_airforce,drewfowler,2006,LB,2
...,...,...,...,...
dericyaussi_wyoming,dericyaussi,2004,K,3
zachmorris_wyoming,zachmorris,2004,DL,2
jovonbouknight_wyoming,jovonbouknight,2005,WR,1
devinmoore_wyoming,devinmoore,2008,RB,1


### NOTES

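> You don't have position data, so you are doing the best you can here. Blocking on an exact name match isn't great, but to change this you'll have to change how you collect from Wikipedia, because you'll need to fetch Position data so you can derive the position group and block on that. (Note: `df_ac` above does carry `StandardizedPosition` and `KeyPositionGroup`, and the indexer below blocks on `KeyPositionGroup`, so this note may be out of date.)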

In [4]:
# Candidate-pair generator: a pair is only produced when both records have the
# same KeyPositionGroup value. Registering the blocking rule on the Index
# object replaces the legacy BlockIndex class (and the throw-away Index()
# instance that used to be overwritten immediately).
indexer = recordlinkage.Index()
indexer.block(left_on='KeyPositionGroup', right_on='KeyPositionGroup')

In [5]:
# Build the candidate (AllConf_ID, 247_IDYR) pairs between the two frames using the indexer configured above.
pairs = indexer.index(df_ac, df_247)

In [6]:
from functions import YearNFL

# Comparison rules applied to every candidate pair; each rule yields one
# feature column named by its label.
c = recordlinkage.Compare()

# Fuzzy similarity on the normalised player name.
c.string('PlayerName', 'PlayerName', method='damerau_levenshtein', label='PlayerName')

# Exact-match indicators on the two position columns.
for exact_field in ('StandardizedPosition', 'KeyPositionGroup'):
    c.exact(exact_field, exact_field, label=exact_field)

# Custom year comparison supplied by the project's functions module.
# NOTE(review): its scoring rule is not visible here - confirm in functions.py.
c.add(YearNFL('Year', 'Year', label='Year'))

# Feature columns that are averaged into the overall score later on
# (same order as the rules were registered).
sumFields = ['PlayerName', 'StandardizedPosition', 'KeyPositionGroup', 'Year']

In [7]:
# Score every candidate pair with the comparison rules registered on `c`; one column per rule label.
features = c.compute(pairs, df_ac, df_247)

In [8]:
# Overall score per pair = mean of the per-field comparison scores.
# A row-wise sum is used instead of accumulating into a variable called `sum`,
# which used to shadow the builtin for the rest of the kernel session.
# skipna=False keeps the previous behaviour: a missing field score makes the
# overall score missing rather than silently being treated as 0.
features['Sum'] = features[sumFields].sum(axis=1, skipna=False) / len(sumFields)

# Keep only pairs whose Year comparison scored exactly 1.0. Take an explicit
# copy so later column additions operate on an independent frame.
features = features[features['Year'] == 1.0].copy()

features

Unnamed: 0_level_0,Unnamed: 1_level_0,PlayerName,StandardizedPosition,KeyPositionGroup,Year,Sum
AllConf_ID,247_IDYR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
zachzachpaul_akron,kylefederico_rutgers_2012,0.000000,0,1,1.0,0.500000
zachzachpaul_akron,connorkornbrath_iowa_2012,0.066667,0,1,1.0,0.516667
zachzachpaul_akron,tayborpepper_michiganstate_2012,0.166667,0,1,1.0,0.541667
zachzachpaul_akron,mattwile_michigan_2011,0.083333,0,1,1.0,0.520833
zachzachpaul_akron,thomasmeadows_purdue_2012,0.076923,1,1,1.0,0.769231
...,...,...,...,...,...,...
donbarclay_westvirginia,paulnichols_arizona_2005,0.090909,0,1,1.0,0.522727
donbarclay_westvirginia,kemontebateman_arizonastate_2008,0.357143,0,1,1.0,0.589286
donbarclay_westvirginia,cammontgomery_northtexas_2007,0.230769,0,1,1.0,0.557692
donbarclay_westvirginia,haroldmoleni_utahstate_2008,0.083333,0,1,1.0,0.520833


In [9]:
# Minimum averaged score a best candidate must exceed to be kept.
MATCH_THRESHOLD = .6

# Expose both index levels as leading columns (positions 0 and 1). Guarded so
# that re-running the cell does not fail with "column already exists".
for position, column in enumerate(('sourceID', 'targetID')):
    if column not in features.columns:
        features.insert(position, column, features.index.get_level_values(position))

# For every All-Conference player keep only the highest-scoring candidate,
# and only when that score clears the threshold.
filteredList = []
for idx, data in features.groupby(level=0):
    data = data.loc[data['Sum'].idxmax()]
    if (data['Sum'] > MATCH_THRESHOLD):
        filteredList.append(data)

# Build the result in one step; DataFrame.append was removed in pandas 2.0.
dfFinal = pd.DataFrame(filteredList)
dfFinal.to_csv("resultsAllConferenceTransfer.csv")

In [None]:
# Display the best-candidate rows that were written to resultsAllConferenceTransfer.csv.
dfFinal

In [16]:
# The annotation tool expects record *pairs*, so build the MultiIndex from the
# two ID columns only. Passing the whole frame created one index level per
# column (scores included) and only worked because the IDs came first.
fuzzyMI = pd.MultiIndex.from_frame(dfFinal[['sourceID', 'targetID']])

# Write the first 200 candidate pairs out for manual review.
recordlinkage.write_annotation_file(
    "../Annotations/Annotations/annotation_ACTransfers.json",
    fuzzyMI[0:200],
    df_ac,
    df_247,
    dataset_a_name="All Conference",
    dataset_b_name="Master"
)

In [17]:
# Load the manually reviewed annotation results.
annotation = recordlinkage.read_annotation_file("..//Annotations//Results//result_ACTransfers.json")

# Always (re)bind `annotation_dict` so the insert cell can never pick up a
# stale value from an earlier run. Previously a failure here was only printed,
# leaving the name undefined or pointing at old links.
links = annotation.links
if links is None or len(links) == 0:
    # Nothing was marked as a link in the annotation file.
    print("No links found in the annotation file; nothing to insert.")
    annotation_dict = []
else:
    # Flatten the pair MultiIndex into (source, target) tuples.
    annotation_dict = links.to_flat_index()

In [19]:
#MAKE SURE YOU UPDATE THE THIRD VALUE TO THE CORRECT KEYDATASET!!
# One row per confirmed link, in the column order of the INSERT below:
# (MasterID, TargetID, KeyDataSet, KeyLinkType, LinkConfidence, Transfer).
Values = [(record[0], record[1], 4, 1, 1, 1) for record in annotation_dict]
query = '''INSERT INTO RecordLinks(MasterID, TargetID, KeyDataSet, KeyLinkType, LinkConfidence, Transfer)
        VALUES (?,?,?,?,?,?)'''

# Open a single connection for the whole batch. Previously a new connection
# was opened per row and only the last one closed, the cursor was bound to `c`
# (clobbering the Compare object), and the trailing close() raised NameError
# when there were no links.
conn = sql.connect(cc.databaseName)
try:
    # The connection context manager commits on success and rolls back on
    # error, so the batch is inserted atomically.
    with conn:
        conn.executemany(query, Values)
finally:
    conn.close()