In [1]:
import json
import core_constants as cc
import functions as fx
import pandas as pd
import sqlite3 as sql
import recordlinkage

## Building out the Dataframes

In [2]:
# Fetch every row of the Transfers247_Unranked table via the project's DB helper
# (presumably returns a pandas DataFrame -- it is indexed with set_index below).
SQL = '''SELECT * from Transfers247_Unranked '''
raw_247 = fx.connDBAndReturnDF(SQL)

# Key the frame by its IDYR column and label the index '247_IDYR' so it can be
# told apart from the other dataset's IDs once the two are paired.
df_247 = raw_247.set_index('IDYR')
df_247.index.name = '247_IDYR'
df_247

Unnamed: 0_level_0,ID,PlayerName,Year,StandardizedPosition,KeyPositionGroup
247_IDYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
thomasbrown_nebraska_2012,thomasbrown_nebraska,thomasbrown,2012,LB,2
zaireanderson_nebraska_2012,zaireanderson_nebraska,zaireanderson,2012,LB,2
lamarrwoodley_michigan_2003,lamarrwoodley_michigan,lamarrwoodley,2003,LB,2
prescottburgess_michigan_2003,prescottburgess_michigan,prescottburgess,2003,DB,2
shawncrable_michigan_2003,shawncrable_michigan,shawncrable,2003,DE,2
...,...,...,...,...,...
brettmedforth_troy_2013,brettmedforth_troy,brettmedforth,2013,OL,1
cardelllue_troy_2013,cardelllue_troy,cardelllue,2013,DB,2
zachmoore_troy_2013,zachmoore_troy,zachmoore,2013,LB,2
sethroberts_troy_2013,sethroberts_troy,sethroberts,2013,DE,2


In [3]:
# Fetch every row of the UnlinkedNCAA table via the project's DB helper
# (presumably returns a pandas DataFrame -- it is indexed with set_index below).
SQL = '''SELECT * from UnlinkedNCAA '''
raw_ncaa = fx.connDBAndReturnDF(SQL)

# Key the frame by its ID column and label the index 'NCAA_ID' so it can be
# told apart from the 247 dataset's IDs once the two are paired.
df_ncaa = raw_ncaa.set_index('ID')
df_ncaa.index.name = 'NCAA_ID'
df_ncaa

Unnamed: 0_level_0,PlayerName,Year,StandardizedPosition,KeyPositionGroup
NCAA_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aadarkwah_massachusetts,aadarkwah,2017,WR,1
aadreekisconner_mississippistate,aadreekisconner,2019,DB,2
aarenvaughns_utahstate,aarenvaughns,2017,WR,1
aarionharvey_centralmichigan,aarionharvey,2020,OL,1
aaronabbott_easternmichigan,aaronabbott,2015,DB,2
...,...,...,...,...
zionjohnson_bostoncollege,zionjohnson,2019,OL,1
zionjohnson_louisville,zionjohnson,2019,OL,1
zionkelly_airforce,zionkelly,2020,DB,2
zionsales_missouri,zionsales,2019,DB,2


In [4]:
# Candidate-pair generator: only records whose KeyPositionGroup values are equal
# are paired with each other.
# The previous version created an Index() and immediately replaced it with the
# legacy BlockIndex class; Index().block(...) is the supported way to express
# the same blocking rule.
indexer = recordlinkage.Index()
indexer.block(left_on='KeyPositionGroup', right_on='KeyPositionGroup')


In [5]:
# Build the candidate pairs between the two frames: df_ncaa is the left
# (first index level) dataset and df_247 the right (second index level).
pairs = indexer.index(df_ncaa, df_247)

In [6]:
from functions import YearOther

# Labels of the comparison features that are averaged into the match score later.
# Kept in the same order in which the features are registered below.
sumFields = ['PlayerName', 'StandardizedPosition', 'KeyPositionGroup', 'Year']

# Comparison engine: one labelled feature per entry in sumFields.
c = recordlinkage.Compare()

# Fuzzy name similarity (Damerau-Levenshtein).
c.string('PlayerName', 'PlayerName', method='damerau_levenshtein', label='PlayerName')

# Exact-equality features on the position columns.
for exact_field in ('StandardizedPosition', 'KeyPositionGroup'):
    c.exact(exact_field, exact_field, label=exact_field)

# Project-specific Year comparison defined in functions.py
# (NOTE(review): its scoring rule is not visible here -- see YearOther).
c.add(YearOther('Year', 'Year', label='Year'))


In [7]:
# Evaluate every registered comparison for each candidate pair; the result has
# one row per pair and one column per comparison label.
features = c.compute(pairs, df_ncaa, df_247)

In [8]:
# Average the per-field comparison scores into a single match score per pair.
# A dedicated accumulator name is used so the builtin `sum` is not shadowed for
# the rest of the kernel session.
score_total = 0
for field in sumFields:
    score_total = score_total + features[field]

features['sum'] = score_total / len(sumFields)

# Keep only the pairs whose Year comparison scored exactly 1.0.
# .copy() makes the result an independent frame, so later cells can add columns
# to it without triggering SettingWithCopyWarning.
features = features[features['Year'] == 1.0].copy()

features

Unnamed: 0_level_0,Unnamed: 1_level_0,PlayerName,StandardizedPosition,KeyPositionGroup,Year,sum
NCAA_ID,247_IDYR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aadarkwah_massachusetts,vancematthews_rutgers_2012,0.153846,1,1,1.0,0.788462
aadarkwah_massachusetts,bryanstonkus_rutgers_2012,0.083333,0,1,1.0,0.520833
aadarkwah_massachusetts,brandonarcidiacono_rutgers_2012,0.277778,0,1,1.0,0.569444
aadarkwah_massachusetts,boonemyers_iowa_2013,0.000000,0,1,1.0,0.500000
aadarkwah_massachusetts,connorkeane_iowa_2013,0.272727,1,1,1.0,0.818182
...,...,...,...,...,...,...
zekezaragoza_oklahomastate,evanrabon_coastalcarolina_2015,0.333333,0,1,1.0,0.583333
zekezaragoza_oklahomastate,willross_coastalcarolina_2019,0.166667,0,1,1.0,0.541667
zekezaragoza_oklahomastate,aaronsears_arkansasstate_2018,0.083333,0,1,1.0,0.520833
zekezaragoza_oklahomastate,noahkarwacki_southalabama_2019,0.166667,0,1,1.0,0.541667


In [9]:
# Minimum averaged score a best candidate must exceed to be accepted as a match.
MATCH_THRESHOLD = .949

# Work on a copy so this cell can be re-run: inserting the ID columns straight
# into `features` raises "already exists" on the second execution.
scored = features.copy()
scored.insert(0, 'sourceID', scored.index.get_level_values(0))
scored.insert(1, 'targetID', scored.index.get_level_values(1))

# For every source record (index level 0) keep its single highest-scoring
# candidate, and only if that score clears the threshold.
filteredList = []
for _, candidates in scored.groupby(level=0):
    best = candidates.loc[candidates['sum'].idxmax()]
    if best['sum'] > MATCH_THRESHOLD:
        filteredList.append(best)

# DataFrame.append was removed in pandas 2.0; build the frame from the list of
# row Series in one step instead.
dfFinal = pd.DataFrame(filteredList)
dfFinal.to_csv("resultsNCAATransfer.csv")

## Everything below writes out the fuzzy matches above a specific threshold (.983 at the time of this note; the filter cell above currently uses .949)
> Next I have to review the next 500 records for matches

In [None]:
# Flatten the accepted matches into plain Python lists, one list per row,
# with values in dfFinal's column order.
links = dfFinal.values.tolist()
# Show the first row as a quick check of that layout (raises IndexError if
# nothing cleared the threshold).
links[0]

In [None]:
# Persist the accepted fuzzy matches to the RecordLinks table.
#MAKE SURE YOU UPDATE THE THIRD VALUE TO THE CORRECT KEYDATASET!!
KEY_DATASET = 5
KEY_LINK_TYPE = 3
TRANSFER = 1

# Each row of `links` follows dfFinal's column order. Look the score column up
# by name: the previous hard-coded position 5 is the 'Year' column (always 1.0
# after filtering), so the real match score was never stored as LinkConfidence.
# NOTE(review): confirm that LinkConfidence is meant to hold the 'sum' score.
confidence_pos = dfFinal.columns.get_loc('sum') if links else None

query = '''INSERT INTO RecordLinks(MasterID, TargetID, KeyDataSet, KeyLinkType, LinkConfidence, Transfer)
        VALUES (?,?,?,?,?,?)'''
rows = [
    [record[0], record[1], KEY_DATASET, KEY_LINK_TYPE, record[confidence_pos], TRANSFER]
    for record in links
]

# One connection and one transaction for the whole batch. The old loop opened a
# new connection per record, closed only the last one, and failed with a
# NameError on conn.close() when there was nothing to insert.
conn = sql.connect(cc.databaseName)
try:
    conn.executemany(query, rows)
    conn.commit()
finally:
    conn.close()

## Create and save annotations

In [None]:
# Export the first 150 accepted pairs to an annotation file for manual review.
# Only the two ID columns make up a record pair; building the MultiIndex from
# the whole frame would add one index level per score column as well.
fuzzyMI = pd.MultiIndex.from_frame(dfFinal[['sourceID', 'targetID']])

# The first frame must be the one the sourceID values index into. That is
# df_ncaa in this notebook (`df_ac` is never defined here).
# NOTE(review): the output file name and the dataset labels below still read
# "AC" / "All Conference"; confirm whether they should be renamed for the NCAA run.
recordlinkage.write_annotation_file(
    "../Annotations/Annotations/annotation_ACTransfersSept.json",
    fuzzyMI[0:150],
    df_ncaa,
    df_247,
    dataset_a_name="All Conference",
    dataset_b_name="Master"
)

In [None]:
# Load the manually reviewed annotation results.
annotation = recordlinkage.read_annotation_file("..//Annotations//Results//result-ncaatransfers.json")

# Always define annotation_dict so the following cells never run against a
# missing or stale value. Previously any failure here was printed and swallowed.
# NOTE(review): `links` appears to be None when no pair was labelled a match --
# confirm against the installed recordlinkage version.
annotated_links = annotation.links
if annotated_links is None:
    print("No links were annotated as matches in the result file.")
    annotation_dict = []
else:
    # Flat index of (source ID, target ID) tuples, one per confirmed match.
    annotation_dict = annotated_links.to_flat_index()

In [None]:
# Show the first confirmed pair as a quick check before writing to the database.
print(annotation_dict[0])

In [None]:
# Persist the manually confirmed matches to the RecordLinks table.
#MAKE SURE YOU UPDATE THE THIRD VALUE TO THE CORRECT KEYDATASET!!
KEY_DATASET = 5
KEY_LINK_TYPE = 1
LINK_CONFIDENCE = 1
TRANSFER = 1

query = '''INSERT INTO RecordLinks(MasterID, TargetID, KeyDataSet, KeyLinkType, LinkConfidence, Transfer)
        VALUES (?,?,?,?,?,?)'''
# Each annotated record is a (source ID, target ID) pair.
rows = [
    [record[0], record[1], KEY_DATASET, KEY_LINK_TYPE, LINK_CONFIDENCE, TRANSFER]
    for record in annotation_dict
]

# One connection and one transaction for the whole batch. The old loop opened a
# new connection per record, closed only the last one, and failed with a
# NameError on conn.close() when there was nothing to insert.
conn = sql.connect(cc.databaseName)
try:
    conn.executemany(query, rows)
    conn.commit()
finally:
    conn.close()