# Source: Sports Reference - NFL Data

In [1]:
#hide
import json
import core_constants as cc
import functions as fx
import pandas as pd
import sqlite3 as sql
import recordlinkage
import queries

## Set Notebook Settings

In [2]:
#years = cc.get_defYears()
years = [2021]
headers= cc.get_header()
schoolsList = cc.get_schoolsList()
dataset = 'NFL'

## Get, Process the NFL draft data from SR
#### Source: https://www.pro-football-reference.com/years/2017/draft.htm
> This page contains metadata of each draft pick, both in terms of draft position but also current pro stats.

In [None]:
with open("..//scrapedData//nfldraft_2021.json", "w", encoding="utf-8") as write_file:
                write_file.write(json.dumps(fx.handle_nflData(years, headers, schoolsList)))

## Clear DB
> Useful for a clean start.  This removes all of the records for this dataset from the following structures: SourcedPlayers, RecordLinks.  All of the Views auto-cleanse themselves.

In [None]:
fx.clearDB(dataset)

## Save to DB

In [None]:
fx.toDB_NFLDraft()

## Strict Matching
> This saves it to RecordLinking where ID == ID, but returns IDYR as the matching target

In [None]:
fx.literalLinking(dataset)

## Fuzzy Matching w/ Threshold

> This is automatically pushing fuzzy matches above a certain threshold into the DB without the need for review [last part isn't true!  not automatically writing currently]

In [3]:
fuzzyDF = fx.doFuzzyMatching(dataset, 'Sports247')
fuzzyDF.sort_values(by='sum', ascending=False)

Unnamed: 0,sourceID,targetID,ID,PlayerName,StandardizedPosition,KeyPositionGroup,Year,sum
"(averywilliams_boisestate, averywillliams_boisestate_2016)",averywilliams_boisestate,averywillliams_boisestate_2016,0.96,0.928571,0.5,0.5,0,0.577714
"(rachadwildgoose_wisconsin, rachadwildgoosejr_wisconsin_2018)",rachadwildgoose_wisconsin,rachadwildgoosejr_wisconsin_2018,0.925926,0.882353,0.5,0.5,0,0.561656
"(terracemarshalljr_lsu, terracemarshall_lsu_2018)",terracemarshalljr_lsu,terracemarshall_lsu_2018,0.904762,0.882353,0.5,0.5,0,0.557423
"(earnestbrowniv_northwestern, earnestbrown_northwestern_2017)",earnestbrowniv_northwestern,earnestbrown_northwestern_2017,0.925926,0.857143,0.5,0.5,0,0.556614
"(gregnewsomeii_northwestern, gregnewsome_northwestern_2018)",gregnewsomeii_northwestern,gregnewsome_northwestern_2018,0.923077,0.846154,0.5,0.5,0,0.553846
"(michaelcarter_duke, michaelcarterii_duke_2017)",michaelcarter_duke,michaelcarterii_duke_2017,0.9,0.866667,0.5,0.5,0,0.553333
"(karyvincentjr_lsu, karyvincent_lsu_2017)",karyvincentjr_lsu,karyvincent_lsu_2017,0.882353,0.846154,0.5,0.5,0,0.545701
"(richardlecounte_georgia, richardlecounteiii_georgia_2017)",richardlecounte_georgia,richardlecounteiii_georgia_2017,0.884615,0.833333,0.5,0.5,0,0.54359
"(larryrountree_missouri, larryrountreeiii_missouri_2017)",larryrountree_missouri,larryrountreeiii_missouri_2017,0.88,0.8125,0.5,0.5,0,0.5385
"(williamsherman_colorado, willsherman_colorado_2017)",williamsherman_colorado,willsherman_colorado_2017,0.869565,0.785714,0.5,0.5,0,0.531056


## Create the Annotation File

> This changes the dataframe into a MultiIndex data frame that the annotation function requires

In [4]:
conn = sql.connect(cc.databaseName) 
          
sql_query = pd.read_sql_query ('''
                               SELECT
                                   a.IDYR,
                                   a.PlayerName,
                                   a.College,
                                   a.year,
                                   b.StandardizedPosition,
                                   b.KeyPositionGroup
                               FROM SourcedPlayers as a
                                   inner join Positions as b
                                       ON a.Position = b.Position
                               WHERE a.KeyDataSet = 1
                              
                               ''', conn)

df_247 = pd.DataFrame(sql_query, columns = ['IDYR', 'PlayerName', 'Year','College', 'KeyPositionGroup', 'StandardizedPosition'])
df_247.set_index('IDYR', append=False, inplace=True)

year = 2021
query = queries.get_query_UnlinkedNFL(year)

df_nfl = (fx.connDBAndReturnDF(query)).set_index('ID')
fuzzyMI = pd.MultiIndex.from_frame(fuzzyDF)
recordlinkage.write_annotation_file(
    "../Annotations/Annotations/annotation_nfl.json",
    fuzzyMI[0:300],
    df_nfl,
    df_247,
    dataset_a_name="NFL",
    dataset_b_name="Master"
)

Unnamed: 0_level_0,PlayerName,Year,College,KeyPositionGroup,StandardizedPosition
IDYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
paulthurston_nebraska_2012,paulthurston,2012,nebraska,1,OL
gregmcmullen_nebraska_2012,gregmcmullen,2012,nebraska,2,DE
jordanwesterkamp_nebraska_2012,jordanwesterkamp,2012,nebraska,1,WR
imanicross_nebraska_2012,imanicross,2012,nebraska,1,RB
tommyarmstrong_nebraska_2012,tommyarmstrong,2012,nebraska,1,QB
...,...,...,...,...,...
brettmedforth_troy_2013,brettmedforth,2013,troy,1,OL
cardelllue_troy_2013,cardelllue,2013,troy,2,DB
zachmoore_troy_2013,zachmoore,2013,troy,2,LB
sethroberts_troy_2013,sethroberts,2013,troy,2,DE


MultiIndex([('aaronrobinson_centralflorida', ...),
            (    'averywilliams_boisestate', ...),
            (   'benjaminstjuste_minnesota', ...),
            ( 'earnestbrowniv_northwestern', ...),
            (  'gregnewsomeii_northwestern', ...),
            (           'karyvincentjr_lsu', ...),
            (      'larryrountree_missouri', ...),
            (          'michaelcarter_duke', ...),
            (         'nickbolton_missouri', ...),
            (   'rachadwildgoose_wisconsin', ...),
            (     'richardlecounte_georgia', ...),
            (       'terracemarshalljr_lsu', ...),
            (     'trenixon_centralflorida', ...),
            (           'trevonmoehrig_tcu', ...),
            (     'williamsherman_colorado', ...)],
           names=['sourceID', 'targetID', 'ID', 'PlayerName', 'StandardizedPosition', 'KeyPositionGroup', 'Year', 'sum'])


## Read in the Annotation File
> Take the resulting Annotation file after handling the processing and insert it into the right table

In [7]:
annotation = recordlinkage.read_annotation_file("..//Annotations//Results//result_nfl.json")
try:
    annotation_dict = (annotation.links).to_flat_index()
except Exception as e:
    print(e)

## Insert Annotations to RecordLinks

In [9]:
for record in annotation_dict:
    #MAKE SURE YOU UPDATE THE THIRD VALUE TO THE CORRECT KEYDATASET!!
    Values = [record[0], record[1], 3, 1, 1, 1]
    query = '''INSERT INTO RecordLinks(MasterID, TargetID, KeyDataSet, TargetKeyDataSet, KeyLinkType, LinkConfidence)
        VALUES (?,?,?,?,?,?)'''
    
    conn = sql.connect(cc.databaseName)
    c = conn.cursor()
    
    c.execute(query, Values)
    conn.commit()
    
conn.close()