# Source: Sports Reference - NFL Data

In [1]:
#hide
import json
import core_constants as cc
import functions as fx
import pandas as pd
import sqlite3 as sql
import recordlinkage

## Set Notebook Settings

In [2]:
# Notebook-wide settings; the cells below read these names.
dataset = 'NFL'  # dataset label passed to the fx.* helpers

# Scrape parameters supplied by core_constants.
years = cc.get_defYears()
headers = cc.get_header()
schoolsList = cc.get_schoolsList()

## Get, Process the NFL draft data from SR
#### Source: https://www.pro-football-reference.com/years/2017/draft.htm
> This page contains metadata for each draft pick, covering both draft position and current pro stats.

In [None]:
# Scrape and serialise BEFORE opening the output file. Opening with "w"
# truncates the file immediately, so doing the work inside the `with` block
# would wipe the previously scraped data whenever the scrape fails.
nfl_draft_json = json.dumps(fx.handle_nflData(years, headers, schoolsList))

with open("..//scrapedData//nfldraft.json", "w", encoding="utf-8") as write_file:
    write_file.write(nfl_draft_json)

## Clear DB
> Useful for a clean start.  This removes all of the records for this dataset from the following structures: SourcedPlayers, RecordLinks.  All of the Views auto-cleanse themselves.

In [None]:
# DESTRUCTIVE: clears the stored records for `dataset` so the load below
# starts from a clean slate. Run this cell deliberately, not as part of
# "Run All".
fx.clearDB(dataset)

## Save to DB

In [None]:
# Load the NFL draft data into the database.
# NOTE(review): takes no arguments -- presumably it reads the scraped
# nfldraft.json written earlier in this notebook; confirm in functions.py.
fx.toDB_NFLDraft()

## Strict Matching
> This saves the links to RecordLinks where ID == ID, but returns IDYR as the matching target

In [None]:
# Strict (exact-ID) linking pass for this dataset.
# NOTE(review): presumably writes its matches straight to the DB -- confirm
# in functions.py before re-running, to avoid duplicate links.
fx.literalLinking(dataset)

## Fuzzy Matching w/ Threshold

> This is intended to automatically push fuzzy matches above a certain threshold into the DB without the need for review. [Note: the automatic write is not currently happening — nothing is written to the DB by this step.]

In [3]:
# Fuzzy-match the NFL dataset against the 'Sports247' dataset and keep the
# candidate pairs for the annotation step further down.
fuzzyDF = fx.doFuzzyMatching(dataset, 'Sports247')
# Bare expression: renders the candidate frame as the cell output.
fuzzyDF

Unnamed: 0,sourceID,targetID,ID,PlayerName,Year,sum
"(aaronlynch_southflorida, marlonmack_southflorida_2014)",aaronlynch_southflorida,marlonmack_southflorida_2014,0.739130,0.400000,0.25,0.463043
"(aaronmerz_california, aaronrodgers_california_2003)",aaronmerz_california,aaronrodgers_california_2003,0.782609,0.583333,0.00,0.455314
"(adamjennings_fresnostate, andyjennings_fresnostate_2009)",adamjennings_fresnostate,andyjennings_fresnostate_2009,0.875000,0.750000,0.00,0.541667
"(adamkieft_centralmichigan, adamantonides_centralmichigan_2005)",adamkieft_centralmichigan,adamantonides_centralmichigan_2005,0.724138,0.384615,0.25,0.452918
"(adamseward_unlv, adammeyer_unlv_2018)",adamseward_unlv,adammeyer_unlv_2018,0.733333,0.600000,0.00,0.444444
...,...,...,...,...,...,...
"(williebeavers_westernmichigan, williepope_westernmichigan_2002)",williebeavers_westernmichigan,williepope_westernmichigan_2002,0.793103,0.538462,0.00,0.443855
"(williereid_floridastate, willsecord_floridastate_2009)",williereid_floridastate,willsecord_floridastate_2009,0.826087,0.600000,0.00,0.475362
"(willsvitek_stanford, willpowers_stanford_2005)",willsvitek_stanford,willpowers_stanford_2005,0.684211,0.400000,0.25,0.444737
"(xaviercrawford_centralmichigan, joshuacrawford_centralmichigan_2019)",xaviercrawford_centralmichigan,joshuacrawford_centralmichigan_2019,0.800000,0.571429,0.25,0.540476


## Create the Annotation File

> This converts the fuzzy-match dataframe into a pandas MultiIndex, which the annotation function requires

In [None]:
# Build the two lookup frames the annotation file is written from:
#   df_247 -- SourcedPlayers rows with KeyDataSet = 1 (labelled "Master" below)
#   df_NFL -- rows of the UnlinkedNFL view/table (labelled "NFL" below)
conn = sql.connect(cc.databaseName)
try:
    sql_query = pd.read_sql_query('''
                               SELECT
                               *
                               FROM SourcedPlayers
                               WHERE KeyDataSet = 1
                               ''', conn)
    # Keep only the columns shown to the annotator.
    df_247 = pd.DataFrame(sql_query, columns=['IDYR', 'College', 'Year', 'Position'])

    sql_query = pd.read_sql_query('''
                               SELECT
                               *
                               FROM UnlinkedNFL
                               ''', conn)
    df_NFL = pd.DataFrame(sql_query, columns=['ID', 'College'])
finally:
    # The connection was previously never closed; release it even when a
    # query raises so the SQLite file is not left locked by this kernel.
    conn.close()

# Index each frame by the identifier that the candidate pairs refer to.
df_247.set_index('IDYR', append=False, inplace=True)
df_NFL.set_index('ID', append=False, inplace=True)

# NOTE(review): this turns EVERY column of fuzzyDF into an index level. The
# annotation writer only needs the (NFL id, master id) pair -- confirm that
# the first two levels are the source/target ids and the extra score levels
# are harmless.
fuzzyMI = pd.MultiIndex.from_frame(fuzzyDF)

# Only the first 300 candidate pairs are written out for manual review.
recordlinkage.write_annotation_file(
    "../Annotations/Annotations/annotation_nfl.json",
    fuzzyMI[0:300],
    df_NFL,
    df_247,
    dataset_a_name="NFL",
    dataset_b_name="Master"
)

## Read in the Annotation File
> Read in the resulting annotation file once it has been processed, so that its links can be inserted into the right table

In [None]:
annotation = recordlinkage.read_annotation_file("..//Annotations//Results//nfl_annotations.json")

# Reset before extracting. If the extraction below fails, the insert cell
# must not pick up links left in the kernel by an earlier run (or hit a
# NameError in a fresh kernel); an empty list simply inserts nothing.
annotation_dict = []
try:
    # Flatten the annotated links into an index of (id_a, id_b) tuples.
    annotation_dict = (annotation.links).to_flat_index()
except Exception as e:
    # Best-effort: report the problem and continue with no links.
    print(e)

## Insert Annotations to RecordLinks

In [None]:
# MAKE SURE YOU UPDATE THIS VALUE TO THE CORRECT KEYDATASET!!
key_dataset = 3

query = '''INSERT INTO RecordLinks(MasterID, TargetID, KeyDataSet, KeyLinkType, LinkConfidence)
        VALUES (?,?,?,?,?)'''

# One parameter tuple per annotated link: the two ids of the pair, then
# KeyDataSet, KeyLinkType = 1 and LinkConfidence = 1.
rows = [(record[0], record[1], key_dataset, 1, 1) for record in annotation_dict]

# Open a single connection for the whole batch. Previously a new connection
# was opened per record and only the last one was closed, and an empty
# `annotation_dict` made the final close() hit an unrelated/undefined `conn`.
conn = sql.connect(cc.databaseName)
try:
    conn.executemany(query, rows)
    # Single commit: either every link is stored or, on error, none are,
    # so a failed run can be repeated without creating duplicates.
    conn.commit()
finally:
    conn.close()