# Source: NCAA

In [7]:
#hide
import core_constants as cc
import functions as fx
import json
import pandas as pd
import sqlite3 as sql
import recordlinkage

## Set Notebook Settings

In [2]:
# Load NCAA Site Schema
# Each config file is read inside a `with` block so its handle is closed as
# soon as the JSON has been parsed, instead of being left open for the
# lifetime of the kernel.
with open('..//config//schools.json', "r") as schools_file:
    schoolsList = json.load(schools_file)
with open('..//config//ncaa_dates.json', "r") as dates_file:
    ncaaDates = json.load(dates_file)
conferences = cc.get_availableConferences()

In [3]:
# Name of the dataset this notebook works on; passed to the fx.* helpers below.
dataset = 'NCAA'

# Defaults supplied by core_constants.
years = cc.get_defYears()
headers = cc.get_header()

## Get & Save the Teams Page HTML
#### Source: https://stats.ncaa.org/team/392/roster/15280
> This page contains a roster for any given team in any year (using their own ids)

In [None]:
# Fetch the NCAA team pages for the schools/dates loaded from the config files.
# NOTE(review): per this section's heading the HTML is saved locally; the output
# location is decided inside functions.get_NCAA — confirm there.
fx.get_NCAA(schoolsList, ncaaDates)

## Clear DB
> Useful for a clean start. This removes all of the records for this dataset from the following structures: SourcedPlayers, RecordLinks. All of the Views auto-cleanse themselves.

In [None]:
# Clear the stored records for `dataset` ('NCAA') before a fresh load.
# NOTE(review): which tables are affected is decided inside functions.clearDB.
fx.clearDB(dataset)

## Process Local NCAA HTML Files

> All of this processing is done locally, using the files saved in the previous few steps.  This creates an exhaustive store of all the fields grabbed from the scrapes.

In [None]:
# Process the NCAA data for the available conferences and hand the result to
# cc.save_records. The first two arguments presumably name the output
# folder and file — TODO confirm in core_constants.
cc.save_records('scrapedData', 'ncaa', fx.process_NCAA(conferences))

## Save to DB

In [None]:
# Push the processed NCAA records into the database.
# NOTE(review): takes no arguments, so the input location and target table are
# resolved inside functions.toDB_NCAA — confirm there.
fx.toDB_NCAA()

## Strict Matching
> This saves exact matches (where ID == ID) to RecordLinks, but returns IDYR as the matching target

In [4]:
# Run the strict (exact-ID) linking step for `dataset` ('NCAA').
# The return value is not used here.
fx.literalLinking(dataset)

Connected to SQLite


## Fuzzy Matching w/ Threshold

> This finds fuzzy matches above a certain threshold. Note: these matches are *not* currently written to the DB automatically; they still need to be reviewed (see the annotation steps below).

In [5]:
# Fuzzy-match the NCAA dataset against 'Sports247' and keep the candidate pairs.
# NOTE(review): fuzzyDF is later passed to pd.MultiIndex.from_frame, so it is
# presumably a DataFrame with one column per side of the pair — confirm.
fuzzyDF = fx.doFuzzyMatching(dataset, 'Sports247')

## Create the Annotation File

> This converts the fuzzy-match DataFrame into a pandas MultiIndex of candidate pairs, which the annotation function requires

In [None]:
# Build the two lookup frames that are written into the annotation file
# alongside each candidate pair, then write the file for manual review.
conn = sql.connect(cc.databaseName)
try:
    # "Master" side: SourcedPlayers rows with KeyDataSet = 1, indexed by IDYR.
    sql_query = pd.read_sql_query ('''
                               SELECT
                               *
                               FROM SourcedPlayers
                               WHERE KeyDataSet = 1
                               ''', conn)
    df_247 = pd.DataFrame(sql_query, columns=['IDYR', 'College', 'Year', 'Position'])
    df_247.set_index('IDYR', append=False, inplace=True)

    # "NCAA" side: everything in UnlinkedNCAA, indexed by ID.
    sql_query = pd.read_sql_query ('''
                               SELECT
                               *
                               FROM UnlinkedNCAA
                               ''', conn)
    df_NCAA = pd.DataFrame(sql_query, columns=['ID', 'College'])
    df_NCAA.set_index('ID', append=False, inplace=True)
finally:
    # Both result sets are fully loaded into memory by now, so release the
    # database handle even if one of the reads failed.
    conn.close()

# The annotation writer expects the candidate pairs as a MultiIndex.
# NOTE(review): assumes fuzzyDF's columns are ordered (NCAA id, master id) to
# match the order df_NCAA / df_247 are passed below — confirm.
fuzzyMI = pd.MultiIndex.from_frame(fuzzyDF)

# Only the first 300 candidate pairs are written for review.
recordlinkage.write_annotation_file(
    "../Annotations/Annotations/annotation_ncaa2.json",
    fuzzyMI[0:300],
    df_NCAA,
    df_247,
    dataset_a_name="NCAA",
    dataset_b_name="Master"
)

## Read in the Annotation File
> Read the annotation results file produced by the manual review so that its confirmed links can be inserted into the right table

In [8]:
# Read the reviewed annotation file and flatten its confirmed links into an
# iterable of (first id, second id) tuples for the insert step.
annotation = recordlinkage.read_annotation_file("..//Annotations//Results//ncaa_results.json")

# `links` is expected to be None when no pair in the file was marked as a
# match (NOTE(review): confirm against the installed recordlinkage version).
# Handle that case explicitly so `annotation_dict` is always defined: a
# swallowed error here would otherwise leave the name missing, or still
# holding the links from a previous run of this cell.
links = annotation.links
if links is None:
    print("No links found in the annotation file")
    annotation_dict = []
else:
    annotation_dict = links.to_flat_index()

## Insert Annotations to RecordLinks

In [9]:
# Insert every reviewed link into RecordLinks.
#MAKE SURE YOU UPDATE key_dataset TO THE CORRECT KEYDATASET!!
key_dataset = 5
key_link_type = 1
link_confidence = 1

query = '''INSERT INTO RecordLinks(MasterID, TargetID, KeyDataSet, KeyLinkType, LinkConfidence)
        VALUES (?,?,?,?,?)'''

# One parameter tuple per link: (MasterID, TargetID, KeyDataSet, KeyLinkType, LinkConfidence).
rows = [
    (record[0], record[1], key_dataset, key_link_type, link_confidence)
    for record in annotation_dict
]

# Open a single connection for the whole batch (rather than one per record)
# and always close it, including when there are no rows or an insert fails.
conn = sql.connect(cc.databaseName)
try:
    # `with conn` commits on success and rolls back on error, so the batch is
    # written all-or-nothing.
    with conn:
        conn.executemany(query, rows)
finally:
    conn.close()