# Source: Rivals

In [None]:
#hide
import core_constants as cc
import functions as fx
import json
import pandas as pd
import sqlite3 as sql
import os
import recordlinkage

## Set Notebook Settings

In [None]:
# --- What to process -------------------------------------------------------
conference = 'sunbelt'
dataset = 'Rivals'

# --- Shared configuration pulled from core_constants ------------------------
years = cc.get_defYears()
headers = cc.get_header()
schoolsList = cc.get_schoolsList()

# --- Locations of the saved Rivals HTML and of the reviewed annotations ------
teamDirectory = cc.get_htmlDir('rivals', conference, 'teams')
playerDirectory = cc.get_htmlDir('rivals', conference, 'recruits')
resultsDirectory = '..//Annotations//Results/Rivals//'

## Get & Save the Teams & Players Page HTML
#### Source: https://maryland.rivals.com/commitments/football/2012
> This page contains metadata of each player along with the Rivals ranking and stars.  Unlike 247Sports, we process the fetch and save of both pages directly from a single function

In [None]:
# Fetch and save the Rivals team and player pages for the configured
# conference and years in a single call.
# NOTE(review): sleepyTime=6 is presumably the pause between requests
# (units not visible here) -- confirm in functions.py.
fx.get_Rivals(conference, years, headers, sleepyTime=6)

## Process Local Rivals HTML Files

> All of this processing is done locally, using the files saved in the previous few steps.  This creates an exhaustive store of all the fields grabbed from the scrapes.

> WARN: There is a different process_rivals script in the functions.py file, and it probably needs to be integrated into process_Rivals before you re-run this.

In [None]:
# Parse the locally saved player pages, then persist the parsed records under
# the 'scrapedData' store with a per-conference name.
rivals_records = fx.process_Rivals(playerDirectory, conference)
cc.save_records('scrapedData', 'rivals_' + conference, rivals_records)

## Clear DB
> Useful for a clean start.  This removes all of the records for this dataset from the following structures: SourcedPlayers, RecordLinks.  All of the Views auto-cleanse themselves.

In [None]:
# Remove this dataset's existing records for a clean start (per the notebook
# notes this covers SourcedPlayers and RecordLinks).
fx.clearDB(dataset)

## Save to Database

In [None]:
# Load the Rivals records into the database.
# NOTE(review): takes no arguments, so it presumably reads the records saved
# in the processing step -- confirm in functions.py.
fx.toDB_Rivals()

## Strict Matching
> This saves it to RecordLinking where ID == ID, but returns IDYR as the matching target

In [None]:
# Record the strict (ID == ID) matches for this dataset.
fx.literalLinking(dataset)

## Fuzzy Matching w/ Threshold

> This finds fuzzy matches above a certain threshold. It was intended to push them into the DB automatically without the need for review, but that last part isn't true: nothing is written automatically at the moment.

In [None]:
# Fuzzy-match this dataset against 'Sports247'; the result is kept in fuzzyDF
# because the annotation and insert cells read it.
fuzzyDF = fx.doFuzzyMatching(dataset, 'Sports247')

## Create the Annotation File

> This changes the dataframe into a MultiIndex data frame that the annotation function requires

> !Important - this needs to be updated to reflect the shape/size of the number of Rivals records to annotate

In [None]:
# Build the annotation file used to manually review candidate links between
# unlinked Rivals records and the master (KeyDataSet = 1) records.

# Output location and the number of candidate pairs to include.
# !Important - update max_annotation_pairs to reflect how many Rivals records
# need to be annotated.
annotation_path = "../Annotations/Annotations/annotation_RIVALS_92522.json"
max_annotation_pairs = 300

# Only these columns are shown to the annotator; IDYR becomes the index.
annotation_columns = ['IDYR', 'College', 'Year', 'Position']

# Read both sides of the comparison, then always release the connection
# (the original left it open).
conn = sql.connect(cc.databaseName)
try:
    master_rows = pd.read_sql_query('''
                               SELECT
                               *
                               FROM SourcedPlayers
                               WHERE KeyDataSet = 1
                               ''', conn)
    rivals_rows = pd.read_sql_query('''
                               SELECT
                               *
                               FROM UnlinkedRivals
                               ''', conn)
finally:
    conn.close()

df_247 = pd.DataFrame(master_rows, columns=annotation_columns).set_index('IDYR')
df_rivals = pd.DataFrame(rivals_rows, columns=annotation_columns).set_index('IDYR')

# NOTE(review): MultiIndex.from_frame turns every column of fuzzyDF into an
# index level; the annotation writer works on record pairs, so confirm that
# fuzzyDF holds only the two ID columns at this point.
fuzzyMI = pd.MultiIndex.from_frame(fuzzyDF)
recordlinkage.write_annotation_file(
    annotation_path,
    fuzzyMI[0:max_annotation_pairs],
    df_rivals,
    df_247,
    dataset_a_name="Rivals",
    dataset_b_name="Master"
)

## Read in the Annotation File
> Take the resulting Annotation file after handling the processing and insert it into the right table

In [None]:
# Load every reviewed annotation file in resultsDirectory and store the pairs
# the reviewer confirmed as links in RecordLinks.
insert_link_sql = '''INSERT INTO RecordLinks(MasterID, TargetID, KeyDataSet, KeyLinkType, LinkConfidence)
            VALUES (?,?,?,?,?)'''

# One connection for the whole run, closed in `finally`. The original opened
# a new connection per record, never closed any but the last, and its
# trailing conn.close() depended on at least one record having been inserted.
conn = sql.connect(cc.databaseName)
try:
    for filename in os.listdir(resultsDirectory):
        f = os.path.join(resultsDirectory, filename)
        if not os.path.isfile(f):
            continue  # ignore sub-directories

        annotation = recordlinkage.read_annotation_file(f)
        try:
            links = annotation.links
            if links is None:
                continue  # nothing was marked as a link in this file

            for record in links.to_flat_index():
                # Fixed values: KeyDataSet=2, KeyLinkType=1, LinkConfidence=1.
                conn.execute(insert_link_sql, (record[0], record[1], 2, 1, 1))
                # Commit per record so rows inserted before a failure persist.
                conn.commit()
        except Exception as e:
            # Best effort: report the problem and move on to the next file.
            print(e)
finally:
    conn.close()

## Insert Annotations to RecordLinks

In [None]:
# Convert the fuzzy-match frame into a list of rows (each row is a list of
# that row's column values) for row-by-row insertion.
fuzzylist = fuzzyDF.values.tolist()

In [None]:
# Show the first row to check column positions (the insert cell reads
# positions 0, 1 and 9).
fuzzylist[0]

In [None]:
# Insert every fuzzy match as a RecordLinks row.
insert_fuzzy_sql = '''INSERT INTO RecordLinks(MasterID, TargetID, KeyDataSet, TargetKeyDataSet, KeyLinkType, LinkConfidence, Transfer)
        VALUES (?,?,?,?,?,?,?)'''

# One connection for the whole loop, closed in `finally`. The original opened
# a new connection per record and only ever closed the last one; its trailing
# conn.close() also relied on the loop having run at least once.
conn = sql.connect(cc.databaseName)
try:
    for record in fuzzylist:
        #MAKE SURE YOU UPDATE THE THIRD VALUE TO THE CORRECT KEYDATASET!!
        # Order: MasterID, TargetID, KeyDataSet, TargetKeyDataSet,
        # KeyLinkType, LinkConfidence (taken from column 9), Transfer.
        Values = [record[0], record[1], 2, 1, 1, record[9], 0]
        conn.execute(insert_fuzzy_sql, Values)
        # Commit per record so rows inserted before a failure persist.
        conn.commit()
finally:
    conn.close()