# Annotation Files for Manual Linking
>This notebook facilitates writing out non-linked players to an annotation file - then handling the write out to annotation files.  The files can then be loaded here to process: https://j535d165.github.io/recordlinkage-annotator/  Finally the result file can be loaded back into the j_notebook directory where it is picked up here and added to the Record Links database table.

In [16]:
import pandas as pd
import recordlinkage as rl
from recordlinkage.index import Block
import sqlite3 as sql
import functions as fx
import core_constants as cc
from recordlinkage.base import BaseCompareFeature

class CompareNonExactYears(BaseCompareFeature):

    def _compute_vectorized(self, s1, s2):
        """Compare zipcodes.

        If the zipcodes in both records are identical, the similarity
        is 0. If the first two values agree and the last two don't, then
        the similarity is 0.5. Otherwise, the similarity is 0.
        """

        # check if the zipcode are identical (return 1 or 0)
        sim = (s1 == s2).astype(float)

        # check the first 2 numbers of the distinct comparisons
        sim[(sim == 0) & (s2 >= s1 & s2 <= s1 + 5)] = 0.5

        return sim

## Load Datasets
> Load the source and target datasets.  Currently it is only written for 247 and Rivals but needs to be extended

In [10]:
keydatasets = [1, 3]
dataset_names = []
for ds in keydatasets:
    SQL = ''' SELECT * FROM DataSet'''
    datasets = (fx.connDBAndReturnDF(SQL)).to_dict('records')
    dataset = next(item for item in datasets if item["KeyDataSet"] == ds)
    SQL = """SELECT * FROM {}""".format(dataset['UnlinkedView'])
    
    if (ds == 1 or ds == 2):
        indexDs = 'IDYR'
    else:
        indexDs = 'ID'
    vars()[dataset['DataSet']] = (fx.connDBAndReturnDF(SQL)).set_index(indexDs)
    (vars()[dataset['DataSet']]).index.name = dataset['DataSet'] + indexDs
    (vars()[dataset['DataSet']])[indexDs] = (vars()[dataset['DataSet']]).index.get_level_values(0)
    dataset_names.append(vars() [dataset['DataSet']])

In [11]:
SQL = ''' SELECT TargetFieldName FROM Features where KeyMasterDataSet = {} and KeyTargetDataSet = {} and MatchType = "block"'''.format(keydatasets[0], keydatasets[1])
blockersRaw = (fx.connDBAndReturnDF(SQL)).to_dict('records')
blockers = []
for i in blockersRaw:
    for key,value in i.items():
        blockers.append(value)

indexer = rl.BlockIndex(on = blockers)
candidate_links = indexer.index(dataset_names[0], dataset_names[1])

In [15]:
dataset_names[0].dtypes

ID             object
PlayerName     object
Year            int64
College        object
HighSchool     object
City           object
State          object
Position       object
Height        float64
Weight        float64
IDYR           object
dtype: object

In [17]:
c = rl.Compare()

SQL = '''SELECT TargetFieldName, MatchType, Method FROM Features where KeyMasterDataSet = {} and KeyTargetDataSet = {} and MatchType <> "block"'''.format(keydatasets[0], keydatasets[1])
featureFields = (fx.connDBAndReturnDF(SQL).to_dict('records'))

for feature in featureFields:
    
    if (feature['MatchType'] == 'exact'):
        c.exact(feature['TargetFieldName'], feature['TargetFieldName'], label = feature['TargetFieldName'])
    elif(feature['MatchType'] == 'string' and feature['Method'] is None):
        c.string(feature['TargetFieldName'], feature['TargetFieldName'], label = feature['TargetFieldName'])
    elif(feature['MatchType'] == 'string' and feature['Method'] is not None):
        c.string(feature['TargetFieldName'], feature['TargetFieldName'], method = feature['Method'], label = feature['TargetFieldName'])
    elif(feature['MatchType'] == 'custom_year' and feature['Method'] is None):
        c.add(CompareNonExactYears(feature['TargetFieldName'], feature['TargetFieldName'], label = feature['TargetFieldName']))
try:
    features = c.compute(candidate_links, dataset_names[0], dataset_names[1])
except KeyError as e:
    print(e)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [8]:
sum = 0
for column in features:
    sum = sum + features[column]

print(len(featureFields))
features['Sum'] = sum/len(featureFields)

features.to_csv('test_features.csv')

NameError: name 'features' is not defined

## Create Blockers
> In this case, we are creating blockers on college and year to limit the number of possible matches.

#### !!!This should be extended to take into account fuzzy matching confidence level as well!!!

In [7]:
indexer = rl.BlockIndex(on=['College', 'Year'])
candidate_links = indexer.index(sports247, rivals)

## Create Annotation file
> This is actually terrible code.  Currently I'm manually updating by length.  Need to loop through.  TODO

In [8]:
rl.write_annotation_file(
    "..//Annotations//Annotations//annotate_rivals_14.json",
    candidate_links[7001:7500],
    sports247,
    rivals,
    dataset_a_name="247 Sports",
    dataset_b_name="Rivals")

## Read in Result
> Once you handled the above file in the annotator, the below code will verify that the result file is correctly located in the j_notebooks folder and will read it into a flat index - which makes it easier to insert into the db.

In [9]:
annotation = rl.read_annotation_file("..//Annotations//Results//result_4.json")
try:
    annotation_dict = (annotation.links).to_flat_index()
except Exception as e:
    print(e)

## Insert into the DB

In [10]:
for record in annotation_dict:
    Values = [record[0], record[1], 2, 1, 1]
    query = '''INSERT INTO RecordLinks(MasterID, TargetID, KeyDataSet, KeyLinkType, LinkConfidence)
        VALUES (?,?,?,?,?)'''
    
    conn = sql.connect(cc.databaseName)
    c = conn.cursor()
    
    c.execute(query, Values)
    conn.commit()
    
conn.close()