In [None]:
import json
import sqlite3 as sql
import warnings

import numpy as np
import pandas as pd
import recordlinkage

import core_constants as cc
import functions as fx

In [None]:
# Read every row of the Transfers247 table, key the frame by its IDYR column,
# and label that index '247_IDYR' so it is distinguishable in linked pairs.
SQL = "SELECT * from Transfers247"
df_247 = fx.connDBAndReturnDF(SQL)
df_247 = df_247.set_index('IDYR')
df_247.index.name = '247_IDYR'
df_247

In [None]:
# Read every row of the UnlinkedNCAA table, key the frame by its ID column,
# and label that index 'NCAA_ID' so it is distinguishable in linked pairs.
SQL = "SELECT * from UnlinkedNCAA"
df_ncaa = fx.connDBAndReturnDF(SQL)
df_ncaa = df_ncaa.set_index('ID')
df_ncaa.index.name = 'NCAA_ID'
df_ncaa

In [None]:
from recordlinkage.base import BaseIndexAlgorithm

class BlockNCAA(BaseIndexAlgorithm):
    """Make candidate record pairs that agree on one or more variables.

    Returns all record pairs that agree on the given variable(s). This
    method is known as *blocking*. Blocking is an effective way to make a
    subset of the record space (A * B).

    Parameters
    ----------
    left_on : label, optional
        A column name or a list of column names of dataframe A. These
        columns are used to block on.
    right_on : label, optional
        A column name or a list of column names of dataframe B. These
        columns are used to block on. If 'right_on' is None, the `left_on`
        value is used. Default None.
    **kwargs :
        Additional keyword arguments to pass to
        :class:`recordlinkage.base.BaseIndexAlgorithm`. The deprecated
        keyword ``on`` is still accepted and is applied to both sides.

    Examples
    --------
    Pair up the rows of two frames that share the same value in a column:

    >>> indexer = BlockNCAA(left_on='KeyPositionGroup')
    >>> pairs = indexer.index(df_a, df_b)
    """

    def __init__(self, left_on=None, right_on=None, **kwargs):
        # 'on' is consumed here so it is not forwarded to the base class.
        on = kwargs.pop('on', None)
        # The original called super(Block, self); `Block` is not defined in
        # this file, so every instantiation raised NameError.
        super(BlockNCAA, self).__init__(**kwargs)

        # variables to block on
        self.left_on = left_on
        self.right_on = right_on

        if on is not None:
            warnings.warn(
                "The argument 'on' is deprecated. Use 'left_on=...' and "
                "'right_on=None' to simulate the behaviour of 'on'.",
                DeprecationWarning,
                stacklevel=2)
            self.left_on, self.right_on = on, on

    def __repr__(self):
        """Return a short description showing the effective blocking columns."""
        class_name = self.__class__.__name__
        left_on, right_on = self._get_left_and_right_on()

        return "<{} left_on={!r}, right_on={!r}>".format(
            class_name, left_on, right_on)

    def _get_left_and_right_on(self):
        """Return ``(left_on, right_on)``, reusing ``left_on`` when ``right_on`` is None."""
        if self.right_on is None:
            return (self.left_on, self.left_on)
        return (self.left_on, self.right_on)

    @staticmethod
    def _as_list(value):
        """Return *value* as a list: None -> [], tuple -> list, scalar -> [scalar]."""
        if value is None:
            return []
        if isinstance(value, list):
            return value
        if isinstance(value, tuple):
            return list(value)
        return [value]

    @staticmethod
    def _blocking_frame(frame, columns, blocking_keys, position_label):
        """Build the merge input for one side.

        The result holds the blocking columns renamed to *blocking_keys* plus
        a *position_label* column with each row's positional offset in
        *frame*. Rows with a missing blocking value are dropped last so the
        recorded positions still refer to the original frame.
        """
        keyed = frame[columns].copy()
        keyed.columns = blocking_keys
        keyed[position_label] = np.arange(len(frame))
        return keyed.dropna(axis=0, how='any', subset=blocking_keys)

    def _link_index(self, df_a, df_b):
        """Return a MultiIndex of (df_a, df_b) pairs whose blocking columns are equal.

        Parameters
        ----------
        df_a, df_b : pandas.DataFrame
            The two frames to link; they must contain the ``left_on`` and
            ``right_on`` columns respectively.

        Returns
        -------
        pandas.MultiIndex
            Level 0 holds ``df_a`` index values and level 1 holds ``df_b``
            index values, one entry per candidate pair.
        """
        left_on, right_on = self._get_left_and_right_on()
        left_on = self._as_list(left_on)
        right_on = self._as_list(right_on)

        # Both sides get the same neutral column names so they can be merged
        # even when the source column names differ.
        blocking_keys = ["blocking_key_%d" % i for i, _ in enumerate(left_on)]

        data_left = self._blocking_frame(df_a, left_on, blocking_keys, 'index_x')
        data_right = self._blocking_frame(df_b, right_on, blocking_keys, 'index_y')

        # An inner merge keeps exactly the row combinations that agree on
        # every blocking key.
        pairs_df = data_left.merge(data_right, how='inner', on=blocking_keys)

        # The recorded positions are used directly as codes into the original
        # index values of each frame.
        return pd.MultiIndex(
            levels=[df_a.index.values, df_b.index.values],
            codes=[pairs_df['index_x'].values, pairs_df['index_y'].values],
            verify_integrity=False)

In [None]:
from recordlinkage.base import BaseCompareFeature

class Year(BaseCompareFeature):
    """Compare feature that checks whether the second year is 0-4 years after the first."""

    def _compute_vectorized(self, s1, s2):
        """Score each candidate pair on the gap between its two years.

        Parameters
        ----------
        s1 : year values from the first frame, one per candidate pair.
        s2 : year values from the second frame, one per candidate pair.
            NOTE(review): assumes both are numeric and aligned pair-by-pair,
            as supplied by ``recordlinkage.Compare`` -- confirm.

        Returns
        -------
        Float values, 1.0 where ``s1 <= s2 <= s1 + 4`` and 0.0 otherwise.
        """
        # The original expression compared against s1[0] (the first row only)
        # and had misplaced parentheses, so `|` was applied before `==` in the
        # +2/+3/+4 terms. Working on the element-wise difference expresses the
        # intended "same year or up to four years later" test directly.
        gap = s2 - s1
        sim = ((gap >= 0) & (gap <= 4)).astype(float)

        return sim

In [None]:
# Block on KeyPositionGroup: only rows that share the same value in that
# column (on both frames) become candidate pairs. Built through
# Index().block(), which replaces the deprecated BlockIndex(on=...) spelling
# and the dead Index() assignment that was immediately overwritten.
indexer = recordlinkage.Index()
indexer.block(left_on='KeyPositionGroup')


In [None]:
# Build the candidate pairs between df_247 and df_ncaa using the indexer above.
pairs = indexer.index(df_247, df_ncaa)

In [None]:
# Register one comparison per field on the Compare object. Each comparison's
# label becomes a column of the feature table computed later.
c = recordlinkage.Compare()
c.string('PlayerName', 'PlayerName', method='damerau_levenshtein', label='PlayerName')
c.exact('StandardizedPosition', 'StandardizedPosition', label='StandardizedPosition')
c.exact('KeyPositionGroup', 'KeyPositionGroup', label='KeyPositionGroup')
c.add(Year('Year', 'Year', label='Year'))

# Labels of the comparisons that contribute to the combined score.
sumFields = [
    'PlayerName',
    'StandardizedPosition',
    'KeyPositionGroup',
    'Year',
]




In [None]:
# Run the comparisons registered on `c` over the candidate pairs; the result
# is indexed below by the comparison labels.
features = c.compute(pairs, df_247, df_ncaa)

In [None]:
# Combined score per candidate pair: the mean of the per-field scores listed
# in sumFields, stored in the 'sum' column. A row-wise mean replaces the
# running total that was named `sum`, which shadowed the builtin for the rest
# of the session. skipna=False keeps the original behaviour of yielding NaN
# when any field score is missing.
features['sum'] = features[sumFields].mean(axis=1, skipna=False)

# Keep only the pairs whose Year comparison scored 1.0.
features = features[features['Year'] == 1.0]

features