In [1]:
# Goal: determine whether the available parameters can be used to predict a match.
# - Take the 100,000 entries and split them into training and test samples.
# - Use Keras (or a similar library) to build a random forest classification model with:
#     Parameters: calculation_method, address line ratio, geocoord distance, confidence, match codes
#     Output: 0 (match) or 1 (no match)

In [2]:
# Necessary imports
import pandas as pd
import numpy as np
import pickle

from tqdm import notebook

import sys
# Base directory used both as a module search path and as a prefix for data files.
# A raw string cannot end in a single backslash, so this literal ends with TWO
# backslashes; every `PATH + "name"` below therefore contains a doubled separator.
# NOTE(review): hardcoded, user-specific absolute path - not portable to other machines.
PATH = r"""C:\Users\andrew.tan\Jupyter Notebooks\Geocoding_Validation\\"""
# Put PATH first on the module search path - presumably so the local `gc_val`
# module imported below is found there.
sys.path.insert(0, PATH)

from gc_val import GC_Val

# Construct the GC_Val helper from the CSV data file and the neighborhoods pickle
# that live under PATH (what GC_Val does with them is defined in gc_val).
val = GC_Val(PATH + "data.csv", PATH + "neighborhoods.pkl")

# Load the object stored in false_responses.pkl. The `with` block closes the
# file on exit, so no explicit close() is needed.
# NOTE(review): pickle.load can execute arbitrary code - only load pickles
# produced by this project.
with open(PATH + "false_responses.pkl", "rb") as f:
    false_responses = pickle.load(f)

In [3]:
# Load in all of the necessary data
# The context manager guarantees the handle is closed even if unpickling fails.
# NOTE(review): this path is relative to the kernel's working directory, unlike
# the PATH-prefixed files loaded earlier - confirm that is intended.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("formatted_data.pkl", "rb") as f:
    X = pickle.load(f)

In [9]:
# Count the rows whose 'expected' label equals 1 (reported as "false responses").
false_count = int(X['expected'].eq(1).sum())
print("Number of false responses:", false_count)

Number of false responses: 3333


In [30]:
# Rows labelled expected == 0 whose distance is neither 0 nor -1
# (-1 is presumably a "no distance available" sentinel - TODO confirm).
X.loc[(X['expected'] == 0) & (X['distances'] != 0) & (X['distances'] != -1)]

Unnamed: 0,index,ratio,calculation_method,confidence,match_codes,expected,distances
6,6.0,1.000000,0.0,0.0,2.0,0.0,0.627982
25,25.0,1.000000,0.0,0.0,2.0,0.0,0.236718
91,91.0,1.000000,0.0,0.0,2.0,0.0,7.672865
120,120.0,1.000000,0.0,0.0,2.0,0.0,3.063276
132,132.0,1.000000,0.0,0.0,2.0,0.0,3.807812
...,...,...,...,...,...,...,...
99693,99693.0,0.863636,0.0,0.0,2.0,0.0,7.418322
99742,99742.0,0.863636,0.0,0.0,2.0,0.0,7.418322
99829,99829.0,1.000000,0.0,0.0,2.0,0.0,11.344488
99843,99843.0,1.000000,0.0,0.0,2.0,0.0,0.490437


In [49]:
def get_training(df, random_state=None):
    """Build a class-balanced training frame from ``df``.

    The frame is shuffled, then split into:

    * "invalid" rows: ``expected == 1``
    * preferred "valid" rows: ``expected != 1`` with a ``distances`` value
      that is neither ``0`` nor ``-1``

    As many valid rows as there are invalid rows are kept. If there are not
    enough preferred valid rows, the shortfall is filled with other
    ``expected != 1`` rows (those whose distance is ``0`` or ``-1``). If the
    frame still does not hold enough valid rows, all of them are used and the
    result is as balanced as the data allows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'expected'`` and ``'distances'``.
    random_state : int, numpy Generator/RandomState or None, optional
        Forwarded to ``DataFrame.sample`` so the shuffle can be reproduced.
        ``None`` (the default) keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The selected valid rows followed by all invalid rows. The two groups
        are NOT interleaved; shuffle the result if order matters.
    """
    shuffled = df.sample(frac=1, random_state=random_state)

    # Build every mask from the shuffled frame itself so no index alignment
    # against the caller's frame is needed.
    is_invalid = shuffled['expected'] == 1
    has_distance = ~shuffled['distances'].isin([0, -1])

    df_invalid = shuffled.loc[is_invalid]
    df_valid = shuffled.loc[~is_invalid & has_distance]

    shortfall = len(df_invalid) - len(df_valid)
    if shortfall > 0:
        # The previous row-by-row loop iterated over column names and never
        # added anything; top up in one vectorised step instead.
        filler = shuffled.loc[~is_invalid & ~has_distance].iloc[:shortfall]
        df_valid = pd.concat([df_valid, filler])

    return pd.concat([df_valid.iloc[:len(df_invalid)], df_invalid])
                    

# get_training returns the valid rows followed by the invalid rows, so shuffle
# once more to interleave the two classes.
# NOTE(review): neither shuffle is seeded, so X_train differs on every run.
X_train = get_training(X).sample(frac=1)
# Preview the first rows of the training frame.
X_train.head()

Unnamed: 0,index,ratio,calculation_method,confidence,match_codes,expected,distances
70347,70347.0,0.0,0.0,1.0,0.0,1.0,0.0
11536,11536.0,1.0,0.0,1.0,2.0,0.0,8.725244
91754,91754.0,0.0,0.0,1.0,0.0,1.0,0.0
96594,96594.0,0.508475,0.0,0.0,2.0,1.0,0.0
44292,44292.0,0.8,0.0,1.0,2.0,0.0,12.417704


In [20]:
# Per-column count of missing values in the training frame.
X_train.isna().sum()

index                 0
ratio                 0
calculation_method    0
confidence            0
match_codes           0
expected              0
distances             0
dtype: int64

In [45]:
# Scaling the data
# Standardise the 'ratio' and 'distances' columns with a StandardScaler that is
# fitted on X_train itself.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Only these columns are rescaled; every other column is left as-is.
columns_to_scale = ['ratio', 'distances']
# NOTE(review): this overwrites the two columns of X_train in place, so the
# unscaled values are no longer available from X_train after this cell runs.
X_train[columns_to_scale] = sc.fit_transform(X_train[columns_to_scale])
# Display the scaled frame (the notebook truncates the rendering).
X_train


Unnamed: 0,index,ratio,calculation_method,confidence,match_codes,expected,distances
45320,45320.0,-0.088218,0.0,0.0,2.0,1.0,-0.104546
92393,92393.0,0.258651,2.0,1.0,0.0,1.0,-0.104546
62578,62578.0,0.778954,0.0,0.0,2.0,0.0,-0.104546
32992,32992.0,0.291170,2.0,1.0,0.0,1.0,-0.104546
97414,97414.0,0.305951,1.0,0.0,2.0,0.0,-0.104546
...,...,...,...,...,...,...,...
42613,42613.0,0.778954,0.0,0.0,2.0,0.0,-0.104546
41009,41009.0,-1.822562,0.0,1.0,0.0,1.0,-0.104546
90375,90375.0,-0.270781,0.0,0.0,2.0,1.0,-0.104546
70325,70325.0,0.778954,0.0,0.0,2.0,0.0,-0.104546
