In [1]:
# Goal: determine whether the parameters below can be used to predict match validity.
# - Take the 100,000 entries and separate them into training and test samples.
# - Use keras or a similar library to build a random forest classification model with:
#     Parameters: calculation_method, address line ratio, geocoord distance, confidence, match codes
#     Output: 0 or 1 (match or no match)

In [2]:
# Necessary imports
import pandas as pd
import numpy as np
import pickle

from tqdm import notebook

import sys
# Absolute folder that is prepended to the data file names used below.
# A raw string literal cannot end in a single backslash, so the literal ends
# in a doubled one; PATH therefore ends with two backslash characters.
# NOTE(review): hardcoded, user-specific location -- the notebook only runs
# on a machine that has this exact folder.
PATH = r"""C:\Users\andrew.tan\Jupyter Notebooks\Geocoding_Validation\\"""
# Put the folder first on the import search path; presumably this is what
# lets the `gc_val` import that follows resolve -- confirm.
sys.path.insert(0, PATH)

from gc_val import GC_Val

# Build the validation helper from the CSV data file and the neighborhoods pickle.
val = GC_Val(PATH + "data.csv", PATH + "neighborhoods.pkl")

# Load the collection of row positions that the feature-extraction loop treats
# as false (non-matching) entries.
# `false_responses` is deliberately not pre-initialised to an empty list: if the
# load fails, later cells should stop with a NameError instead of silently
# labelling every row as valid.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(PATH + "false_responses.pkl", "rb") as f:
    false_responses = pickle.load(f)

In [39]:
# Retrieve the desired features from data
#
# Builds X with one row per row of val.df:
#   index              - row position in val.df
#   ratio              - val.get_ratio(i), or 0 when that raises KeyError
#   calculation_method - integer code, see METHOD_CODES
#   conf               - integer code, see CONFIDENCE_CODES
#   match_codes        - integer code, see MATCH_CODE_CODES
#   expected           - 1 if the row is listed in false_responses, else 0
# Categories that are not in the lookup tables are stored as missing values.
FEATURE_COLUMNS = ['index', 'ratio', 'calculation_method', 'conf', 'match_codes', 'expected']

# ['Rooftop', 'InterpolationOffset', 'Interpolation'] = [0, 1, 2]
METHOD_CODES = {'Rooftop': 0, 'InterpolationOffset': 1, 'Interpolation': 2}

# ['High', 'Medium', 'Low'] = [0, 1, 2]
CONFIDENCE_CODES = {'High': 0, 'Medium': 1, 'Low': 2}

# Keyed by the string form of the match-code list.
# [['UpHierarchy'], ['Ambiguous', 'UpHierarchy'], ['Good'], ['Ambiguous']] = [0, 1, 2, 3]
MATCH_CODE_CODES = {
    "['UpHierarchy']": 0,
    "['Ambiguous', 'UpHierarchy']": 1,
    "['Good']": 2,
    "['Ambiguous']": 3,
}


def _encode(value, table):
    """Return the integer code for `value` from `table`, or None if unknown."""
    try:
        return table.get(value)
    except TypeError:
        # Unhashable values cannot be known categories.
        return None


# Not populated in this cell; kept so the name stays defined.
y = pd.DataFrame(columns=['validity'])

# Membership is tested once per row, so use a set when the container is a plain
# list/tuple (a linear scan per row would make the loop quadratic). Any other
# container type is used unchanged so its own `in` semantics are preserved.
if isinstance(false_responses, (list, tuple)):
    false_lookup = set(false_responses)
else:
    false_lookup = false_responses

# Collect plain Python rows and build the DataFrame once at the end; assigning
# X.loc[i] row by row re-allocates the frame on every iteration.
rows = []
for i in notebook.tqdm(range(len(val.df))):
    # Ratio: fall back to 0 when the lookup raises KeyError.
    try:
        ratio = val.get_ratio(i)
    except KeyError:
        ratio = 0

    method = _encode(val.get_calculation_method(i), METHOD_CODES)

    # Coordinate distance is not computed in this loop.

    conf = _encode(val.get_confidence(i), CONFIDENCE_CODES)
    code = _encode(str(val.get_match_codes(i)), MATCH_CODE_CODES)

    # 0 for valid and 1 for false entries
    expected = 1 if i in false_lookup else 0

    rows.append([i, ratio, method, conf, code, expected])

X = pd.DataFrame(rows, columns=FEATURE_COLUMNS)
    
    
    

  0%|          | 0/99993 [00:00<?, ?it/s]

In [None]:
# Add in the geocoord distance as a feature
# Computes one distance per row of val.df, in row order, and keeps all of them
# in `geo_distances` (previously each value overwrote the last and was lost).
# TODO: attach `geo_distances` to the feature frame once it is confirmed that
# the frame has exactly one row per row of val.df.
geo_distances = []
for i in range(len(val.df)):  # iterate over val.df, the frame the feature loop uses
    # NOTE(review): the commented-out distance call in the feature loop uses
    # `val.get_addresses(i)`; confirm which accessor name GC_Val actually has.
    given, resp = val.get_address(i)
    geo_distances.append(val.get_distance(given, resp))

In [25]:
# Save the training data into a pkl file
# The context manager closes the file even if pickling raises, so a failed
# dump does not leave an open handle on a partially written file.
with open("formatted_data.pkl", "wb") as file:
    pickle.dump(X, file)

In [26]:
# Load the training data
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("formatted_data.pkl", "rb") as file:
    X = pickle.load(file)
X.head()

Unnamed: 0,index,ratio,calculation_method,conf,match_codes,pred
0,0.0,1.0,0.0,0.0,2.0,0
1,1.0,0.96,0.0,0.0,2.0,0
2,2.0,0.933333,0.0,0.0,2.0,0
3,3.0,0.956522,0.0,0.0,2.0,0
4,4.0,1.0,0.0,0.0,2.0,0
