In [17]:
from gc_val import GC_Val
import pickle
import pandas as pd

In [23]:
# Constants for this run
max_dist = 15  # km
threshold = 0.7

# Validation object, constructed from the data CSV and the neighborhoods pickle
val = GC_Val("data.csv", "neighborhoods.pkl")

In [24]:
# Read in the false responses pickle file.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# "false_responses.pkl" must come from a trusted source.
# The `with` block closes the handle even if unpickling raises.
with open("false_responses.pkl", "rb") as file:
    false_resp = pickle.load(file)

# Number of false responses loaded (shown as the cell output)
len(false_resp)

3533

In [26]:
# Separate the false responses into the classifiers of interest, based on...
# Confidence
high_conf = []
mid_conf = []
low_conf = []

# Reason for falsity (the first failing check wins; an entry that passes all
# three checks is not placed in any of these lists)
addrLine_issue = []
admin_district = []
no_bldg = []

# API's calculation method
rooftop = []
interpol = []
interpol_offset = []

# Match codes: str(match_codes) -> number of false responses with that value
f_matches = {}

for ind in false_resp:
    # Look the confidence up once per entry rather than once per branch.
    conf = val.get_confidence(ind)
    if conf == 'High':
        high_conf.append(ind)
    elif conf == 'Medium':
        mid_conf.append(ind)
    elif conf == 'Low':
        low_conf.append(ind)

    # Expected values: ['Rooftop', 'InterpolationOffset', 'Interpolation']
    calc_method = val.get_calculation_method(ind)
    if calc_method == 'Rooftop':
        rooftop.append(ind)
    elif calc_method == 'InterpolationOffset':
        interpol_offset.append(ind)
    elif calc_method == 'Interpolation':
        interpol.append(ind)

    # The match codes are stringified so the whole combination can be a dict key.
    match_codes = val.get_match_codes(ind)
    code_key = str(match_codes)
    f_matches[code_key] = f_matches.get(code_key, 0) + 1

    # Finding out the reason for the false entry. The checks are evaluated in
    # order and later ones are only run when the earlier ones pass.
    if not val.has_addressLine(ind):
        addrLine_issue.append(ind)
    elif not val.same_admin_district(ind):
        admin_district.append(ind)
    elif not val.has_building_number(ind):
        no_bldg.append(ind)

print("Total faulty entries:", len(false_resp))
print("High conf:", len(high_conf))
print("Medium conf:", len(mid_conf))
print("Low conf:", len(low_conf))
print()
print("AddressLine issue:", len(addrLine_issue))
print("Admin district issue:", len(admin_district))
print("Missing building number:", len(no_bldg))
print()
print("Rooftop method:", len(rooftop))
print("Interpolation method:", len(interpol))
print("Interpolation_offset method:", len(interpol_offset))
display(f_matches)

Total faulty entries: 3533
High conf: 716
Medium conf: 2514
Low conf: 303

AddressLine issue: 1362
Admin district issue: 372
Missing building number: 1050

Rooftop method: 2160
Interpolation method: 992
Interpolation_offset method: 381


{"['UpHierarchy']": 1978,
 "['Ambiguous', 'UpHierarchy']": 189,
 "['Good']": 1240,
 "['Ambiguous']": 126}

In [21]:
## Counting the entire dataset based on...
# Confidence
high = 0
mid = 0
low = 0

# API's calculation method
r = 0
inter = 0
io = 0

# Match codes: str(match_codes) -> number of rows with that value
combs = {}

# Rows left out of every tally because processing them raised
skipped = 0

for i in range(len(val.df)):
    try:
        # All lookups happen before any counter is touched, so a row whose
        # lookup raises is skipped as a whole instead of being half-counted.
        conf = val.get_confidence(i)
        method = val.get_calculation_method(i)
        match_codes = val.get_match_codes(i)
        code_key = str(match_codes)

        if conf == "High":
            high += 1
        elif conf == "Medium":
            mid += 1
        elif conf == "Low":
            low += 1

        if method == 'Rooftop':
            r += 1
        elif method == 'Interpolation':
            inter += 1
        elif method == 'InterpolationOffset':
            io += 1

        combs[code_key] = combs.get(code_key, 0) + 1

    except Exception:
        # Best-effort: keep going, but remember that a row was dropped so the
        # totals below can be interpreted correctly.
        skipped += 1
        continue

print("High confidence:", high)
print("Medium confidence:", mid)
print("Low confidence:", low)
print()
print("Rooftop:", r)
print("Interpolation:", inter)
print("InterpolationOffset:", io)
print()
display(combs)
if skipped:
    print("Rows skipped because processing raised:", skipped)

High confidence: 96016
Medium confidence: 3627
Low confidence: 350

Rooftop: 96613
Interpolation: 992
InterpolationOffset: 2388



{"['Good']": 97066,
 "['UpHierarchy']": 1978,
 "['Ambiguous', 'UpHierarchy']": 189,
 "['Ambiguous']": 760}

In [27]:
# Outputting statistics
digits = 3  # Round to 3 digits


def _pct(part, whole):
    """Format ``part / whole`` as a percentage string rounded to ``digits``.

    Returns "n/a" instead of raising ZeroDivisionError when ``whole`` is zero,
    i.e. when the whole dataset has no rows in the category.
    """
    if not whole:
        return "n/a"
    return str(round(part / whole * 100, digits)) + "%"


print("Percentage of high-confidence faulty entries: " + _pct(len(high_conf), high))
print("Percentage of medium-confidence faulty entries: " + _pct(len(mid_conf), mid))
print("Percentage of low-confidence faulty entries: " + _pct(len(low_conf), low))
print("Total percentage of faulty entries: " + _pct(len(false_resp), len(val.df)))
print()
print("Percentage of rooftop faulty entries: " + _pct(len(rooftop), r))
print("Percentage of interpolation faulty entries: " + _pct(len(interpol), inter))
print("Percentage of interpolation_offset faulty entries: " + _pct(len(interpol_offset), io))
print()
for key in f_matches:
    # combs may lack a key seen in f_matches (e.g. if that row was skipped in
    # the whole-dataset count); treat it as a zero denominator, not a KeyError.
    percentage = _pct(f_matches[key], combs.get(key, 0))
    print(f"Percentage of faulty entries with match_code {key}: {percentage}")

Percentage of high-confidence faulty entries: 0.746%
Percentage of medium-confidence faulty entries: 69.313%
Percentage of low-confidence faulty entries: 86.571%
Total percentage of faulty entries: 3.533%

Percentage of rooftop faulty entries: 2.236%
Percentage of interpolation faulty entries: 100.0%
Percentage of interpolation_offset faulty entries: 15.955%

Percentage of faulty entries with match_code ['UpHierarchy']: 100.0%
Percentage of faulty entries with match_code ['Ambiguous', 'UpHierarchy']: 100.0%
Percentage of faulty entries with match_code ['Good']: 1.277%
Percentage of faulty entries with match_code ['Ambiguous']: 16.579%


In [29]:
# Spot-check a single entry by running val.main on index 3523 with debug=True.
# NOTE(review): 3523 is a hard-coded index with no stated reason; consider
# naming it or noting why this entry is of interest.
val.main(3523, debug=True)

Index: 3523


{'addressLine': '730 via lugano',
 'countryRegion': 'us',
 'locality': 'winter park',
 'adminDistrict2': 'orange',
 'adminDistrict': 'fl',
 'postalCode': '32789'}

{'adminDistrict2': 'orange county',
 'formattedAddress': '730 via lugano, winter park, fl 32789',
 'countryRegion': 'united states',
 'postalCode': '32789',
 'locality': 'winter park',
 'adminDistrict': 'fl',
 'addressLine': '730 via lugano'}

Returning true because of high confidence


True