# KNN Imputation: Terry Stops Dataset

In this notebook we evaluate how well KNN imputation performs on a dataset that is composed mostly of categorical data.

In [18]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from functions import cat_codes

In [19]:
# Read the cleaned Terry Stops data and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier save -- confirm).
t_stops = (
    pd.read_csv('datasets/terry_stops/eda_clean.csv')
      .drop(columns='Unnamed: 0')
)
t_stops.head()

Unnamed: 0,subject_age_group,subject_id,go_sc_num,terry_stop_id,stop_resolution,weapon_type,officer_id,officer_gender,officer_race,subject_perceived_race,subject_perceived_gender,initial_call_type,final_call_type,call_type,arrest_flag,frisk_flag,precinct,sector,beat,repeat_offenders,incident_year,incident_month,officer_age,field_contact,offense_report,dif_race,dif_gender,dif_race_gender,12am_6am,7am_12pm,1pm_6pm,7pm_11pm
0,Unknown,unassigned,20150000110760,33273,Offense Report,,6355,F,White,White,F,DIST - IP/JO - DV DIST - NO ASLT,"--DV - ARGUMENTS, DISTURBANCE (NO ARREST)",911,N,N,East,E,E1,N,2015,4,45,n,y,N,N,Y,0,1,0,0
1,Unknown,unassigned,20150000110760,33274,Offense Report,,6355,F,White,White,M,DIST - IP/JO - DV DIST - NO ASLT,"--DV - ARGUMENTS, DISTURBANCE (NO ARREST)",911,N,N,East,E,E1,N,2015,4,45,n,y,N,Y,Y,0,1,0,0
2,Unknown,unassigned,20150000120532,35462,Arrest,,7735,M,White,Not Specified,Unable to Determine,THEFT (DOES NOT INCLUDE SHOPLIFT OR SVCS),--THEFT - CAR PROWL,911,N,Y,East,C,C2,N,2015,4,24,n,n,Y,Y,Y,0,0,1,0
3,Unknown,unassigned,20150000127841,36993,Offense Report,,7474,F,White,Not Specified,Unable to Determine,"DISTURBANCE, MISCELLANEOUS/OTHER","--DV - ARGUMENTS, DISTURBANCE (NO ARREST)",911,N,N,East,C,C3,N,2015,4,46,n,y,Y,Y,Y,0,0,1,0
4,Unknown,unassigned,20150000145811,41041,Arrest,,4835,M,White,Not Specified,M,"SUSPICIOUS PERSON, VEHICLE OR INCIDENT",--SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON,ONVIEW,N,N,East,E,E1,N,2015,5,56,n,n,Y,N,Y,0,0,1,0


# Preprocessing
We need to drop columns that are unusable for imputation, such as unique IDs.

In [20]:
# Identifier columns are removed before the data is encoded and scaled.
id_columns = ['subject_id', 'go_sc_num', 'terry_stop_id', 'officer_id']
t_stops = t_stops.drop(columns=id_columns)
t_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30364 entries, 0 to 30363
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   subject_age_group         30364 non-null  object
 1   stop_resolution           30364 non-null  object
 2   weapon_type               30364 non-null  object
 3   officer_gender            30364 non-null  object
 4   officer_race              30364 non-null  object
 5   subject_perceived_race    30364 non-null  object
 6   subject_perceived_gender  30364 non-null  object
 7   initial_call_type         30364 non-null  object
 8   final_call_type           30364 non-null  object
 9   call_type                 30364 non-null  object
 10  arrest_flag               30364 non-null  object
 11  frisk_flag                30364 non-null  object
 12  precinct                  30364 non-null  object
 13  sector                    30364 non-null  object
 14  beat                  

In [21]:
# Label-encode every column with the project helper `cat_codes`.
# NOTE(review): the helper's return value is not assigned, and info() reports
# integer dtypes afterwards, so it appears to modify t_stops in place -- confirm.
columns_to_encode = t_stops.columns
cat_codes(t_stops, columns_to_encode)
t_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30364 entries, 0 to 30363
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   subject_age_group         30364 non-null  int8 
 1   stop_resolution           30364 non-null  int8 
 2   weapon_type               30364 non-null  int8 
 3   officer_gender            30364 non-null  int8 
 4   officer_race              30364 non-null  int8 
 5   subject_perceived_race    30364 non-null  int8 
 6   subject_perceived_gender  30364 non-null  int8 
 7   initial_call_type         30364 non-null  int16
 8   final_call_type           30364 non-null  int16
 9   call_type                 30364 non-null  int8 
 10  arrest_flag               30364 non-null  int8 
 11  frisk_flag                30364 non-null  int8 
 12  precinct                  30364 non-null  int8 
 13  sector                    30364 non-null  int8 
 14  beat                      30364 non-nu

In [22]:
# Work on an independent (deep) copy so t_stops keeps the encoded originals
stops_exp = t_stops.copy(deep=True)

In [23]:
# Min-max scale every column of the experiment frame
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Fit and transform in one pass, then restore the column labels
scaled_values = scaler.fit_transform(stops_exp)
scaled_stops = pd.DataFrame(scaled_values, columns=stops_exp.columns)

# Keep an untouched scaled copy for later use
scaled_copy = scaled_stops.copy()

scaled_stops.head()

Unnamed: 0,subject_age_group,stop_resolution,weapon_type,officer_gender,officer_race,subject_perceived_race,subject_perceived_gender,initial_call_type,final_call_type,call_type,arrest_flag,frisk_flag,precinct,sector,beat,repeat_offenders,incident_year,incident_month,officer_age,field_contact,offense_report,dif_race,dif_gender,dif_race_gender,12am_6am,7am_12pm,1pm_6pm,7pm_11pm
0,1.0,0.75,0.6,0.0,1.0,1.0,0.0,0.290123,0.222222,0.0,0.0,0.0,0.0,0.205882,0.186275,0.0,0.0,0.272727,0.510638,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.75,0.6,0.0,1.0,1.0,0.5,0.290123,0.222222,0.0,0.0,0.0,0.0,0.205882,0.186275,0.0,0.0,0.272727,0.510638,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.6,1.0,1.0,0.714286,1.0,0.845679,0.494949,0.0,0.0,1.0,0.0,0.088235,0.088235,0.0,0.0,0.272727,0.06383,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.75,0.6,0.0,1.0,0.714286,1.0,0.296296,0.222222,0.0,0.0,0.0,0.0,0.088235,0.107843,0.0,0.0,0.272727,0.531915,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.6,1.0,1.0,0.714286,0.5,0.833333,0.469697,0.666667,0.0,0.0,0.0,0.205882,0.186275,0.0,0.0,0.363636,0.744681,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [24]:
# Seed NumPy's global RNG for reproducibility
np.random.seed(17)

# Feature frame: everything except the target column
feats = scaled_stops.drop(columns='arrest_flag')

# Names of the feature columns
features = feats.columns.tolist()

# Blank out a random subset of rows in each feature column.
# For every column, 0.2 * n_rows row labels are drawn WITH replacement, so
# repeated draws collapse onto the same cell and slightly fewer than 20% of
# the column's cells end up missing.
for col in features:
    drawn_rows = scaled_stops.sample(frac=0.2, replace=True).index
    scaled_stops.loc[drawn_rows, col] = np.nan

scaled_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30364 entries, 0 to 30363
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   subject_age_group         24871 non-null  float64
 1   stop_resolution           24880 non-null  float64
 2   weapon_type               24853 non-null  float64
 3   officer_gender            24844 non-null  float64
 4   officer_race              24862 non-null  float64
 5   subject_perceived_race    24833 non-null  float64
 6   subject_perceived_gender  24866 non-null  float64
 7   initial_call_type         24876 non-null  float64
 8   final_call_type           24868 non-null  float64
 9   call_type                 24888 non-null  float64
 10  arrest_flag               30364 non-null  float64
 11  frisk_flag                24874 non-null  float64
 12  precinct                  24859 non-null  float64
 13  sector                    24833 non-null  float64
 14  beat  

In [25]:
# Dimensions of the experiment frame
n_rows, n_cols = scaled_stops.shape

# Count of feature cells (the target column is excluded)
num_vals = n_rows * (n_cols - 1)
print(f'The dataset (without target) has a total of {num_vals} values')

# NaNs per column, then summed over all columns
nan_per_column = scaled_stops.isna().sum()
num_nan = nan_per_column.sum()
print(f'There are {num_nan} NaN values')

# Share of feature cells that are missing
percent_nan = (num_nan / num_vals) * 100
print(f'{round(percent_nan, 2)}% of the dataset is missing')

# Rows with at least one missing feature value.
# (Despite its name, nan_cols holds ROWS; the name is kept for later cells.)
row_has_nan = scaled_stops[features].isna().any(axis=1)
nan_cols = scaled_stops.loc[row_has_nan, features]
nan_rows = len(nan_cols)
print(f'There are {nan_rows} rows with missing values')

# Share of rows affected by missing data
total_missing = (nan_rows / n_rows) * 100
print(f'{round(total_missing, 2)}% of the rows contain missing values')

The dataset (without target) has a total of 819828 values
There are 148580 NaN values
18.12% of the dataset is missing
There are 30222 rows with missing values
99.53% of the rows contain missing values


In [26]:
# Row labels of every record that had at least one value masked
null_idx = nan_cols.index.tolist()

# Answer key: the unscaled, label-encoded originals for those same rows.
# iloc treats the labels as positions; that lines up here because the frames
# carry a default 0..n-1 RangeIndex.
answer_key = t_stops.iloc[null_idx]
answer_key.head()

Unnamed: 0,subject_age_group,stop_resolution,weapon_type,officer_gender,officer_race,subject_perceived_race,subject_perceived_gender,initial_call_type,final_call_type,call_type,arrest_flag,frisk_flag,precinct,sector,beat,repeat_offenders,incident_year,incident_month,officer_age,field_contact,offense_report,dif_race,dif_gender,dif_race_gender,12am_6am,7am_12pm,1pm_6pm,7pm_11pm
0,6,3,3,0,7,7,0,47,44,0,0,0,0,7,19,0,0,3,24,0,1,0,0,0,0,1,0,0
1,6,3,3,0,7,7,1,47,44,0,0,0,0,7,19,0,0,3,24,0,1,0,1,0,0,1,0,0
2,6,0,3,1,7,5,2,137,98,0,0,1,0,3,9,0,0,3,3,0,0,1,1,0,0,0,1,0
3,6,3,3,0,7,5,2,48,44,0,0,0,0,3,11,0,0,3,25,0,1,1,1,0,0,0,1,0
4,6,0,3,1,7,5,1,135,93,2,0,0,0,7,19,0,0,4,35,0,0,1,0,0,0,0,1,0


# KNN Imputation

In [27]:
from sklearn.impute import KNNImputer

# Imputer that fills each missing value from the 5 nearest rows
impute = KNNImputer(n_neighbors=5)

# Impute, then rebuild a DataFrame with the original column labels
imputed_values = impute.fit_transform(scaled_stops)
knn_stops = pd.DataFrame(imputed_values, columns=scaled_stops.columns)

knn_stops

Unnamed: 0,subject_age_group,stop_resolution,weapon_type,officer_gender,officer_race,subject_perceived_race,subject_perceived_gender,initial_call_type,final_call_type,call_type,arrest_flag,frisk_flag,precinct,sector,beat,repeat_offenders,incident_year,incident_month,officer_age,field_contact,offense_report,dif_race,dif_gender,dif_race_gender,12am_6am,7am_12pm,1pm_6pm,7pm_11pm
0,1.000000,0.75,0.6,0.0,1.000000,1.000000,0.0,0.290123,0.222222,0.133333,0.0,0.0,0.000000,0.205882,0.186275,0.0,0.00,0.272727,0.510638,0.0,1.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0
1,1.000000,0.75,0.6,0.0,1.000000,1.000000,0.3,0.290123,0.222222,0.000000,0.0,0.0,0.000000,0.135294,0.186275,0.0,0.00,0.272727,0.510638,0.0,1.0,0.0,0.4,0.0,0.0,0.8,0.0,0.0
2,0.333333,0.15,0.6,1.0,1.000000,0.714286,1.0,0.845679,0.313131,0.400000,0.0,1.0,0.000000,0.088235,0.088235,0.0,0.00,0.709091,0.063830,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.000000,0.60,0.6,0.0,1.000000,0.428571,1.0,0.296296,0.389899,0.000000,0.0,0.0,0.000000,0.088235,0.107843,0.0,0.00,0.272727,0.531915,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.433333,0.00,0.6,1.0,1.000000,0.714286,0.5,0.875309,0.469697,0.666667,0.0,0.4,0.000000,0.205882,0.186275,0.0,0.00,0.363636,0.744681,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30359,0.833333,0.00,0.2,1.0,0.000000,0.600000,0.5,0.519753,0.060606,0.000000,1.0,0.8,0.000000,0.294118,0.364706,1.0,1.00,0.545455,0.106383,0.0,0.0,1.0,0.2,0.0,0.0,0.0,0.0,1.0
30360,0.833333,0.45,0.4,1.0,0.428571,0.142857,0.5,0.345679,0.191919,0.000000,0.0,1.0,0.466667,0.411765,0.450980,1.0,0.56,0.636364,0.340426,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
30361,0.833333,0.65,1.0,1.0,1.000000,0.000000,0.5,0.428395,0.171717,0.000000,0.0,0.0,0.833333,0.411765,0.450980,1.0,0.84,0.636364,0.276596,0.6,0.2,1.0,0.0,0.0,0.0,1.0,0.2,0.0
30362,0.833333,0.00,1.0,1.0,1.000000,1.000000,0.5,0.104938,0.070707,0.000000,1.0,0.0,0.000000,0.058824,0.078431,1.0,1.00,0.636364,0.255319,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.4,0.0


In [28]:
# Undo the min-max scaling so imputed values are back on the label-code scale
unscaled_values = scaler.inverse_transform(knn_stops)
inverse_knn_stops = pd.DataFrame(unscaled_values, columns=knn_stops.columns)
inverse_knn_stops.head()

Unnamed: 0,subject_age_group,stop_resolution,weapon_type,officer_gender,officer_race,subject_perceived_race,subject_perceived_gender,initial_call_type,final_call_type,call_type,arrest_flag,frisk_flag,precinct,sector,beat,repeat_offenders,incident_year,incident_month,officer_age,field_contact,offense_report,dif_race,dif_gender,dif_race_gender,12am_6am,7am_12pm,1pm_6pm,7pm_11pm
0,6.0,3.0,3.0,0.0,7.0,7.0,0.0,47.0,44.0,0.4,0.0,0.0,0.0,7.0,19.0,0.0,0.0,3.0,24.0,0.0,1.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0
1,6.0,3.0,3.0,0.0,7.0,7.0,0.6,47.0,44.0,0.0,0.0,0.0,0.0,4.6,19.0,0.0,0.0,3.0,24.0,0.0,1.0,0.0,0.4,0.0,0.0,0.8,0.0,0.0
2,2.0,0.6,3.0,1.0,7.0,5.0,2.0,137.0,62.0,1.2,0.0,1.0,0.0,3.0,9.0,0.0,0.0,7.8,3.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,6.0,2.4,3.0,0.0,7.0,3.0,2.0,48.0,77.2,0.0,0.0,0.0,0.0,3.0,11.0,0.0,0.0,3.0,25.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,2.6,0.0,3.0,1.0,7.0,5.0,1.0,141.8,93.0,2.0,0.0,0.4,0.0,7.0,19.0,0.0,0.0,4.0,35.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


Since the features are label-encoded categories, the imputed values must be whole numbers. KNN imputation averages neighboring values and therefore produces floats, so we round every feature to the nearest whole value.

In [29]:
# Round every feature column to the nearest whole label code in one
# vectorized step and assign the result back to the frame explicitly.
# Element-wise chained assignment (df[col][i] = value) is avoided on purpose:
# it is very slow on ~30k rows x 27 columns, and pandas may apply the write to
# a temporary copy instead of the frame (always, under Copy-on-Write).
inverse_knn_stops[features] = inverse_knn_stops[features].round(0)

inverse_knn_stops.head()

Unnamed: 0,subject_age_group,stop_resolution,weapon_type,officer_gender,officer_race,subject_perceived_race,subject_perceived_gender,initial_call_type,final_call_type,call_type,arrest_flag,frisk_flag,precinct,sector,beat,repeat_offenders,incident_year,incident_month,officer_age,field_contact,offense_report,dif_race,dif_gender,dif_race_gender,12am_6am,7am_12pm,1pm_6pm,7pm_11pm
0,6.0,3.0,3.0,0.0,7.0,7.0,0.0,47.0,44.0,0.0,0.0,0.0,0.0,7.0,19.0,0.0,0.0,3.0,24.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,6.0,3.0,3.0,0.0,7.0,7.0,1.0,47.0,44.0,0.0,0.0,0.0,0.0,5.0,19.0,0.0,0.0,3.0,24.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2.0,1.0,3.0,1.0,7.0,5.0,2.0,137.0,62.0,1.0,0.0,1.0,0.0,3.0,9.0,0.0,0.0,8.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,6.0,2.0,3.0,0.0,7.0,3.0,2.0,48.0,77.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0,0.0,3.0,25.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,3.0,0.0,3.0,1.0,7.0,5.0,1.0,142.0,93.0,2.0,0.0,0.0,0.0,7.0,19.0,0.0,0.0,4.0,35.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


Next, we subset the imputed data to the rows contained in our answer key so that the two can be compared.

In [30]:
# Keep only the rows that appear in the answer key (same positions, same order)
test_stops = inverse_knn_stops.iloc[null_idx, :]
test_stops.head()

Unnamed: 0,subject_age_group,stop_resolution,weapon_type,officer_gender,officer_race,subject_perceived_race,subject_perceived_gender,initial_call_type,final_call_type,call_type,arrest_flag,frisk_flag,precinct,sector,beat,repeat_offenders,incident_year,incident_month,officer_age,field_contact,offense_report,dif_race,dif_gender,dif_race_gender,12am_6am,7am_12pm,1pm_6pm,7pm_11pm
0,6.0,3.0,3.0,0.0,7.0,7.0,0.0,47.0,44.0,0.0,0.0,0.0,0.0,7.0,19.0,0.0,0.0,3.0,24.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,6.0,3.0,3.0,0.0,7.0,7.0,1.0,47.0,44.0,0.0,0.0,0.0,0.0,5.0,19.0,0.0,0.0,3.0,24.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2.0,1.0,3.0,1.0,7.0,5.0,2.0,137.0,62.0,1.0,0.0,1.0,0.0,3.0,9.0,0.0,0.0,8.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,6.0,2.0,3.0,0.0,7.0,3.0,2.0,48.0,77.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0,0.0,3.0,25.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,3.0,0.0,3.0,1.0,7.0,5.0,1.0,142.0,93.0,2.0,0.0,0.0,0.0,7.0,19.0,0.0,0.0,4.0,35.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [31]:
# Give test_stops and answer_key a fresh 0..n-1 index and remove the target
# column, so the two frames line up row-for-row and column-for-column
test_stops = test_stops.reset_index(drop=True).drop(columns='arrest_flag')
answer_key = answer_key.reset_index(drop=True).drop(columns='arrest_flag')

# Element-wise difference: true value minus imputed value
results = answer_key.sub(test_stops)

results.head()

Unnamed: 0,subject_age_group,stop_resolution,weapon_type,officer_gender,officer_race,subject_perceived_race,subject_perceived_gender,initial_call_type,final_call_type,call_type,frisk_flag,precinct,sector,beat,repeat_offenders,incident_year,incident_month,officer_age,field_contact,offense_report,dif_race,dif_gender,dif_race_gender,12am_6am,7am_12pm,1pm_6pm,7pm_11pm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,4.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,-33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,-7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


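A difference of zero appears both for values that were imputed perfectly and for values that were never removed in the first place, so zeros alone cannot tell us how many imputations were correct. Instead, we count the non-zero differences (the imputation errors) and subtract that count from the total number of imputed values to obtain the number of perfect imputations.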

In [32]:
# Imputes where y - y_hat != 0.
# 0.0 and -0.0 compare equal, so a single `!= 0` test covers both signs;
# the comparison is vectorized over the whole frame instead of looping
# cell by cell in Python.
errors = int((results != 0).to_numpy().sum())

# Imputes where y - y_hat == 0.
# Cells that were never masked also differ by 0, so the perfect imputations
# are obtained by subtracting the errors from the number of imputed values.
perfect_imputes = num_nan - errors

print(f'Total Values Imputed: {num_nan}')
print(f'Number of Errors: {errors}')
print(f'Perfect Imputations: {perfect_imputes}')

Total Values Imputed: 148580
Number of Errors: 66576
Perfect Imputations: 82004
