# KNN Imputation: Terry Stops Dataset

Next, we evaluate how well KNN imputation performs on a dataset that is composed mostly of categorical data.

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from functions import cat_codes

In [2]:
# Read the cleaned Terry stops data and discard the 'Unnamed: 0' column
# (presumably a leftover saved index -- confirm against the CSV).
t_stops = (
    pd.read_csv('datasets/terry_stops/eda_clean.csv')
    .drop(columns='Unnamed: 0')
)
t_stops.head()

Unnamed: 0,subject_age_group,subject_id,go_sc_num,terry_stop_id,stop_resolution,weapon_type,officer_id,officer_gender,officer_race,subject_perceived_race,...,officer_age,field_contact,offense_report,dif_race,dif_gender,dif_race_gender,12am_6am,7am_12pm,1pm_6pm,7pm_11pm
0,Unknown,unassigned,20150000110760,33273,Offense Report,,6355,F,White,White,...,45,n,y,N,N,Y,0,1,0,0
1,Unknown,unassigned,20150000110760,33274,Offense Report,,6355,F,White,White,...,45,n,y,N,Y,Y,0,1,0,0
2,Unknown,unassigned,20150000120532,35462,Arrest,,7735,M,White,Not Specified,...,24,n,n,Y,Y,Y,0,0,1,0
3,Unknown,unassigned,20150000127841,36993,Offense Report,,7474,F,White,Not Specified,...,46,n,y,Y,Y,Y,0,0,1,0
4,Unknown,unassigned,20150000145811,41041,Arrest,,4835,M,White,Not Specified,...,56,n,n,Y,N,Y,0,0,1,0


# Preprocessing
We need to drop columns that are unusable as features, such as unique IDs.

In [3]:
# Remove the identifier columns, then list the remaining columns and dtypes.
t_stops = t_stops.drop(
    columns=['subject_id', 'go_sc_num', 'terry_stop_id', 'officer_id']
)
t_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30364 entries, 0 to 30363
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   subject_age_group         30364 non-null  object
 1   stop_resolution           30364 non-null  object
 2   weapon_type               30364 non-null  object
 3   officer_gender            30364 non-null  object
 4   officer_race              30364 non-null  object
 5   subject_perceived_race    30364 non-null  object
 6   subject_perceived_gender  30364 non-null  object
 7   initial_call_type         30364 non-null  object
 8   final_call_type           30364 non-null  object
 9   call_type                 30364 non-null  object
 10  arrest_flag               30364 non-null  object
 11  frisk_flag                30364 non-null  object
 12  precinct                  30364 non-null  object
 13  sector                    30364 non-null  object
 14  beat                  

In [4]:
# Label encoding
# `cat_codes` is a project helper imported from the local `functions` module
# (its source is not shown here). Its return value is not used, so it
# presumably modifies `t_stops` in place -- confirm against the helper.
cat_codes(t_stops, t_stops.columns)
# Re-inspect the dtypes to check the result of the encoding step.
t_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30364 entries, 0 to 30363
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   subject_age_group         30364 non-null  int8 
 1   stop_resolution           30364 non-null  int8 
 2   weapon_type               30364 non-null  int8 
 3   officer_gender            30364 non-null  int8 
 4   officer_race              30364 non-null  int8 
 5   subject_perceived_race    30364 non-null  int8 
 6   subject_perceived_gender  30364 non-null  int8 
 7   initial_call_type         30364 non-null  int16
 8   final_call_type           30364 non-null  int16
 9   call_type                 30364 non-null  int8 
 10  arrest_flag               30364 non-null  int8 
 11  frisk_flag                30364 non-null  int8 
 12  precinct                  30364 non-null  int8 
 13  sector                    30364 non-null  int8 
 14  beat                      30364 non-nu

In [5]:
# Run the experiment on an independent (deep) copy of the encoded frame
stops_exp = t_stops.copy(deep=True)

In [6]:
# Min-max scale every column of the experiment set
from sklearn.preprocessing import MinMaxScaler

# Fit first, then transform; `scaler` stays available for later cells
scaler = MinMaxScaler()
scaler.fit(stops_exp)

# The scaler returns a bare array, so re-attach the column names
scaled_stops = pd.DataFrame(
    scaler.transform(stops_exp),
    columns=stops_exp.columns,
)

# Keep an untouched scaled copy for later use
scaled_copy = scaled_stops.copy(deep=True)

scaled_stops.head()

Unnamed: 0,subject_age_group,stop_resolution,weapon_type,officer_gender,officer_race,subject_perceived_race,subject_perceived_gender,initial_call_type,final_call_type,call_type,...,officer_age,field_contact,offense_report,dif_race,dif_gender,dif_race_gender,12am_6am,7am_12pm,1pm_6pm,7pm_11pm
0,1.0,0.75,0.6,0.0,1.0,1.0,0.0,0.290123,0.222222,0.0,...,0.510638,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.75,0.6,0.0,1.0,1.0,0.5,0.290123,0.222222,0.0,...,0.510638,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.6,1.0,1.0,0.714286,1.0,0.845679,0.494949,0.0,...,0.06383,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.75,0.6,0.0,1.0,0.714286,1.0,0.296296,0.222222,0.0,...,0.531915,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.6,1.0,1.0,0.714286,0.5,0.833333,0.469697,0.666667,...,0.744681,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Seed NumPy's global RNG so the sampled rows are reproducible
np.random.seed(17)

# Feature frame: every column except the target
feats = scaled_stops.drop(columns='arrest_flag')

# Names of the feature columns
features = feats.columns.tolist()

# Blank out a random subset of rows in each feature column of the experiment set
for col in features:
    # Draw 20% of the row count *with replacement*. Repeated draws of the same
    # row collapse into one NaN, so each column ends up with somewhat fewer
    # than 20% of its rows missing (see the non-null counts printed below).
    drawn_rows = scaled_stops.sample(frac=0.2, replace=True).index
    scaled_stops.loc[drawn_rows, col] = np.nan

scaled_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30364 entries, 0 to 30363
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   subject_age_group         24871 non-null  float64
 1   stop_resolution           24880 non-null  float64
 2   weapon_type               24853 non-null  float64
 3   officer_gender            24844 non-null  float64
 4   officer_race              24862 non-null  float64
 5   subject_perceived_race    24833 non-null  float64
 6   subject_perceived_gender  24866 non-null  float64
 7   initial_call_type         24876 non-null  float64
 8   final_call_type           24868 non-null  float64
 9   call_type                 24888 non-null  float64
 10  arrest_flag               30364 non-null  float64
 11  frisk_flag                24874 non-null  float64
 12  precinct                  24859 non-null  float64
 13  sector                    24833 non-null  float64
 14  beat  

In [8]:
# Number of rows in the experiment set
n_rows = scaled_stops.shape[0]

# Feature cells = rows x (all columns minus the target column)
num_vals = n_rows * (scaled_stops.shape[1] - 1)
print(f'The dataset (without target) has a total of {num_vals} values')

# Count every NaN in the frame
num_nan = scaled_stops.isna().sum().sum()
print(f'There are {num_nan} NaN values')

# Share of feature cells that are missing
percent_nan = (num_nan / num_vals) * 100
print(f'{round(percent_nan, 2)}% of the dataset is missing')

# Keep only the rows whose feature columns contain at least one NaN
nan_cols = scaled_stops.loc[
    scaled_stops[features].isna().any(axis=1),
    features,
]
nan_rows = nan_cols.shape[0]
print(f'There are {nan_rows} rows with missing values')

# Share of rows affected by missing data
total_missing = (nan_rows / n_rows) * 100
print(f'{round(total_missing, 2)}% of the rows contain missing values')

The dataset (without target) has a total of 819828 values
There are 148580 NaN values
18.12% of the dataset is missing
There are 30222 rows with missing values
99.53% of the rows contain missing values
