In [1]:
!pip install pandas pyarrow



In [2]:
import os
import numpy as np
print(os.getcwd())

import pandas as pd

file_path = 'nc_reduced_data.feather'
data = pd.read_feather(file_path)
print(data.head())

/Users/fasihz/Duke/COMPSCI 526: Data Science
             datetime       file_source county_name_cat  subject_age  \
0 2000-01-11 23:30:00  nc_winston-salem             NaN         49.0   
1 2000-01-12 00:59:00  nc_winston-salem             NaN         21.0   
2 2000-01-12 21:05:00  nc_winston-salem             NaN         22.0   
3 2000-01-12 21:05:00  nc_winston-salem             NaN         21.0   
4 2000-01-12 22:37:00  nc_winston-salem             NaN         19.0   

  subject_sex_cat subject_race_cat raw_ethnicity_cat  \
0          female            black                 N   
1            male            black                 N   
2            male            white                 N   
3          female            black                 N   
4          female            black                 N   

                                department_name_cat outcome_cat  \
1  Winston-Salem State University Police Department    citation   
3  Winston-Salem State University Police Department

In [3]:
print(data.columns)
for column in data.columns:
    unique_values = data[column].unique()
    print(f"columns: {column}")
    print(f"number of unique values: {len(unique_values)}")
    print(f"unique values: {unique_values}")

Index(['datetime', 'file_source', 'county_name_cat', 'subject_age',
       'subject_sex_cat', 'subject_race_cat', 'raw_ethnicity_cat',
       'department_name_cat', 'outcome_cat', 'raw_action_description_cat',
       'contraband_found_cat', 'contraband_drugs_cat',
       'contraband_weapons_cat', 'reason_for_frisk_cat',
       'reason_for_search_cat', 'reason_for_stop_cat'],
      dtype='object')
columns: datetime
number of unique values: 5680426
unique values: <DatetimeArray>
['2000-01-11 23:30:00', '2000-01-12 00:59:00', '2000-01-12 21:05:00',
 '2000-01-12 22:37:00', '2000-01-12 23:05:00', '2000-01-13 23:50:00',
 '2000-01-12 23:30:00', '2000-01-15 18:45:00', '2000-01-30 01:20:00',
 '2000-01-28 08:57:00',
 ...
 '2015-12-08 13:49:00', '2010-12-08 19:59:00', '2015-12-31 04:40:00',
 '2015-10-26 16:27:00', '2015-10-27 15:52:00', '2015-12-30 19:15:00',
 '2015-11-12 13:48:00', '2015-09-28 08:51:00', '2015-12-26 17:01:00',
 '2015-10-30 19:27:00']
Length: 5680426, dtype: datetime64[ns]
column

In [4]:
outcome_counts = data['raw_action_description_cat'].value_counts()
print(outcome_counts)

#This variable is more nuanced, we can see the NaN values in outcome_cat variable are no action taken. 
#We will replace these with no action taken. More analysis below to confirm. 

raw_action_description_cat
Citation Issued    15883010
No Action Taken      744931
On-View Arrest       541396
Name: count, dtype: int64


In [5]:
# This code first finds rows where 'outcome_cat' is NaN, and check if 'raw_action_description_cat' is 'No Action Taken' for all these rows

# Find rows where 'outcome_cat' is NaN
nan_rows = data[data['outcome_cat'].isna()]

# Check if 'raw_action_description_cat' is 'No Action Taken' for all these rows
mismatch_indices = nan_rows[nan_rows['raw_action_description_cat'] != 'No Action Taken'].index

if len(mismatch_indices) > 0:
    print(f"Mismatch found at indices: {mismatch_indices.tolist()}")
else:
    print("All instances match: 'No Action Taken' where 'outcome_cat' is NaN")

All instances match: 'No Action Taken' where 'outcome_cat' is NaN


In [6]:
print(data['subject_race_cat'].unique())

values_to_remove = ['unknown', 'other']
df_clean = data[~data['subject_race_cat'].isin(values_to_remove) & data['subject_race_cat'].notna()]

['black', 'white', 'hispanic', 'unknown', 'asian/pacific islander', 'other', NaN]
Categories (6, object): ['asian/pacific islander', 'black', 'hispanic', 'other', 'unknown', 'white']


In [7]:
df_clean
data = df_clean
data
print(data['subject_race_cat'].isna().sum())

#no missing values in race column.

print(data['subject_sex_cat'].isna().sum())

#no missing values in sex column either. 

print(data['outcome_cat'].isna().sum())

#Around 10000 missing values in outcome, these are No action taken.

0
0
733821


In [8]:
print(data['subject_race_cat'].value_counts())

subject_race_cat
white                     13562220
black                      8323550
hispanic                   1967984
asian/pacific islander      292691
other                            0
unknown                          0
Name: count, dtype: int64


In [9]:
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# I defined all possible categories upfront for consistency
all_race_categories = ['black', 'white', 'hispanic', 'asian/pacific islander']
all_sex_categories = ['female', 'male', 'unknown']
all_outcome_categories = ['warning', 'citation', 'arrest', 'no_action']

# I converted categorical columns to pd.Categorical with predefined categories
data['subject_race_cat'] = pd.Categorical(data['subject_race_cat'], categories=all_race_categories)
data['subject_sex_cat'] = pd.Categorical(data['subject_sex_cat'], categories=all_sex_categories)
data['outcome_cat'] = pd.Categorical(data['outcome_cat'], categories=all_outcome_categories)

import statsmodels.api as sm

# I also removed rows with NaN values in 'subject_age'**
data = data.dropna(subset=['subject_age'])

# Define numeric and categorical features
numeric_features = ['subject_age']
categorical_features = ['subject_race_cat', 'subject_sex_cat']

# Standardize numeric features
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(data[numeric_features])

# Convert the scaled numeric data to a DataFrame
X_numeric_df = pd.DataFrame(X_numeric_scaled, columns=numeric_features)

# One-hot encode categorical features
onehot = OneHotEncoder(drop='first', sparse_output=True)
X_categorical_encoded = onehot.fit_transform(data[categorical_features])

# Convert the one-hot encoded data to a DataFrame
categorical_col_names = onehot.get_feature_names_out(categorical_features)
# Convert to a sparse DataFrame directly
X_categorical_df = pd.DataFrame.sparse.from_spmatrix(X_categorical_encoded, columns=onehot.get_feature_names_out(categorical_features))

# Combine numeric and categorical features into one DataFrame using pandas.concat()
X_combined = pd.concat([X_numeric_df.reset_index(drop=True), X_categorical_df.reset_index(drop=True)], axis=1)

The asian race category has been set as reference category above. 
Also female has been set as reference category in sex. 

In [10]:
import statsmodels.api as sm

# Create the dependent variable (outcome) as a categorical variable
data['outcome_cat'] = data['outcome_cat'].astype('category')

# Add 'No action taken' to the existing categories of 'outcome_cat'
data['outcome_cat'] = data['outcome_cat'].cat.add_categories('No action taken')

# Replace NaN values in 'outcome_cat' with 'No action taken'
data['outcome_cat'] = data['outcome_cat'].fillna('No action taken')

# Verify the changes
print(data['outcome_cat'].value_counts())

outcome_cat
citation           15550500
No action taken      733722
arrest               534610
no_action                 0
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['outcome_cat'] = data['outcome_cat'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['outcome_cat'] = data['outcome_cat'].cat.add_categories('No action taken')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['outcome_cat'] = data['outcome_cat'].fillna('No action take

In [11]:
# Remove the 'no_action' category from the 'outcome_cat' column
data['outcome_cat'] = data['outcome_cat'].cat.remove_categories('no_action')

# Verify the changes
print(data['outcome_cat'].value_counts())

outcome_cat
citation           15550500
No action taken      733722
arrest               534610
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['outcome_cat'] = data['outcome_cat'].cat.remove_categories('no_action')


In [12]:
print(data['subject_age'].describe())  # Check the distribution of subject_age

print(data['outcome_cat'].value_counts())  # Check the distribution of outcome categories

count    2.414407e+07
mean     3.492775e+01
std      1.344198e+01
min      1.000000e+01
25%      2.400000e+01
50%      3.200000e+01
75%      4.400000e+01
max      1.100000e+02
Name: subject_age, dtype: float64
outcome_cat
citation           15550500
No action taken      733722
arrest               534610
Name: count, dtype: int64


In [13]:
# Set 'No action taken' as the reference category
data['outcome_cat'] = data['outcome_cat'].cat.set_categories(['No action taken', 'citation', 'warning', 'arrest'])

# One-hot encode the outcome_cat column, setting 'No action taken' as the reference category
Y = pd.get_dummies(data['outcome_cat'], drop_first=False)

Y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,No action taken,citation,warning,arrest
0,False,False,True,False
1,False,True,False,False
2,False,False,True,False
3,False,True,False,False
4,False,False,True,False
...,...,...,...,...
24607106,False,True,False,False
24607107,False,False,False,True
24607108,False,True,False,False
24607109,False,False,True,False


In [14]:
print(X_combined.nunique())

subject_age                  100
subject_race_cat_black         2
subject_race_cat_hispanic      2
subject_race_cat_white         2
subject_sex_cat_male           2
dtype: int64


In [18]:
correlation_matrix = X_combined.corr()
print(correlation_matrix)

                           subject_age  subject_race_cat_black  \
subject_age                   1.000000               -0.025919   
subject_race_cat_black       -0.025919                1.000000   
subject_race_cat_hispanic    -0.090561               -0.216053   
subject_race_cat_white        0.075633               -0.821007   
subject_sex_cat_male          0.018464               -0.041943   

                           subject_race_cat_hispanic  subject_race_cat_white  \
subject_age                                -0.090561                0.075633   
subject_race_cat_black                     -0.216053               -0.821007   
subject_race_cat_hispanic                   1.000000               -0.337193   
subject_race_cat_white                     -0.337193                1.000000   
subject_sex_cat_male                        0.085673               -0.007811   

                           subject_sex_cat_male  
subject_age                            0.018464  
subject_race_cat_black

**Time to regress!!

In [19]:
print(Y.index.equals(X_combined.index))  # Should print True, but it does not. so need to do aligning. 

Y = Y.reset_index(drop=True)
X_combined = X_combined.reset_index(drop=True)

print(Y.index.equals(X_combined.index))

#Good to go now! 

True
True


In [20]:
X_combined['Male_black'] = X_combined['subject_race_cat_black'] * X_combined['subject_sex_cat_male']
X_combined['Male_white'] = X_combined['subject_race_cat_white'] * X_combined['subject_sex_cat_male']

logit_model_i = sm.MNLogit(Y, X_combined)
result_i = logit_model_i.fit()
print(result_i.summary())

Optimization terminated successfully.
         Current function value: 0.831265
         Iterations 8
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:             24144073
Model:                        MNLogit   Df Residuals:                 24144052
Method:                           MLE   Df Model:                           18
Date:                Mon, 18 Nov 2024   Pseudo R-squ.:                0.005380
Time:                        12:48:43   Log-Likelihood:            -2.0070e+07
converged:                       True   LL-Null:                   -2.0179e+07
Covariance Type:            nonrobust   LLR p-value:                     0.000
               y=citation       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
subject_age                  -0.3197      0.001   -294.182      0.000      -0.

In [21]:
#variation inflation factors seem fine now! None are above 10. 

from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.Series([variance_inflation_factor(X_combined, i) for i in range(X_combined.shape[1])], 
                index=X_combined.columns)
print(vif)

subject_age                   1.011591
subject_race_cat_black        2.570864
subject_race_cat_hispanic     3.235521
subject_race_cat_white        2.742160
subject_sex_cat_male         28.963818
Male_black                   12.120963
Male_white                   18.930985
dtype: float64
