## Feature Optimization for Logistic Regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [272]:
#Importing data
openpolicing_path="C:/Users/SwetaMankala/Desktop/Assignments/EAI6000/ma_statewide_2020_04_01.csv"

data=pd.read_csv(openpolicing_path,low_memory=False)

In [273]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3416238 entries, 0 to 3416237
Data columns (total 24 columns):
raw_row_number                int64
date                          object
location                      object
county_name                   object
subject_age                   float64
subject_race                  object
subject_sex                   object
type                          object
arrest_made                   object
citation_issued               object
outcome                       object
contraband_found              object
contraband_drugs              object
contraband_weapons            object
contraband_alcohol            bool
contraband_other              object
frisk_performed               object
search_conducted              bool
search_basis                  object
reason_for_stop               object
vehicle_type                  object
vehicle_registration_state    object
raw_Race                      object
dtypes: bool(2), float64(1), int64(1), o

In [274]:
##Convert date to datetype
data.date
data["date"] = pd.to_datetime(data.date, format="%Y-%m-%d")
#Extract time
data['time'] = [d.time() for d in data['date']]
#Convert date back to original form
data['date'] = pd.to_datetime(data['date']).dt.date

In [275]:
data.head(5)

Unnamed: 0,raw_row_number,date,location,county_name,subject_age,subject_race,subject_sex,type,arrest_made,citation_issued,...,contraband_alcohol,contraband_other,frisk_performed,search_conducted,search_basis,reason_for_stop,vehicle_type,vehicle_registration_state,raw_Race,time
0,1,2007-06-06,MIDDLEBOROUGH,Plymouth County,33.0,white,male,vehicular,False,True,...,False,,,False,,Speed,Passenger,MA,White,00:00:00
1,2,2007-06-07,SEEKONK,Bristol County,36.0,white,male,vehicular,False,False,...,False,False,False,True,other,,Commercial,MA,White,00:00:00
2,3,2007-06-07,MEDFORD,Middlesex County,56.0,white,female,vehicular,False,False,...,False,,,False,,,Passenger,MA,White,00:00:00
3,4,2007-06-07,MEDFORD,Middlesex County,37.0,white,male,vehicular,False,False,...,False,,,False,,,Commercial,MA,White,00:00:00
4,5,2007-06-07,EVERETT,Middlesex County,22.0,hispanic,female,vehicular,False,True,...,False,,,False,,,Commercial,MA,Hispanic,00:00:00


In [276]:
#Convering race, sex, location etc to categorical
data.iloc[:,np.r_[2:4,5:8,9:12,19:23]]=data.iloc[:,np.r_[2:4,5:8,9:12,19:23]].astype("category")

In [277]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3416238 entries, 0 to 3416237
Data columns (total 25 columns):
raw_row_number                int64
date                          object
location                      category
county_name                   category
subject_age                   float64
subject_race                  category
subject_sex                   category
type                          category
arrest_made                   object
citation_issued               category
outcome                       category
contraband_found              object
contraband_drugs              object
contraband_weapons            object
contraband_alcohol            bool
contraband_other              object
frisk_performed               object
search_conducted              bool
search_basis                  category
reason_for_stop               category
vehicle_type                  category
vehicle_registration_state    category
raw_Race                      object
time              

In [278]:
df = pd.DataFrame(data)
median1 = df['subject_age'].median()
df['subject_age'].fillna(median1, inplace = True)

In [279]:
df.isnull().sum()

raw_row_number                      0
date                                0
location                         6666
county_name                      6666
subject_age                         0
subject_race                     1664
subject_sex                     15623
type                                0
arrest_made                       916
citation_issued                   916
outcome                          6858
contraband_found              3360508
contraband_drugs              3360508
contraband_weapons            3360508
contraband_alcohol                  0
contraband_other              3360508
frisk_performed               3361607
search_conducted                    0
search_basis                  3365209
reason_for_stop               1659589
vehicle_type                     4963
vehicle_registration_state       9814
raw_Race                         1664
time                                0
dtype: int64

## Fill NAs and normalize columns:

In [280]:
#Segregate the values based on the categories, remove the nulls and normalize the data column
df['race'] = pd.Series(len(df['subject_race']), index=df.index)
df['race'] = 0

#To assign null values
df.loc[(df['subject_race'] != 'hispanic') | 
           (df['subject_race'] != 'white') |
           (df['subject_race'] != 'black') |
           (df['subject_race'] != 'asian/pacific islander') |
           (df['subject_race'] != 'other') |
           (df['subject_race'].isnull() == True), 'race'] = np.nan

#To assign the categorical values to the dataframe 'race'
df.loc[(df['subject_race'] == 'hispanic') | 
           (df['subject_race'] == 'white') |
           (df['subject_race'] == 'black') |
           (df['subject_race'] == 'other') |
           (df['subject_race'] == 'asian/pacific islander'), 'race'] = df['subject_race']

race_copy = df['race'].copy(deep = True)

# Fill NaN values.
df['race'].fillna(value = 1, inplace = True)

# Obtain values for every race.Axis=0 for rows
race_copy.dropna(axis = 0, inplace = True)
sorted_race = race_copy.value_counts(normalize = True).sort_index()

# Fill one values for individual person with randomly picked from random choice.
df['race'] = df['race'].apply(lambda x: np.random.choice([x for x in sorted_race.index],
                                replace = True, p = sorted_race) if (x == 1) else x).astype(str)

#Normalize=True prints the relative frequency of the values
print("\nFilled NaNs normalized:\n", df['race'].value_counts(normalize = True))

df['subject_race'] = df['race']
df['subject_race'].value_counts()


Filled NaNs normalized:
 white                     0.744604
black                     0.103486
hispanic                  0.099563
asian/pacific islander    0.049106
other                     0.003242
Name: race, dtype: float64


white                     2543743
black                      353532
hispanic                   340130
asian/pacific islander     167759
other                       11074
Name: subject_race, dtype: int64

In [281]:
#Segregate the values based on the categories, remove the nulls and normalize the data column
df['sex'] = pd.Series(len(df['subject_sex']), index = df.index)
df['sex'] = 0

# Randomly stick sex to every user with NaN value.
df.loc[(df['subject_sex'] != 'male') | 
           (df['subject_sex'] != 'female') |
           (df['subject_sex'].isnull() == True), 'sex'] = np.nan
df.loc[(df['subject_sex'] == 'male') | 
           (df['subject_sex'] == 'female'), 'sex'] = df['subject_sex']


# Create a copy to calculate proportions.
sex_copy = df['sex'].copy(deep = True)

# Fill NaN values.
df['sex'].fillna(value = 1, inplace = True)

# Obtain values for every sex.
sex_copy.dropna(axis = 0, inplace = True)
sorted_sex = sex_copy.value_counts(normalize = True).sort_index()

# Fill one values in suspector_sex_rand with randomly picked from random choice.
df['sex'] = df['sex'].apply(lambda x: np.random.choice([x for x in sorted_sex.index],
                                replace = True, p = sorted_sex) if (x == 1) else x).astype(str)
print("Gender proportions after filled NaNs: \n", df['sex'].value_counts(normalize = True))

df['subject_sex'] = df['sex']
df['subject_sex'].value_counts()

Gender proportions after filled NaNs: 
 male      0.694644
female    0.305356
Name: sex, dtype: float64


male      2373068
female    1043170
Name: subject_sex, dtype: int64

In [282]:
#Segregate the values based on the categories, remove the nulls and normalize the data column
df['outcome_v'] = pd.Series(len(df['outcome']), index = df.index)
df['outcome_v'] = 0

# Randomly stick sex to every user with NaN value.
df.loc[(df['outcome'] != 'citation') | 
           (df['outcome'] != 'warning') |
           (df['outcome'] != 'arrest') |
           (df['outcome'].isnull() == True), 'outcome_v'] = np.nan
df.loc[(df['outcome'] != 'citation') | 
           (df['outcome'] != 'warning') |
           (df['outcome'] != 'arrest'), 'outcome_v'] = df['outcome']


# Create a copy to calculate proportions.
outcome_copy = df['outcome_v'].copy(deep = True)

# Fill NaN values.
df['outcome_v'].fillna(value = 1, inplace = True)

outcome_copy.dropna(axis = 0, inplace = True)
sorted_outcome = outcome_copy.value_counts(normalize = True).sort_index()

# Fill one values in suspector_sex_rand with randomly picked from random choice.
df['outcome_v'] = df['outcome_v'].apply(lambda x: np.random.choice([x for x in sorted_outcome.index],
                                replace = True, p = sorted_outcome) if (x == 1) else x).astype(str)
print("Outcome proportions after filled NaNs: \n", df['outcome_v'].value_counts(normalize = True))

df['outcome'] = df['outcome_v']
df['outcome'].value_counts()

Outcome proportions after filled NaNs: 
 citation    0.636869
arrest      0.026986
Name: outcome_v, dtype: float64


citation    2175695
arrest        92192
Name: outcome, dtype: int64

In [283]:
#Segregate the values based on the categories, remove the nulls and normalize the data column
df['vehicle'] = pd.Series(len(df['vehicle_type']), index = df.index)
df['vehicle'] = 0

df.loc[(df['vehicle_type'] != 'Commerical') | 
           (df['vehicle_type'] != 'Passenger') |
           (df['vehicle_type'] != 'Motorcycle') |
           (df['vehicle_type'] != 'Taxi/Livery') |
           (df['vehicle_type'] != 'Trailer') |
           (df['vehicle_type'].isnull() == True), 'vehicle'] = np.nan
df.loc[(df['vehicle_type'] != 'Commerical') | 
           (df['vehicle_type'] != 'Passenger') |
           (df['vehicle_type'] != 'Motorcycle') |
           (df['vehicle_type'] != 'Taxi/Livery') |
           (df['vehicle_type'] != 'Trailer'), 'vehicle'] = df['vehicle_type']


# Create a copy to calculate proportions.
outcome_copy = df['vehicle'].copy(deep = True)

# Fill NaN values.
df['vehicle'].fillna(value = 1, inplace = True)

outcome_copy.dropna(axis = 0, inplace = True)
sorted_outcome = outcome_copy.value_counts(normalize = True).sort_index()

# Fill one values in suspector_sex_rand with randomly picked from random choice.
df['vehicle'] = df['vehicle'].apply(lambda x: np.random.choice([x for x in sorted_outcome.index],
                                replace = True, p = sorted_outcome) if (x == 1) else x).astype(str)
print("Vehicle Type proportions after filled NaNs: \n", df['vehicle'].value_counts(normalize = True))

df['vehicle_type'] = df['vehicle']
df['vehicle_type'].value_counts()

Vehicle Type proportions after filled NaNs: 
 Passenger      9.312852e-01
Commercial     5.693075e-02
Taxi/Livery    5.728231e-03
Motorcycle     3.671583e-03
Trailer        2.383616e-03
66             2.927197e-07
91             2.927197e-07
Name: vehicle, dtype: float64


Passenger      3181492
Commercial      194489
Taxi/Livery      19569
Motorcycle       12543
Trailer           8143
66                   1
91                   1
Name: vehicle_type, dtype: int64

In [289]:
df['date'] = pd.to_datetime(df['date'])
df['date'] = pd.to_numeric(df['date'])
df['date'] = df['date'].astype(float)

## Use make_dummies for One Hot Encoding:

In [309]:
categorical_vars = ['subject_race',
                    'subject_sex',
                    'arrest_made',
                    'citation_issued',
                    'outcome',
                    'warning_issued',
                    'contraband_found',
                    'contraband_drugs',
                    'contraband_weapons',
                    'contraband_alcohol',
                    'contraband_other',
                    'frisk_performed',
                    'search_conducted',
                    'search_basis',
                    'reason_for_stop', 
                    'vehicle_type',
                    'vehicle_registration_state']

def make_dummies(dataset, dummy_list):
    for i in dummy_list:
        dummy = pd.get_dummies(dataset[i], prefix= i, dummy_na= False)
        dataset = dataset.drop(i,1)
        dataset = pd.concat([dataset,dummy], axis = 1)
    return dataset

dummy_data =make_dummies(df,categorical_vars)

print(dummy_data.head(5))

   raw_row_number          date       location       county_name  subject_age  \
0               1  1.181088e+18  MIDDLEBOROUGH   Plymouth County         33.0   
1               2  1.181174e+18        SEEKONK    Bristol County         36.0   
2               3  1.181174e+18        MEDFORD  Middlesex County         56.0   
3               4  1.181174e+18        MEDFORD  Middlesex County         37.0   
4               5  1.181174e+18        EVERETT  Middlesex County         22.0   

        type  raw_Race      time      race     sex  ...  \
0  vehicular     White  00:00:00     white    male  ...   
1  vehicular     White  00:00:00     white    male  ...   
2  vehicular     White  00:00:00     white  female  ...   
3  vehicular     White  00:00:00     white    male  ...   
4  vehicular  Hispanic  00:00:00  hispanic  female  ...   

  vehicle_registration_state_SD vehicle_registration_state_TN  \
0                             0                             0   
1                           

In [310]:
new_data=pd.concat([df['subject_race'],dummy_data],axis=1)

In [312]:
new_data.drop(['raw_row_number','date','location','county_name', 'vehicle', 'time', 'race', 'sex', 'outcome_v', 'type', 'time', 'raw_Race', 'race'], axis=1, inplace=True)

In [313]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3416238 entries, 0 to 3416237
Data columns (total 100 columns):
subject_race                                object
subject_age                                 float64
subject_race_asian/pacific islander         uint8
subject_race_black                          uint8
subject_race_hispanic                       uint8
subject_race_other                          uint8
subject_race_white                          uint8
subject_sex_female                          uint8
subject_sex_male                            uint8
arrest_made_False                           uint8
arrest_made_True                            uint8
citation_issued_False                       uint8
citation_issued_True                        uint8
outcome_arrest                              uint8
outcome_citation                            uint8
contraband_found_False                      uint8
contraband_found_True                       uint8
contraband_drugs_False           

In [314]:
X = new_data.drop(['subject_race'], axis= 1)
#X = X.values.reshape(-1,1)

Y = new_data['subject_race']

## Train Model with Multi-class Logistic Regression

We determine the the subject's race based on the factors present in our data.

In [320]:
#Split the dataset into train and test
from sklearn.model_selection import train_test_split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.3, random_state = 0)

In [347]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(C=100, solver='liblinear', random_state = 0)
lr_fit = classifier.fit(X_Train, Y_Train)

In [348]:
lr_fit

LogisticRegression(C=100, random_state=0, solver='liblinear')

In [322]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Predicting the train set results
Y_Pred = classifier.predict(X_Train)

# Making the Confusion Matrix 
print(confusion_matrix(Y_Train, Y_Pred))
print(classification_report(Y_Train, Y_Pred))
print("Train accuracy:", accuracy_score(Y_Train, Y_Pred))

[[ 117496       0       0       0       0]
 [      0  247409       0       0       0]
 [      0       0  238240       0       0]
 [      0       0       0    7731       0]
 [      0       0       0       0 1780490]]
                        precision    recall  f1-score   support

asian/pacific islander       1.00      1.00      1.00    117496
                 black       1.00      1.00      1.00    247409
              hispanic       1.00      1.00      1.00    238240
                 other       1.00      1.00      1.00      7731
                 white       1.00      1.00      1.00   1780490

              accuracy                           1.00   2391366
             macro avg       1.00      1.00      1.00   2391366
          weighted avg       1.00      1.00      1.00   2391366

Train accuracy: 1.0


In [323]:
Pred = classifier.predict(X_Test)
print(confusion_matrix(Y_Test, Pred))
print(classification_report(Y_Test, Pred))
print("Train accuracy:", accuracy_score(Y_Test, Pred))

[[ 50263      0      0      0      0]
 [     0 106123      0      0      0]
 [     0      0 101890      0      0]
 [     0      0      0   3343      0]
 [     0      0      0      0 763253]]
                        precision    recall  f1-score   support

asian/pacific islander       1.00      1.00      1.00     50263
                 black       1.00      1.00      1.00    106123
              hispanic       1.00      1.00      1.00    101890
                 other       1.00      1.00      1.00      3343
                 white       1.00      1.00      1.00    763253

              accuracy                           1.00   1024872
             macro avg       1.00      1.00      1.00   1024872
          weighted avg       1.00      1.00      1.00   1024872

Train accuracy: 1.0


In [371]:
labels = new_data['vehicle_registration_state_WY'].values
dataLR = new_data.drop(['vehicle_registration_state_WY'], axis=1)

In [386]:
dataLR.drop(['subject_race'], axis=1, inplace=True)

## Logistic Regression using Gradient Descent

In [372]:
def Regularized_gradient_descent(data, labels, y, weights, eta, lambda_par):

    for k in range(len(data.columns)+1): # For the weights
        
        if k == 99:
            summation = (labels - y).sum()
            weights[k] = weights[k] + eta*summation
            
        else:        
            summation = ((labels - y)*data[k]).sum()
            weights[k] = weights[k] + eta*(summation - lambda_par*weights[k])

    return weights

In [373]:
def Regularized_cross_entropy_err(labels, y, weights, lambda_par):
    error = 0
    for i in range(len(y)):
        if y[i] < (np.exp(-16)):
            y[i] = np.exp(-16)
           
        if y[i] > 1 - np.exp(-16):
            y[i] = (1 - np.exp(-16))
            
        error += (labels[i] * math.log(y[i])) + ((1 - labels[i]) * math.log(1 - y[i]))
    reg_ex = (lambda_par/2)*(weights**2).sum()
    cross_entropy_err = -error + reg_ex
            
    return cross_entropy_err

In [390]:
import math

def Reg_Logistic_Regression(data, labels, iterations, eta, lambda_par):
    
    master_err = []
    intial_weight = 0.5
    weights = np.array([intial_weight]*99)
    
    for i in range(iterations):
        
        
        y = np.empty(len(data))    
        
        #Iterating over the data
        for t in range(len(data)): # No. of examples
            
            prod = np.matmul(weights[:98], data.iloc[t])
                
            gx = prod + weights[98]
            if -gx > 700:
                gx = -700
            #print gx
        
            sigmoid = 1 / (1 + math.exp(-gx))
            y[t] = sigmoid
            
        weights = Regularized_gradient_descent(data, labels, y, weights, eta, lambda_par)
        
        # Cross entropy error
        error = Regularized_cross_entropy_err(labels, y, weights, lambda_par)
        master_err.append(error)    
        
    # Predicting from final weights
    pred = predict(data, weights)
    
    # Cross entropy error
    error = Regularized_cross_entropy_err(labels, y, weights, lambda_par)
        
    # Training error
    cl_err = classification_error(labels, pred)
    
    # L2 - norm
    l2_norm = np.sqrt((weights ** 2).sum())
    
    return cl_err, master_err, l2_norm

In [381]:
def classification_error(labels, predictions):
    wrong = 0.0
    for i in range(len(labels)):
        if labels[i] != predictions[i]:
            wrong += 1
    return wrong/len(labels)*100

In [376]:
def predict(data, weights):
    
    pred = np.empty(len(data))
    
    for t in range(len(data)): # No. of examples
        
        prod = np.matmul(weights[:99], data.iloc[t])
        gx = prod + weights[99]

        sigmoid = 1 / (1 + math.exp(-gx))
        
        # Classifying
        if sigmoid < 0.5:
            y = 0
        else:
            y = 1

        pred[t] = y
        
    return pred

For different values of eta, keeping lambda = 0 for now

In [None]:
eta_values = [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 1.5]
lambda_par = 0

for eta in eta_values: 
    print("eta =", eta)   
    tr_err, ce_error, l2 = Reg_Logistic_Regression(dataLR, labels, 50, eta, lambda_par)
    print("Training Error:\t\t\t", tr_err)
    #print("Cross-Entropy Error:\t\t", ce_error[len(ce_error)-1])
    print("L2-Norm (||w||_2):\t\t", l2, "\n")
    
    # Plotting the figures of Cross Entropy vs Iterations at different values of 'eta'
    plt.figure(figsize=(12, 8))
    plt.xlabel("No. of Iterations")
    plt.ylabel("Cross Entropy Error")
    plt.plot(ce_error)
    plt.title("Gradient Descent for eta = {}".format(eta))

eta = 0.001
