In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

In [2]:
# Read data files
# NOTE(review): the leftover warning text under this cell looks like the tail of
# pandas' mixed-dtype DtypeWarning -- confirm. low_memory=False makes pandas infer
# each column's dtype from the whole file instead of chunk by chunk, which is the
# documented way to avoid mixed-type columns.
train_df = pd.read_csv("train.csv", encoding="latin1", low_memory=False)
test_df = pd.read_csv("test.csv", low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Select only common columns from both train and test df
# Walk test_df.columns instead of intersecting two sets: a set of strings has no
# stable order, so the old approach reshuffled train_df's columns on every run and
# gave no guarantee that they lined up with test_df's column order.
common_col = [col for col in test_df.columns if col in train_df.columns]
common_col.append('compliance')
train_df = train_df[common_col] # We have 28 cols for train and 27 cols for test
# Keep only rows that actually have a compliance label to train on.
train_df = train_df[train_df['compliance'].notnull()]

In [None]:
# Select only USA country
# NOTE(review): the same filter is applied to test_df, so non-USA test tickets are
# removed here and never receive a prediction -- confirm this is intended.
train_is_usa = train_df['country'].eq('USA')
test_is_usa = test_df['country'].eq('USA')
train_df = train_df.loc[train_is_usa]
test_df = test_df.loc[test_is_usa]

In [None]:
# Drop unnecessary columns
drop_col = [
    'ticket_id', 'admin_fee', 'state_fee', 'violation_zip_code',
    'mailing_address_str_number', 'mailing_address_str_name', 'inspector_name',
    'zip_code', 'violation_code', 'violator_name', 'violation_street_name',
    'violation_street_number', 'city', 'violation_description', 'grafitti_status',
    'non_us_str_code', 'judgment_amount', 'country', 'ticket_issued_date',
    'hearing_date', 'disposition', 'clean_up_cost',
]
train_df = train_df.drop(columns=drop_col)
# Grab the ticket ids before 'ticket_id' is removed from test_df along with the
# rest of drop_col.
test_id = test_df['ticket_id']
test_df = test_df.drop(columns=drop_col)

In [None]:
# Determine data types
# Casts shared by both frames: the three amount columns become floats and the
# agency name becomes a pandas categorical.
feature_dtypes = {
    'discount_amount': float,
    'fine_amount': float,
    'late_fee': float,
    'agency_name': 'category',
}

# Only train_df carries the target column, which is additionally cast to int.
train_df = train_df.astype(dict(feature_dtypes, compliance=int))
test_df = test_df.astype(feature_dtypes)

In [None]:
# Encode agency names
# One-hot encode the agency labels directly. The encoder is fitted on train_df
# only; handle_unknown='ignore' makes an agency that occurs only in test_df encode
# as an all-zero row, whereas LabelEncoder.transform raises ValueError on labels it
# did not see during fit. ohe.categories_[0] lists the agency behind each column.
train_agency = np.asarray(train_df['agency_name'], dtype=object).reshape(-1, 1)
test_agency = np.asarray(test_df['agency_name'], dtype=object).reshape(-1, 1)

ohe = OneHotEncoder(handle_unknown='ignore').fit(train_agency)
train_ohe = ohe.transform(train_agency)
test_ohe = ohe.transform(test_agency)
# Densify the sparse indicator matrices; the resulting frames get default
# positional row and column labels (0, 1, 2, ...).
train_ohe_df = pd.DataFrame(train_ohe.toarray())
test_ohe_df = pd.DataFrame(test_ohe.toarray())

In [None]:
# Encode states
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# train_df['state']: that form is chained assignment, which pandas warns about and
# which leaves the frame unchanged when Copy-on-Write is enabled.
train_df['state'] = train_df['state'].fillna('N/A')
test_df['state'] = test_df['state'].fillna('N/A')

# One-hot encode the state labels directly. The encoder is fitted on train_df only;
# handle_unknown='ignore' makes a state that occurs only in test_df encode as an
# all-zero row, whereas LabelEncoder.transform raises ValueError on unseen labels.
train_state = np.asarray(train_df['state'], dtype=object).reshape(-1, 1)
test_state = np.asarray(test_df['state'], dtype=object).reshape(-1, 1)

ohe2 = OneHotEncoder(handle_unknown='ignore').fit(train_state)
train_ohe2 = ohe2.transform(train_state)
test_ohe2 = ohe2.transform(test_state)
# Densify the sparse indicator matrices; the resulting frames get default
# positional row and column labels (0, 1, 2, ...).
train_ohe_df2 = pd.DataFrame(train_ohe2.toarray())
test_ohe_df2 = pd.DataFrame(test_ohe2.toarray())

In [None]:
# Merge dataframes
# Renumber the rows 0..n-1 first: concat(axis=1) aligns on the index, and the
# one-hot frames are numbered from 0 while the filtered frames still carry their
# original row labels.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Both one-hot frames label their columns 0, 1, 2, ..., which would give the
# merged frame duplicate labels and a mix of int and str column names (recent
# scikit-learn versions refuse mixed-type feature names). Prefix them so every
# column name is a unique string.
train_df_cleaned = pd.concat(
    [train_df, train_ohe_df.add_prefix('agency_'), train_ohe_df2.add_prefix('state_')],
    axis=1,
)
test_df_cleaned = pd.concat(
    [test_df, test_ohe_df.add_prefix('agency_'), test_ohe_df2.add_prefix('state_')],
    axis=1,
)

In [None]:
# Feature engineering
# Binary indicators: 1 where the amount is greater than zero, otherwise 0 (a
# missing amount compares False and therefore becomes 0). The comparison is
# vectorized over the whole column rather than evaluated row by row with apply.
train_df_cleaned['discount'] = (train_df_cleaned['discount_amount'] > 0).astype(int)
train_df_cleaned['late'] = (train_df_cleaned['late_fee'] > 0).astype(int)
test_df_cleaned['discount'] = (test_df_cleaned['discount_amount'] > 0).astype(int)
test_df_cleaned['late'] = (test_df_cleaned['late_fee'] > 0).astype(int)

# The raw label and amount columns are no longer needed once the one-hot columns
# and the two indicators exist.
train_df_cleaned = train_df_cleaned.drop(columns=['agency_name', 'state', 'late_fee', 'discount_amount'])
test_df_cleaned = test_df_cleaned.drop(columns=['agency_name', 'state', 'late_fee', 'discount_amount'])

In [None]:
# Split into train, test
# Features are every column except the target; 20% of the labelled rows are held
# out, with a fixed seed so the split is repeatable.
X = train_df_cleaned.drop(columns='compliance')
y = train_df_cleaned['compliance']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fit a gradient boosting classifier and make predictions
gbc = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
# predict_proba returns one column per class in gbc.classes_; column 1 is used
# below as the score for the second class.
gbc_train = gbc.predict_proba(X_train)
gbc_test = gbc.predict_proba(X_test)

# ROC AUC on the rows used for fitting and on the held-out rows.
# NOTE(review): roc_curve expects a binary target -- confirm compliance is 0/1.
train_fpr, train_tpr, _train_thresholds = metrics.roc_curve(y_train, gbc_train[:, 1])
test_fpr, test_tpr, _test_thresholds = metrics.roc_curve(y_test, gbc_test[:, 1])
train_score, test_score = metrics.auc(train_fpr, train_tpr), metrics.auc(test_fpr, test_tpr)

# Generate predictions
# The rows of gbc_result follow the row order of test_df_cleaned.
gbc_result = gbc.predict_proba(test_df_cleaned)[:, 1]