In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score

from sklearn.dummy import DummyClassifier

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# autocomplete fix because IPYTHON UGH
%config Completer.use_jedi = False

# DATA LOAD AND EXPLORATORY ANALYSIS

In [14]:
# DATA DELETION

    ## columns 12 and 31 have noise (most of them are NaN with 1 or 2 exceptions) I will delete these columns sa well since their data is noise.

    # as a matter of principle our train / test data should have the same amount of input features, thus, I will delete all columns not present in test.csv.
        ## exception, obviously I will keep the target value.

    # to NOT include -> payment_amount, payment_date, payment_status, balance_due, collection_status, compliance_detail
    # to NOT include 2 -> violation_zip_code : Data is all NaN.
    # to NOT include 3 -> violator_name : Data is all unique names, not relevant
    # to NOT include 4 -> since i'm deleting column 31, I will delete its associate column clean_up_cost. All of its values are 0.
    # to NOT include 5 -> late_fee, since it provides information about the feature.
    # to NOT include 6 -> 'mailing_address_str_number', 'mailing_address_str_name', . Sparse address information.
    # to NOT inlcude 7 -> ignoring city, state, zip code and country.
    # to NOT include 8 -> datetime columns -> they screw Gaussian NB

# DATA TRANSFORM
    # column 11 is zip dates, this shouldnt be touched at all. Set to string.
    # "violation_street_number" should be a string, but first i'll pass it as a string to get rid of annoying digits without having to format.
    
data = pd.read_csv("data/train.csv", dtype={"zip_code":str, "violation_street_number":int},
                   usecols=['ticket_id', 'agency_name', 'inspector_name', 'violation_street_number', 'violation_street_name',
                            'violation_code', 'violation_description', 
                            'disposition', 'fine_amount', 'admin_fee', 'state_fee',
                            'discount_amount', 'judgment_amount', 'compliance'])

# normalize the street names
data["violation_street_name"] = data["violation_street_name"].apply(lambda x: x.lower())

# create a column address based on street number and street name
data["address"] = data["violation_street_number"].astype(str) + " " + data["violation_street_name"]

data.head()

Unnamed: 0,ticket_id,agency_name,inspector_name,violation_street_number,violation_street_name,violation_code,violation_description,disposition,fine_amount,admin_fee,state_fee,discount_amount,judgment_amount,compliance,address
0,22056,"Buildings, Safety Engineering & Env Department","Sims, Martinzie",2900,tyler,9-1-36(a),Failure of owner to obtain certificate of comp...,Responsible by Default,250.0,20.0,10.0,0.0,305.0,0.0,2900 tyler
1,27586,"Buildings, Safety Engineering & Env Department","Williams, Darrin",4311,central,61-63.0600,Failed To Secure Permit For Lawful Use Of Buil...,Responsible by Determination,750.0,20.0,10.0,0.0,855.0,1.0,4311 central
2,22062,"Buildings, Safety Engineering & Env Department","Sims, Martinzie",1449,longfellow,9-1-36(a),Failure of owner to obtain certificate of comp...,Not responsible by Dismissal,250.0,0.0,0.0,0.0,0.0,,1449 longfellow
3,22084,"Buildings, Safety Engineering & Env Department","Sims, Martinzie",1441,longfellow,9-1-36(a),Failure of owner to obtain certificate of comp...,Not responsible by City Dismissal,250.0,0.0,0.0,0.0,0.0,,1441 longfellow
4,22093,"Buildings, Safety Engineering & Env Department","Sims, Martinzie",2449,churchill,9-1-36(a),Failure of owner to obtain certificate of comp...,Not responsible by Dismissal,250.0,0.0,0.0,0.0,0.0,,2449 churchill


In [15]:
# reading lat table
latlons = pd.read_csv("data/latlons.csv")

# dropping a few NA rows previously found
latlons = latlons.dropna()

In [16]:
# simplifying address to just street for generalization purposes
latlons["address"] = latlons["address"].str.extract("(.*),.*")

In [17]:
# creating general purpose dataframe
df = pd.merge(data, latlons, on="address", how="inner")

# eliminating duplicates created during merge
df = df.drop_duplicates(subset="ticket_id")

In [18]:
# creating mask to eliminate all nan values in compliance
not_na_mask = df["compliance"].notna()
df = df[not_na_mask]

In [19]:
# since we already have addresses, i will get rid of the constituting columns and the actual address. Lat Lon should be enough to cover the geo aspects.
# also getting rid of ticket_id, since it's nothing but unique numbers
df = df.drop("violation_street_number", axis=1)
df = df.drop("violation_street_name", axis=1)
df = df.drop("address", axis=1)
df = df.drop("ticket_id", axis=1)

# finally, i'll drop any remaining NAs.
df = df.dropna()

In [20]:
# splitting data 
X = df.drop("compliance", axis=1)
y = df["compliance"]

In [21]:
# creating transformation pipelina for all categorical columns
    ## list(X.select_dtypes("object").columns) gets me all the non numerical columns
ct = ColumnTransformer(
    [("encoder", OneHotEncoder(sparse=False, dtype="int"), list(X.select_dtypes("object").columns))], remainder="passthrough")

In [22]:
# saving array
X2 = ct.fit_transform(X)

In [23]:
# creating sets
X_train, X_test, y_train, y_test = train_test_split(X2, y, train_size=0.8)

In [24]:
# setting dummy classifier
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)

# determining baseline score with mf strategy
roc_auc_score(y_test, dummy.predict(X_test))

0.5

In [31]:
# setting initial classifiers, especially for LARGE DATA as per my notes.
    ## attempting NAIVE BAYES, RANDOM FOREST, GRADIENT BOOSTED DECISION TREES
clf1 = GaussianNB()
clf2 = RandomForestClassifier()

In [29]:
clf1.fit(X_train, y_train)
roc_auc_score(y_test, clf1.predict(X_test))

0.650626367497533

In [33]:
bayes_score = cross_val_score(clf1, X_train, y_train, scoring="roc_auc", n_jobs=-1, cv=7)
bayes_score

array([0.71958555, 0.7143432 , 0.71488412, 0.71583662, 0.69718345,
       0.71231893, 0.70497373])

In [37]:
forest_score = cross_val_score(clf2, X_train, y_train, scoring="roc_auc", n_jobs=-1, cv=3)
forest_score

array([0.77077845, 0.76233445, 0.77094631])

In [39]:
# testing gradient forest
clf3 = GradientBoostingClassifier()

In [40]:
gradient_score = cross_val_score(clf3, X_train, y_train, scoring="roc_auc", n_jobs=-1, cv=3)
gradient_score

array([0.80664496, 0.80156734, 0.8067035 ])