In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# DATA LOAD AND EXPLORATORY ANALYSIS

In [2]:
# DATA DELETION
#
# Only the columns listed in `usecols` below are loaded.
#   * Columns 12 and 31 are noise (almost entirely NaN, with one or two
#     exceptions), so they are left out as well.
#   * As a matter of principle, train and test data should have the same
#     input features, so every column not present in test.csv is left out.
#     The obvious exception is the target value, which is kept.
#       - NOT included: payment_amount, payment_date, payment_status,
#         balance_due, collection_status, compliance_detail
#       - NOT included: violation_zip_code (the data is all NaN)
#
# DATA TRANSFORM
#   * Column 11 holds the zip codes; they should not be altered at all, so
#     `zip_code` is read as a string.
#   * `violation_street_number` is read as an integer and only converted to
#     a string when the `address` column is built below.
data = pd.read_csv(
    "data/train.csv",
    dtype={"zip_code": str, "violation_street_number": int},
    usecols=[
        'ticket_id', 'agency_name', 'inspector_name', 'violator_name',
        'violation_street_number', 'violation_street_name',
        'mailing_address_str_number', 'mailing_address_str_name',
        'city', 'state', 'zip_code', 'country',
        'ticket_issued_date', 'hearing_date',
        'violation_code', 'violation_description', 'disposition',
        'fine_amount', 'admin_fee', 'state_fee', 'late_fee',
        'discount_amount', 'clean_up_cost', 'judgment_amount',
        'compliance',
    ],
)

# Normalize the street names to lower case. The vectorized `.str` accessor
# propagates missing values as NaN, whereas calling `x.lower()` element by
# element raises AttributeError as soon as one street name is NaN (a float).
data["violation_street_name"] = data["violation_street_name"].str.lower()

# Create an `address` column of the form "<street number> <street name>".
data["address"] = (
    data["violation_street_number"].astype(str)
    + " "
    + data["violation_street_name"]
)

data.head()

Unnamed: 0,ticket_id,agency_name,inspector_name,violator_name,violation_street_number,violation_street_name,mailing_address_str_number,mailing_address_str_name,city,state,...,disposition,fine_amount,admin_fee,state_fee,late_fee,discount_amount,clean_up_cost,judgment_amount,compliance,address
0,22056,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","INVESTMENT INC., MIDWEST MORTGAGE",2900,tyler,3.0,S. WICKER,CHICAGO,IL,...,Responsible by Default,250.0,20.0,10.0,25.0,0.0,0.0,305.0,0.0,2900 tyler
1,27586,"Buildings, Safety Engineering & Env Department","Williams, Darrin","Michigan, Covenant House",4311,central,2959.0,Martin Luther King,Detroit,MI,...,Responsible by Determination,750.0,20.0,10.0,75.0,0.0,0.0,855.0,1.0,4311 central
2,22062,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","SANDERS, DERRON",1449,longfellow,23658.0,P.O. BOX,DETROIT,MI,...,Not responsible by Dismissal,250.0,0.0,0.0,0.0,0.0,0.0,0.0,,1449 longfellow
3,22084,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","MOROSI, MIKE",1441,longfellow,5.0,ST. CLAIR,DETROIT,MI,...,Not responsible by City Dismissal,250.0,0.0,0.0,0.0,0.0,0.0,0.0,,1441 longfellow
4,22093,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","NATHANIEL, NEAL",2449,churchill,7449.0,CHURCHILL,DETROIT,MI,...,Not responsible by Dismissal,250.0,0.0,0.0,0.0,0.0,0.0,0.0,,2449 churchill


In [3]:
# Read the latitude/longitude table and, in the same step, drop the few rows
# containing NA values that were found previously.
latlons = pd.read_csv("data/latlons.csv").dropna()

In [4]:
# Simplify each address to just its street part (everything before the last
# comma) for generalization purposes. `expand=False` makes the single capture
# group come back as a Series rather than a one-column DataFrame.
street_part = latlons["address"].str.extract("(.*),.*", expand=False)

# An address that contains no comma does not match the pattern and would
# become NaN; keep its current value instead. As a consequence, re-running
# this cell on addresses that were already simplified (and so no longer
# contain a comma) leaves them intact instead of turning the column into NaN.
latlons["address"] = street_part.fillna(latlons["address"])
latlons["address"]

0         4300 rosa parks blvd
1                 14512 sussex
2                 3456 garland
3                 5787 wayburn
4               5766 haverhill
                  ...         
121764          14267 sorrento
121765            9359 vaughan
121766          5911 courville
121767           14545 wyoming
121768         9269 grandville
Name: address, Length: 121762, dtype: object

In [7]:
# Inner-join the tickets with the lat/lon table on `address` (tickets whose
# address has no match are dropped), then keep only the first row for each
# `ticket_id`.
df = (
    data
    .merge(latlons, on="address", how="inner")
    .drop_duplicates(subset="ticket_id")
)

In [8]:
# Preview the first five rows of the merged frame.
df.head(n=5)

Unnamed: 0,ticket_id,agency_name,inspector_name,violator_name,violation_street_number,violation_street_name,mailing_address_str_number,mailing_address_str_name,city,state,...,admin_fee,state_fee,late_fee,discount_amount,clean_up_cost,judgment_amount,compliance,address,lat,lon
0,22056,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","INVESTMENT INC., MIDWEST MORTGAGE",2900,tyler,3.0,S. WICKER,CHICAGO,IL,...,20.0,10.0,25.0,0.0,0.0,305.0,0.0,2900 tyler,42.390729,-83.124268
1,77242,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","INV. INC., MIDWEST MORTGAGE",2900,tyler,3.0,S. WACKER,CHICAGO,IL,...,20.0,10.0,50.0,0.0,0.0,580.0,0.0,2900 tyler,42.390729,-83.124268
2,77243,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","MORTGAGE CO., MIDWEST",2900,tyler,3.0,S. WACKER,CHICAGO,IL,...,20.0,10.0,25.0,0.0,0.0,305.0,0.0,2900 tyler,42.390729,-83.124268
3,103945,Department of Public Works,"Bennett, Margaret","INVESTMENT, INC., MIDWEST MORTGAGE",2900,tyler,3.0,S. WACKER DRIVE SUITE 1418,CHICAGO,IL,...,0.0,0.0,0.0,0.0,0.0,0.0,,2900 tyler,42.390729,-83.124268
4,138219,Department of Public Works,"Talbert, Reginald","INVESTMENT INC, MIDWEST MORTGAGE",2900,tyler,3.0,S WACKER #1418,CHICAGO,IL,...,20.0,10.0,10.0,0.0,0.0,140.0,0.0,2900 tyler,42.390729,-83.124268


In [10]:
# Boolean Series: True where `compliance` holds a value, False where it is NaN.
~df["compliance"].isna()

0          True
1          True
2          True
3         False
4          True
          ...  
269662    False
269663    False
269664    False
269665    False
269666    False
Name: compliance, Length: 250303, dtype: bool

In [17]:
# Build a boolean mask marking the rows that have a `compliance` value, and
# use it to eliminate every row where `compliance` is NaN.
not_na_mask = ~df["compliance"].isna()
df = df.loc[not_na_mask]

In [18]:
# Count the missing values remaining in each column.
df.isna().sum()

ticket_id                        0
agency_name                      0
inspector_name                   0
violator_name                   26
violation_street_number          0
violation_street_name            0
mailing_address_str_number    2558
mailing_address_str_name         3
city                             0
state                           84
zip_code                         1
country                          0
ticket_issued_date               0
hearing_date                   227
violation_code                   0
violation_description            0
disposition                      0
fine_amount                      0
admin_fee                        0
state_fee                        0
late_fee                         0
discount_amount                  0
clean_up_cost                    0
judgment_amount                  0
compliance                       0
address                          0
lat                              0
lon                              0
dtype: int64