In [1]:
import numpy as np
import pandas as pd

# Load the raw train and test CSV files with identical parser settings
_CSV_OPTIONS = {'encoding': "ISO-8859-1", 'low_memory': False}
train = pd.read_csv('readonly/train.csv', **_CSV_OPTIONS)
test = pd.read_csv('readonly/test.csv', **_CSV_OPTIONS)

In [2]:
# Categorical features that will be one-hot encoded
onehot_columns = ['inspector_name', 'zip_code', 'disposition']

# Feature columns present in both dataframes: the categorical ones first,
# followed by the numeric ones
desired_columns = onehot_columns + ['fine_amount', 'admin_fee', 'state_fee', 'late_fee']

# The training frame additionally keeps the target column
train_columns = [*desired_columns, 'compliance']

# Grab the ticket ids before `test` is narrowed below;
# they will be used as the index of the final result
result_index = test.loc[:, 'ticket_id']

# Narrow both frames down to the columns of interest
train = train.loc[:, train_columns]
test = test.loc[:, desired_columns]

In [3]:
### Data cleaning

# Turn the literal string 'nan' into a real missing value, then drop every row
# that lacks a feature or the target. DataFrame.replace returns a NEW frame, so
# its result must be kept; calling it without assignment changes nothing.
train = train.replace('nan', np.nan).dropna(subset=desired_columns + ['compliance'])

# zip_code contained multiple zipcodes that were not corresponding to our
# requirements: keep only 5-character values that start with '48'.
train = train.loc[train['zip_code'].str.len() == 5]
# .copy() makes the filtered result an independent frame, so the column
# assignment below does not write into a slice (SettingWithCopyWarning).
train = train.loc[train['zip_code'].str.startswith('48')].copy()

# fine_amount contained multiple mis-entries of *10 or /10; normalise them.
# The order matters: first scale down values above 1000, then scale up values below 10.
fine_amount = train['fine_amount']
fine_amount = fine_amount.mask(fine_amount > 1000, fine_amount / 10)
fine_amount = fine_amount.mask(fine_amount < 10, fine_amount * 10)
train['fine_amount'] = fine_amount

train

Unnamed: 0,inspector_name,zip_code,disposition,fine_amount,admin_fee,state_fee,late_fee,compliance
1,"Williams, Darrin",48208,Responsible by Determination,750.0,20,10,75.0,1.0
6,"Williams, Darrin",48038,Responsible by Default,750.0,20,10,75.0,0.0
7,"Williams, Darrin",48211,Responsible by Default,100.0,20,10,10.0,0.0
8,"Williams, Darrin",48211,Responsible by Default,100.0,20,10,10.0,0.0
9,"Williams, Darrin",48205,Responsible by Default,750.0,20,10,75.0,0.0
...,...,...,...,...,...,...,...,...
250277,"McClain, Melvin",48227,Responsible by Default,500.0,20,10,50.0,0.0
250278,"Zizi, Josue",48235,Responsible by Default,200.0,20,10,20.0,0.0
250287,"Lusk, Gertrina",48227,Responsible by Default,1000.0,20,10,100.0,0.0
250288,"Bell, Maydell",48204,Responsible by Default,500.0,20,10,50.0,0.0


In [21]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# Feature frame (the shared columns) and the target column
X = train[desired_columns]
y_train = train['compliance']

# Column transformer: one-hot encode the categorical columns and pass every
# remaining column through untouched. Categories not seen during fit are
# ignored at transform time.
columns_tran = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), onehot_columns),
    remainder='passthrough',
)

# Fit and transform are kept as two explicit steps for clarity;
# fit_transform could have been used instead.
X_train = columns_tran.fit(X).transform(X)

X_train.todense()

matrix([[  0.,   0.,   0., ...,  20.,  10.,  75.],
        [  0.,   0.,   0., ...,  20.,  10.,  75.],
        [  0.,   0.,   0., ...,  20.,  10.,  10.],
        ...,
        [  0.,   0.,   0., ...,  20.,  10., 100.],
        [  0.,   0.,   0., ...,  20.,  10.,  50.],
        [  0.,   0.,   0., ...,  20.,  10.,   0.]])