In [21]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, auc, roc_curve
import warnings
from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import SGDClassifier
#from sklearn.svm import SVC
warnings.filterwarnings("ignore")

### Import Train and Test data sets

In [4]:
#train_data_path = '../data/train.csv'
train_data_path = 'train.csv'
train_df = pd.read_csv(train_data_path, encoding = 'ISO-8859-1')
#train_df.head()

#test_data_path = '../data/test.csv'
test_data_path = 'test.csv'
test_df = pd.read_csv(test_data_path, encoding = 'ISO-8859-1')
#test_df.head()

### Remove null compliance values for y_pred submission

In [5]:
good_rows = train_df.compliance.notnull()
train_clean = train_df[good_rows]

### Helper Functions to create new features

In [6]:
def datetime_split_month(cell):
    day_time = cell.split(" ")
    month = day_time[0]
    return month

def datetime_split_time(cell):
    day_time = cell.split(" ")
    time = day_time[1]
    return time

def weekday_end(x):
    if x < 5:
        return 0
    else:
        return 1

def bucketize_zipcodes(x):
    rate = compliance_by_zip[x]
    if rate == 0.:
        return 0
    elif ((rate > 0.) & (rate <= 0.08)):
        return 1
    elif ((rate > 0.075) & (rate <= .2)):
        return 2
    elif ((rate > .2) & (rate <= 0.5)):
        return 3
    else:
        return 4

def to_categorical(df, column):
    df[column] = df[column].astype('category').cat.codes
    df[column] = df[column].astype('category')
    
def bucketize_test_zipcode(code):
    if code not in zip_bucket_dict.keys():
        value = 0
    else:
        value = zip_bucket_dict[code] 
    return value

def compliance_impact(group_name, df):
    compliance_by_ = {}
    for g,f in df.groupby(group_name):
        yes = sum(f.compliance)
        no = f.shape[0] - sum(f.compliance)
        rate = float(yes)/ f.shape[0]
        overall_rate = yes / total_compliant
        compliance_by_[g] = rate#,overall_rate
#        print('Fine Amount {}'.format(g))
#        print('Count {}'.format(f.shape[0]))
#        print('Compliant : Yes {}, No {}'.format(yes,no))
#        print('Compliance Rate {}'z.format(rate))
#        print('Overall Compliant Contribution {}'.format(overall_rate))
#        print()
    return compliance_by_

### Create New Features for Analysis

#### Create Month, Time and Weekday Features - TRAIN set

In [7]:
train_clean['ticket_issued_month'] = train_clean.ticket_issued_date.apply(datetime_split_month)
train_clean['ticket_issued_time'] = train_clean.ticket_issued_date.apply(datetime_split_time)
train_clean['month'] = pd.to_datetime(train_clean['ticket_issued_month']).dt.month
train_clean['day'] = pd.to_datetime(train_clean['ticket_issued_date']).dt.dayofweek
train_clean['time'] = pd.to_datetime(train_clean['ticket_issued_time']).dt.hour
train_clean['weekday'] = train_clean.day.apply(weekday_end)

#### Create Month, Time and Weekday Features - TEST set

In [8]:
test_df['ticket_issued_month'] = test_df.ticket_issued_date.apply(datetime_split_month)
test_df['ticket_issued_time'] = test_df.ticket_issued_date.apply(datetime_split_time)
test_df['month'] = pd.to_datetime(test_df['ticket_issued_month']).dt.month
test_df['day'] = pd.to_datetime(test_df['ticket_issued_date']).dt.dayofweek
test_df['time'] = pd.to_datetime(test_df['ticket_issued_time']).dt.hour
test_df['weekday'] = test_df.day.apply(weekday_end)

#### Create Zipcode Buckets Feature

In [9]:
# impute 0 as zipcode on missing value
train_clean.zip_code[train_clean.zip_code.isnull()] = 0
total_compliant = train_clean.compliance.value_counts()[1]
compliance_by_zip = compliance_impact('zip_code', train_clean)
train_clean['zip_buckets'] = train_clean.zip_code.apply(bucketize_zipcodes)


# create dictionary to hold zipcode buckets
test_df.zip_code[test_df.zip_code.isnull()] = 0
zip_bucket_dict = {}
for code, bucket in zip(train_clean.zip_code, train_clean.zip_buckets):
    zip_bucket_dict[code] = bucket
test_df['zip_buckets'] = test_df.zip_code.apply(bucketize_test_zipcode)

#### Create TimeDiff Feature which is Difference between Ticket Issue Date and Hearing Date

In [10]:
train_clean['hearing_date'] = pd.to_datetime(train_clean.hearing_date)
test_df['hearing_date'] = pd.to_datetime(test_df.hearing_date)
train_clean['ticket_issued_date'] = pd.to_datetime(train_clean.ticket_issued_date)
test_df['ticket_issued_date'] = pd.to_datetime(test_df.ticket_issued_date)

In [11]:
train_clean['time_diff'] = train_clean['hearing_date'].dt.date - train_clean['ticket_issued_date'].dt.date
train_clean['time_diff'] = pd.to_numeric(train_clean['time_diff'])

In [12]:
test_df['time_diff'] = test_df['hearing_date'].dt.date - test_df['ticket_issued_date'].dt.date
test_df['time_diff'] = pd.to_numeric(test_df['time_diff'])

### Convert Features to Categorical Values

In [13]:
to_categorical(train_clean,'agency_name')
to_categorical(test_df,'agency_name')

to_categorical(train_clean,'month')
to_categorical(test_df,'month')

to_categorical(train_clean,'time')
to_categorical(test_df,'time')

to_categorical(train_clean,'day')
to_categorical(test_df,'day')

### Select Features for Analysis in Model

In [14]:
model_cols = ['fine_amount', 'month', 'time', 'day','zip_buckets', 'agency_name', 'time_diff']

### Create Train/Test Splits

In [15]:
X = train_clean[model_cols]
y = train_clean['compliance']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

In [16]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159880 entries, 0 to 250293
Data columns (total 7 columns):
fine_amount    159880 non-null float64
month          159880 non-null category
time           159880 non-null category
day            159880 non-null category
zip_buckets    159880 non-null int64
agency_name    159880 non-null category
time_diff      159880 non-null int64
dtypes: category(4), float64(1), int64(2)
memory usage: 5.5 MB


### Instantiate and Fit Model and Calculate AUC

In [17]:
#model_list = [GradientBoostingClassifier(random_state = 0),
#              RandomForestClassifier(random_state = 0)]
#              SGDClassifier(random_state = 0),
#              SVC()]
#for model in model_list:
#m = GradientBoostingClassifier(random_state = 0)
#y_score_m = m.fit(X_train, y_train).decision_function(X_test)
#fpr_m, tpr_m, _ = roc_curve(y_test, y_score_m)
#auc(fpr_m, tpr_m)

0.71816302808098387

### Fit Model with Entire TRAIN set data and Predict Test Labels

In [18]:
#m.fit(X, y)
#test_features = test_df[model_cols]
#y_pred_proba = m.predict_proba(test_features)

In [22]:
m = RandomForestClassifier(random_state = 0, )
m.fit(X, y)
test_features = test_df[model_cols]
y_pred_proba = m.predict_proba(test_features)

### Run predict_proba for submission

In [23]:
def blight_model():    
    y_pred = []
    for pred in y_pred_proba:
        y_pred.append(pred[1])
    y_pred = pd.Series(y_pred)
    y_pred.index = test_df.ticket_id
#    print(sum([1 for each in y_pred if each > .5]))
    return y_pred
blight_model()

3878


ticket_id
284932    0.200000
285362    0.000000
285361    0.150000
285338    0.266667
285346    0.125000
285345    0.000000
285347    0.200000
285342    0.800000
285530    0.000000
284989    0.000000
285344    0.066667
285343    0.000000
285340    0.000000
285341    0.100000
285349    0.200000
285348    0.100000
284991    0.000000
285532    0.000000
285406    0.000000
285001    0.000000
285006    0.000000
285405    0.100000
285337    0.000000
285496    0.000000
285497    0.000000
285378    0.000000
285589    0.100000
285585    0.200000
285501    0.100000
285581    0.000000
            ...   
376367    0.300000
376366    0.000000
376362    0.000000
376363    0.000000
376365    0.300000
376364    0.000000
376228    0.540000
376265    0.000000
376286    0.200000
376320    0.000000
376314    0.000000
376327    0.056978
376385    0.000000
376435    0.000000
376370    0.514286
376434    0.200000
376459    0.000000
376478    0.000000
376473    0.200000
376484    0.050000
376482    0.050000
37