In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, auc, roc_curve
import warnings
from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import SGDClassifier
#from sklearn.svm import SVC
#warnings.filterwarnings("ignore")

### Import Train and Test data sets

In [3]:
#train_data_path = '../data/train.csv'
train_data_path = 'train.csv'
train_df = pd.read_csv(train_data_path, encoding = 'ISO-8859-1')
#train_df.head()

#test_data_path = '../data/test.csv'
test_data_path = 'test.csv'
test_df = pd.read_csv(test_data_path, encoding = 'ISO-8859-1')
#test_df.head()

### Remove null compliance values for y_pred submission

In [4]:
good_rows = train_df.compliance.notnull()
train_clean = train_df[good_rows]

### Helper Functions to create new features

In [50]:
def datetime_split_month(cell):
    day_time = cell.split(" ")
    month = day_time[0]
    return month

def datetime_split_time(cell):
    day_time = cell.split(" ")
    time = day_time[1]
    return time

def weekday_end(x):
    if x < 5:
        return 0
    else:
        return 1

def bucketize_zipcodes(x):
    rate = compliance_by_zip[x]
    if rate == 0.:
        return 0
    elif ((rate > 0.) & (rate <= 0.08)):
        return 1
    elif ((rate > 0.075) & (rate <= .2)):
        return 2
    elif ((rate > .2) & (rate <= 0.5)):
        return 3
    else:
        return 4

def to_categorical(df, column):
    df[column] = df[column].astype('category').cat.codes
    df[column] = df[column].astype('category')
    
def bucketize_test_zipcode(code):
    if code not in zip_bucket_dict.keys():
        value = 0
    else:
        value = zip_bucket_dict[code] 
    return value

def compliance_impact(group_name, df):
    compliance_by_ = {}
    for g,f in df.groupby(group_name):
        yes = sum(f.compliance)
        no = f.shape[0] - sum(f.compliance)
        rate = float(yes)/ f.shape[0]
        overall_rate = yes / total_compliant
        compliance_by_[g] = rate#,overall_rate
#        print('Fine Amount {}'.format(g))
#        print('Count {}'.format(f.shape[0]))
#        print('Compliant : Yes {}, No {}'.format(yes,no))
#        print('Compliance Rate {}'z.format(rate))
#        print('Overall Compliant Contribution {}'.format(overall_rate))
#        print()
    return compliance_by_

def good_samaritans(name):
    if name not in people_who_pay.keys():
        return 0
    else:
        payrate = people_who_pay[name]
        if payrate < 0.8:
            return 1
        elif 0.8 < payrate <= .2:
            return 2
        elif 0.2 < payrate <= .3:
            return 3
        elif 0.3 < payrate <= .4:
            return 4
        elif 0.4 < payrate <= .5:
            return 5
        elif 0.5 < payrate <= .6:
            return 6
        elif 0.6 < payrate <= .7:
            return 7
        elif 0.7 < payrate <= .8:
            return 8
        elif 0.8 < payrate <= .9:
            return 9
        else:
            return 10

### Create New Features for Analysis

#### Create Month, Time and Weekday Features - TRAIN set

In [6]:
train_clean['ticket_issued_month'] = train_clean.ticket_issued_date.apply(datetime_split_month)
train_clean['ticket_issued_time'] = train_clean.ticket_issued_date.apply(datetime_split_time)
train_clean['month'] = pd.to_datetime(train_clean['ticket_issued_month']).dt.month
train_clean['day'] = pd.to_datetime(train_clean['ticket_issued_date']).dt.dayofweek
train_clean['time'] = pd.to_datetime(train_clean['ticket_issued_time']).dt.hour
train_clean['weekday'] = train_clean.day.apply(weekday_end)

#### Create Month, Time and Weekday Features - TEST set

In [7]:
test_df['ticket_issued_month'] = test_df.ticket_issued_date.apply(datetime_split_month)
test_df['ticket_issued_time'] = test_df.ticket_issued_date.apply(datetime_split_time)
test_df['month'] = pd.to_datetime(test_df['ticket_issued_month']).dt.month
test_df['day'] = pd.to_datetime(test_df['ticket_issued_date']).dt.dayofweek
test_df['time'] = pd.to_datetime(test_df['ticket_issued_time']).dt.hour
test_df['weekday'] = test_df.day.apply(weekday_end)

#### Create Zipcode Buckets Feature

In [8]:
# impute 0 as zipcode on missing value
train_clean.zip_code[train_clean.zip_code.isnull()] = 0
total_compliant = train_clean.compliance.value_counts()[1]
compliance_by_zip = compliance_impact('zip_code', train_clean)
train_clean['zip_buckets'] = train_clean.zip_code.apply(bucketize_zipcodes)


# create dictionary to hold zipcode buckets
test_df.zip_code[test_df.zip_code.isnull()] = 0
zip_bucket_dict = {}
for code, bucket in zip(train_clean.zip_code, train_clean.zip_buckets):
    zip_bucket_dict[code] = bucket
test_df['zip_buckets'] = test_df.zip_code.apply(bucketize_test_zipcode)

#### Create TimeDiff Feature which is Difference between Ticket Issue Date and Hearing Date

In [9]:
train_clean['hearing_date'] = pd.to_datetime(train_clean.hearing_date)
test_df['hearing_date'] = pd.to_datetime(test_df.hearing_date)
train_clean['ticket_issued_date'] = pd.to_datetime(train_clean.ticket_issued_date)
test_df['ticket_issued_date'] = pd.to_datetime(test_df.ticket_issued_date)

In [10]:
train_clean['time_diff'] = train_clean['hearing_date'].dt.date - train_clean['ticket_issued_date'].dt.date
train_clean['time_diff'] = pd.to_numeric(train_clean['time_diff'])

In [11]:
test_df['time_diff'] = test_df['hearing_date'].dt.date - test_df['ticket_issued_date'].dt.date
test_df['time_diff'] = pd.to_numeric(test_df['time_diff'])

### Feature Exploration - Violator Name

In [27]:
len(train_clean['violator_name'][train_clean['compliance'] == 1].unique())

11597

In [28]:
len(train_clean['violator_name'][train_clean['compliance'] == 1].unique())

9613

In [33]:
people_who_pay = {}
for name,frame in train_clean.groupby('violator_name'):
    if sum(frame['compliance']) > 1:
        count = len(frame['compliance'])
        paid = sum(frame['compliance'])
        rate = paid/count
        people_who_pay[name] = rate

In [51]:
train_clean['compliance_rate'] = train_clean.violator_name.apply(good_samaritans)

In [54]:
test_df['compliance_rate'] = test_df.violator_name.apply(good_samaritans)

### Feature Exploration - Violation Code

### Convert Features to Categorical Values

In [55]:
to_categorical(train_clean,'agency_name')
to_categorical(test_df,'agency_name')

to_categorical(train_clean,'month')
to_categorical(test_df,'month')

to_categorical(train_clean,'time')
to_categorical(test_df,'time')

to_categorical(train_clean,'day')
to_categorical(test_df,'day')

to_categorical(train_clean,'compliance_rate')
to_categorical(test_df,'compliance_rate')

### Select Features for Analysis in Model

In [56]:
model_cols = ['fine_amount', 'month', 'time', 'day','zip_buckets', 'agency_name', 'time_diff', 'compliance_rate']

### Create Train/Test Splits

In [57]:
X = train_clean[model_cols]
y = train_clean['compliance']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

### Instantiate and Fit Model and Calculate AUC

In [58]:
#model_list = [GradientBoostingClassifier(random_state = 0),
#              RandomForestClassifier(random_state = 0)]
#              SGDClassifier(random_state = 0),
#              SVC()]
#for model in model_list:
m = GradientBoostingClassifier(random_state = 0)
y_score_m = m.fit(X_train, y_train).decision_function(X_test)
fpr_m, tpr_m, _ = roc_curve(y_test, y_score_m)
auc(fpr_m, tpr_m)

0.79442860521988834

### Fit Model with Entire TRAIN set data and Predict Test Labels

In [59]:
m.fit(X, y)
test_features = test_df[model_cols]
y_pred_proba = m.predict_proba(test_features)

In [60]:
#m = RandomForestClassifier(random_state = 0, )
#m.fit(X, y)
#test_features = test_df[model_cols]
#y_pred_proba = m.predict_proba(test_features)

### Run predict_proba for submission

In [62]:
def blight_model():    
    y_pred = []
    for pred in y_pred_proba:
        y_pred.append(pred[1])
    y_pred = pd.Series(y_pred)
    y_pred.index = test_df.ticket_id
#    print(sum([1 for each in y_pred if each > .5]))
    return y_pred
blight_model()

2809


ticket_id
284932    0.087102
285362    0.024721
285361    0.088530
285338    0.079185
285346    0.096477
285345    0.079185
285347    0.089289
285342    0.353185
285530    0.024584
284989    0.006641
285344    0.081880
285343    0.024721
285340    0.053126
285341    0.158129
285349    0.169826
285348    0.152822
284991    0.006641
285532    0.039330
285406    0.039330
285001    0.048936
285006    0.041241
285405    0.003937
285337    0.039330
285496    0.008394
285497    0.007794
285378    0.024855
285589    0.006348
285585    0.072547
285501    0.096477
285581    0.024584
            ...   
376367    0.018582
376366    0.031341
376362    0.031341
376363    0.051671
376365    0.018582
376364    0.031341
376228    0.071077
376265    0.006950
376286    0.035054
376320    0.037292
376314    0.031341
376327    0.037292
376385    0.031341
376435    0.020304
376370    0.071077
376434    0.094902
376459    0.147210
376478    0.040519
376473    0.071077
376484    0.020304
376482    0.003319
37