In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, auc, roc_curve
import warnings
from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import SGDClassifier
#from sklearn.svm import SVC
#warnings.filterwarnings("ignore")

### Import Train and Test data sets

In [3]:
#train_data_path = '../data/train.csv'
train_data_path = 'train.csv'
train_df = pd.read_csv(train_data_path, encoding = 'ISO-8859-1')
#train_df.head()

#test_data_path = '../data/test.csv'
test_data_path = 'test.csv'
test_df = pd.read_csv(test_data_path, encoding = 'ISO-8859-1')
#test_df.head()

### Remove null compliance values for y_pred submission

In [4]:
good_rows = train_df.compliance.notnull()
train_clean = train_df[good_rows]

### Helper Functions to create new features

In [185]:
def datetime_split_month(cell):
    day_time = cell.split(" ")
    month = day_time[0]
    return month

def datetime_split_time(cell):
    day_time = cell.split(" ")
    time = day_time[1]
    return time

def weekday_end(x):
    if x < 5:
        return 0
    else:
        return 1

def bucketize_zipcodes(x):
    rate = compliance_by_zip[x]
    if rate == 0.:
        return 0
    elif ((rate > 0.) & (rate <= 0.08)):
        return 1
    elif ((rate > 0.075) & (rate <= .2)):
        return 2
    elif ((rate > .2) & (rate <= 0.5)):
        return 3
    else:
        return 4

def to_categorical(df, column):
    df[column] = df[column].astype('category').cat.codes
    df[column] = df[column].astype('category')
    
def bucketize_test_zipcode(code):
    if code not in zip_bucket_dict.keys():
        value = 0
    else:
        value = zip_bucket_dict[code] 
    return value

def compliance_impact(group_name, df):
    compliance_by_ = {}
    for g,f in df.groupby(group_name):
        yes = sum(f.compliance)
        no = f.shape[0] - sum(f.compliance)
        rate = float(yes)/ f.shape[0]
        overall_rate = yes / total_compliant
        compliance_by_[g] = rate#,overall_rate
#        print('Fine Amount {}'.format(g))
#        print('Count {}'.format(f.shape[0]))
#        print('Compliant : Yes {}, No {}'.format(yes,no))
#        print('Compliance Rate {}'z.format(rate))
#        print('Overall Compliant Contribution {}'.format(overall_rate))
#        print()
    return compliance_by_

def good_samaritans(name):
    if name not in people_who_pay.keys():
        return 0
    else:
        payrate = people_who_pay[name]
        if payrate < 0.8:
            return 1
        elif 0.8 < payrate <= .2:
            return 2
        elif 0.2 < payrate <= .3:
            return 3
        elif 0.3 < payrate <= .4:
            return 4
        elif 0.4 < payrate <= .5:
            return 5
        elif 0.5 < payrate <= .6:
            return 6
        elif 0.6 < payrate <= .7:
            return 7
        elif 0.7 < payrate <= .8:
            return 8
        elif 0.8 < payrate <= .9:
            return 9
        else:
            return 10

def discount_given(x):
    if x > 0:
        return 1
    else:
        return 0
    
def disposition_groups_a(name):
    if name not in disposition_dict.keys():
        return 0
    else:
        rate = disposition_dict[name]
        if rate <= .04:
            return 0
        elif.04 < rate <= .28:
            return 1
        elif.28 < rate <= .31:
            return 2
        else:
            return 3
    
def disposition_groups_b(name):
    if name not in disposition_dict.keys():
        return 0
    else:
        rate = disposition_dict[name]
        if rate <= .04:
            return 0
        elif 0.4 < rate <= .31:
            return 1
        else:
            return 2

### Create New Features for Analysis

#### Create Month, Time and Weekday Features - TRAIN set

In [6]:
train_clean['ticket_issued_month'] = train_clean.ticket_issued_date.apply(datetime_split_month)
train_clean['ticket_issued_time'] = train_clean.ticket_issued_date.apply(datetime_split_time)
train_clean['month'] = pd.to_datetime(train_clean['ticket_issued_month']).dt.month
train_clean['day'] = pd.to_datetime(train_clean['ticket_issued_date']).dt.dayofweek
train_clean['time'] = pd.to_datetime(train_clean['ticket_issued_time']).dt.hour
train_clean['weekday'] = train_clean.day.apply(weekday_end)

#### Create Month, Time and Weekday Features - TEST set

In [7]:
test_df['ticket_issued_month'] = test_df.ticket_issued_date.apply(datetime_split_month)
test_df['ticket_issued_time'] = test_df.ticket_issued_date.apply(datetime_split_time)
test_df['month'] = pd.to_datetime(test_df['ticket_issued_month']).dt.month
test_df['day'] = pd.to_datetime(test_df['ticket_issued_date']).dt.dayofweek
test_df['time'] = pd.to_datetime(test_df['ticket_issued_time']).dt.hour
test_df['weekday'] = test_df.day.apply(weekday_end)

#### Create Zipcode Buckets Feature

In [8]:
# impute 0 as zipcode on missing value
train_clean.zip_code[train_clean.zip_code.isnull()] = 0
total_compliant = train_clean.compliance.value_counts()[1]
compliance_by_zip = compliance_impact('zip_code', train_clean)
train_clean['zip_buckets'] = train_clean.zip_code.apply(bucketize_zipcodes)


# create dictionary to hold zipcode buckets
test_df.zip_code[test_df.zip_code.isnull()] = 0
zip_bucket_dict = {}
for code, bucket in zip(train_clean.zip_code, train_clean.zip_buckets):
    zip_bucket_dict[code] = bucket
test_df['zip_buckets'] = test_df.zip_code.apply(bucketize_test_zipcode)

#### Create TimeDiff Feature which is Difference between Ticket Issue Date and Hearing Date

In [9]:
train_clean['hearing_date'] = pd.to_datetime(train_clean.hearing_date)
test_df['hearing_date'] = pd.to_datetime(test_df.hearing_date)
train_clean['ticket_issued_date'] = pd.to_datetime(train_clean.ticket_issued_date)
test_df['ticket_issued_date'] = pd.to_datetime(test_df.ticket_issued_date)

In [10]:
train_clean['time_diff'] = train_clean['hearing_date'].dt.date - train_clean['ticket_issued_date'].dt.date
train_clean['time_diff'] = pd.to_numeric(train_clean['time_diff'])

In [11]:
test_df['time_diff'] = test_df['hearing_date'].dt.date - test_df['ticket_issued_date'].dt.date
test_df['time_diff'] = pd.to_numeric(test_df['time_diff'])

### Feature Exploration - Violator Name

In [27]:
len(train_clean['violator_name'][train_clean['compliance'] == 1].unique())

11597

In [28]:
len(train_clean['violator_name'][train_clean['compliance'] == 1].unique())

9613

In [33]:
people_who_pay = {}
for name,frame in train_clean.groupby('violator_name'):
    if sum(frame['compliance']) > 1:
        count = len(frame['compliance'])
        paid = sum(frame['compliance'])
        rate = paid/count
        people_who_pay[name] = rate

In [51]:
train_clean['compliance_rate'] = train_clean.violator_name.apply(good_samaritans)

In [54]:
test_df['compliance_rate'] = test_df.violator_name.apply(good_samaritans)

### Feature Exploration - Discount

In [77]:
# if a discount is given, people pay!
len(train_clean.compliance[train_clean.discount_amount > 0]) / sum(train_clean.discount_amount > 0)

1.0

In [80]:
train_clean['discount'] = train_clean.discount_amount.apply(discount_given)

In [81]:
test_df['discount'] = test_df.discount_amount.apply(discount_given)

### Feature Exploration - Disposition

In [129]:
train_clean.disposition.value_counts()

Responsible by Default                138340
Responsible by Admission               13701
Responsible by Determination            7644
Responsible (Fine Waived) by Deter       195
Name: disposition, dtype: int64

In [125]:
test_df.disposition.value_counts()

Responsible by Default                51602
Responsible by Admission               4484
Responsible by Determination           4124
Responsible (Fine Waived) by Deter      781
Responsible - Compl/Adj by Default        6
Responsible - Compl/Adj by Determi        2
Responsible by Dismissal                  1
Responsible (Fine Waived) by Admis        1
Name: disposition, dtype: int64

In [170]:
disposition_dict = {}
for g,f in train_clean.groupby('disposition'):
#    temp = sum(f['compliance']), len(f['compliance']),sum(f['compliance'])/len(f['compliance'])
    temp = sum(f['compliance']/len(f['compliance']))
    disposition_dict[g] = sum(f['compliance']/len(f['compliance']))

In [187]:
disposition_dict

{'Responsible (Fine Waived) by Deter': 0.99999999999999756,
 'Responsible by Admission': 0.27501642215899197,
 'Responsible by Default': 0.038202978169725989,
 'Responsible by Determination': 0.30729984301411917,
 'other': 0.0}

In [175]:
train_clean['disposition_groups'] = train_clean.disposition.apply(disposition_groups_a)
train_clean.disposition_groups[:10]
#to_categorical(train_clean, 'disposition_groups')

0     0
1     2
5     0
6     0
7     0
8     0
9     0
12    0
13    0
14    0
Name: disposition_groups, dtype: int64

In [184]:
test_df['disposition_groups'] = test_df['disposition'].apply(disposition_groups_a)

### Convert Features to Categorical Values

In [188]:
to_categorical(train_clean,'agency_name')
to_categorical(test_df,'agency_name')

to_categorical(train_clean,'month')
to_categorical(test_df,'month')

to_categorical(train_clean,'time')
to_categorical(test_df,'time')

to_categorical(train_clean,'day')
to_categorical(test_df,'day')

to_categorical(train_clean,'compliance_rate')
to_categorical(test_df,'compliance_rate')

to_categorical(train_clean,'discount')
to_categorical(test_df,'discount')

to_categorical(train_clean, 'disposition_groups')
to_categorical(test_df,'disposition_groups')

### Select Features for Analysis in Model

In [189]:
model_cols = ['fine_amount', 
              'month', 
              'time',
              'zip_buckets', 
              'agency_name', 
              'time_diff', 
              'compliance_rate',
              'discount',
              'disposition_groups']

### Create Train/Test Splits

In [190]:
X = train_clean[model_cols]
y = train_clean['compliance']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

### Instantiate and Fit Model and Calculate AUC

In [191]:
#model_list = [GradientBoostingClassifier(random_state = 0),
#              RandomForestClassifier(random_state = 0)]
#              SGDClassifier(random_state = 0),
#              SVC()]
#for model in model_list:
m = GradientBoostingClassifier(random_state = 0)
y_score_m = m.fit(X_train, y_train).decision_function(X_test)
fpr_m, tpr_m, _ = roc_curve(y_test, y_score_m)
auc(fpr_m, tpr_m)

0.87015480691633051

### Fit Model with Entire TRAIN set data and Predict Test Labels

In [192]:
m.fit(X, y)
test_features = test_df[model_cols]
y_pred_proba = m.predict_proba(test_features)

In [88]:
#m = RandomForestClassifier(random_state = 0, )
#m.fit(X, y)
#test_features = test_df[model_cols]
#y_pred_proba = m.predict_proba(test_features)

### Run predict_proba for submission

In [193]:
def blight_model():    
    y_pred = []
    for pred in y_pred_proba:
        y_pred.append(pred[1])
    y_pred = pd.Series(y_pred)
    y_pred.index = test_df.ticket_id
#    print(sum([1 for each in y_pred if each > .5]))
    return y_pred
blight_model()

2021


ticket_id
284932    0.040430
285362    0.013168
285361    0.047942
285338    0.039979
285346    0.048314
285345    0.039979
285347    0.048314
285342    0.708788
285530    0.013290
284989    0.003951
285344    0.047942
285343    0.013168
285340    0.023141
285341    0.077805
285349    0.077805
285348    0.069803
284991    0.003951
285532    0.018433
285406    0.018433
285001    0.026142
285006    0.021525
285405    0.002811
285337    0.018433
285496    0.007768
285497    0.006799
285378    0.013290
285589    0.003951
285585    0.039669
285501    0.048314
285581    0.013290
            ...   
376367    0.012975
376366    0.018616
376362    0.221004
376363    0.245773
376365    0.012975
376364    0.018616
376228    0.032603
376265    0.004259
376286    0.098477
376320    0.019201
376314    0.018616
376327    0.138779
376385    0.135051
376435    0.151556
376370    0.307566
376434    0.041290
376459    0.066307
376478    0.020944
376473    0.032603
376484    0.013095
376482    0.002973
37