In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, auc, roc_curve
import warnings
from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import SGDClassifier
#from sklearn.svm import SVC
#warnings.filterwarnings("ignore")

### Import Train and Test data sets

In [3]:
#train_data_path = '../data/train.csv'
train_data_path = 'train.csv'
train_df = pd.read_csv(train_data_path, encoding = 'ISO-8859-1')
#train_df.head()

#test_data_path = '../data/test.csv'
test_data_path = 'test.csv'
test_df = pd.read_csv(test_data_path, encoding = 'ISO-8859-1')
#test_df.head()

### Remove null compliance values for y_pred submission

In [4]:
good_rows = train_df.compliance.notnull()
train_clean = train_df[good_rows]

### Helper Functions to create new features

In [185]:
def datetime_split_month(cell):
    day_time = cell.split(" ")
    month = day_time[0]
    return month

def datetime_split_time(cell):
    day_time = cell.split(" ")
    time = day_time[1]
    return time

def weekday_end(x):
    if x < 5:
        return 0
    else:
        return 1

def bucketize_zipcodes(x):
    rate = compliance_by_zip[x]
    if rate == 0.:
        return 0
    elif ((rate > 0.) & (rate <= 0.08)):
        return 1
    elif ((rate > 0.075) & (rate <= .2)):
        return 2
    elif ((rate > .2) & (rate <= 0.5)):
        return 3
    else:
        return 4

def to_categorical(df, column):
    df[column] = df[column].astype('category').cat.codes
    df[column] = df[column].astype('category')
    
def bucketize_test_zipcode(code):
    if code not in zip_bucket_dict.keys():
        value = 0
    else:
        value = zip_bucket_dict[code] 
    return value

def compliance_impact(group_name, df):
    compliance_by_ = {}
    for g,f in df.groupby(group_name):
        yes = sum(f.compliance)
        no = f.shape[0] - sum(f.compliance)
        rate = float(yes)/ f.shape[0]
        overall_rate = yes / total_compliant
        compliance_by_[g] = rate#,overall_rate
#        print('Fine Amount {}'.format(g))
#        print('Count {}'.format(f.shape[0]))
#        print('Compliant : Yes {}, No {}'.format(yes,no))
#        print('Compliance Rate {}'z.format(rate))
#        print('Overall Compliant Contribution {}'.format(overall_rate))
#        print()
    return compliance_by_

def good_samaritans(name):
    if name not in people_who_pay.keys():
        return 0
    else:
        payrate = people_who_pay[name]
        if payrate < 0.8:
            return 1
        elif 0.8 < payrate <= .2:
            return 2
        elif 0.2 < payrate <= .3:
            return 3
        elif 0.3 < payrate <= .4:
            return 4
        elif 0.4 < payrate <= .5:
            return 5
        elif 0.5 < payrate <= .6:
            return 6
        elif 0.6 < payrate <= .7:
            return 7
        elif 0.7 < payrate <= .8:
            return 8
        elif 0.8 < payrate <= .9:
            return 9
        else:
            return 10

def discount_given(x):
    if x > 0:
        return 1
    else:
        return 0
    
def disposition_groups_a(name):
    if name not in disposition_dict.keys():
        return 0
    else:
        rate = disposition_dict[name]
        if rate <= .04:
            return 0
        elif.04 < rate <= .28:
            return 1
        elif.28 < rate <= .31:
            return 2
        else:
            return 3
    
def disposition_groups_b(name):
    if name not in disposition_dict.keys():
        return 0
    else:
        rate = disposition_dict[name]
        if rate <= .04:
            return 0
        elif 0.4 < rate <= .31:
            return 1
        else:
            return 2

### Create New Features for Analysis

#### Create Month, Time and Weekday Features - TRAIN set

In [6]:
train_clean['ticket_issued_month'] = train_clean.ticket_issued_date.apply(datetime_split_month)
train_clean['ticket_issued_time'] = train_clean.ticket_issued_date.apply(datetime_split_time)
train_clean['month'] = pd.to_datetime(train_clean['ticket_issued_month']).dt.month
train_clean['day'] = pd.to_datetime(train_clean['ticket_issued_date']).dt.dayofweek
train_clean['time'] = pd.to_datetime(train_clean['ticket_issued_time']).dt.hour
train_clean['weekday'] = train_clean.day.apply(weekday_end)

#### Create Month, Time and Weekday Features - TEST set

In [7]:
test_df['ticket_issued_month'] = test_df.ticket_issued_date.apply(datetime_split_month)
test_df['ticket_issued_time'] = test_df.ticket_issued_date.apply(datetime_split_time)
test_df['month'] = pd.to_datetime(test_df['ticket_issued_month']).dt.month
test_df['day'] = pd.to_datetime(test_df['ticket_issued_date']).dt.dayofweek
test_df['time'] = pd.to_datetime(test_df['ticket_issued_time']).dt.hour
test_df['weekday'] = test_df.day.apply(weekday_end)

#### Create Zipcode Buckets Feature

In [8]:
# impute 0 as zipcode on missing value
train_clean.zip_code[train_clean.zip_code.isnull()] = 0
total_compliant = train_clean.compliance.value_counts()[1]
compliance_by_zip = compliance_impact('zip_code', train_clean)
train_clean['zip_buckets'] = train_clean.zip_code.apply(bucketize_zipcodes)


# create dictionary to hold zipcode buckets
test_df.zip_code[test_df.zip_code.isnull()] = 0
zip_bucket_dict = {}
for code, bucket in zip(train_clean.zip_code, train_clean.zip_buckets):
    zip_bucket_dict[code] = bucket
test_df['zip_buckets'] = test_df.zip_code.apply(bucketize_test_zipcode)

#### Create TimeDiff Feature which is Difference between Ticket Issue Date and Hearing Date

In [9]:
train_clean['hearing_date'] = pd.to_datetime(train_clean.hearing_date)
test_df['hearing_date'] = pd.to_datetime(test_df.hearing_date)
train_clean['ticket_issued_date'] = pd.to_datetime(train_clean.ticket_issued_date)
test_df['ticket_issued_date'] = pd.to_datetime(test_df.ticket_issued_date)

In [10]:
train_clean['time_diff'] = train_clean['hearing_date'].dt.date - train_clean['ticket_issued_date'].dt.date
train_clean['time_diff'] = pd.to_numeric(train_clean['time_diff'])

In [11]:
test_df['time_diff'] = test_df['hearing_date'].dt.date - test_df['ticket_issued_date'].dt.date
test_df['time_diff'] = pd.to_numeric(test_df['time_diff'])

### Feature Exploration - Violator Name

In [27]:
len(train_clean['violator_name'][train_clean['compliance'] == 1].unique())

11597

In [28]:
len(train_clean['violator_name'][train_clean['compliance'] == 1].unique())

9613

In [33]:
people_who_pay = {}
for name,frame in train_clean.groupby('violator_name'):
    if sum(frame['compliance']) > 1:
        count = len(frame['compliance'])
        paid = sum(frame['compliance'])
        rate = paid/count
        people_who_pay[name] = rate

In [213]:
train_clean['violator_compliance_rate'] = train_clean.violator_name.apply(good_samaritans)

In [214]:
test_df['violator_compliance_rate'] = test_df.violator_name.apply(good_samaritans)

### Feature Exploration - Discount

In [215]:
# if a discount is given, people pay!
len(train_clean.compliance[train_clean.discount_amount > 0]) / sum(train_clean.discount_amount > 0)

1.0

In [216]:
train_clean['discount'] = train_clean.discount_amount.apply(discount_given)

In [217]:
test_df['discount'] = test_df.discount_amount.apply(discount_given)

### Feature Exploration - Disposition

In [218]:
train_clean.disposition.value_counts()

Responsible by Default                138340
Responsible by Admission               13701
Responsible by Determination            7644
Responsible (Fine Waived) by Deter       195
Name: disposition, dtype: int64

In [219]:
test_df.disposition.value_counts()

Responsible by Default                51602
Responsible by Admission               4484
Responsible by Determination           4124
Responsible (Fine Waived) by Deter      781
Responsible - Compl/Adj by Default        6
Responsible - Compl/Adj by Determi        2
Responsible by Dismissal                  1
Responsible (Fine Waived) by Admis        1
Name: disposition, dtype: int64

In [220]:
disposition_dict = {}
for g,f in train_clean.groupby('disposition'):
#    temp = sum(f['compliance']), len(f['compliance']),sum(f['compliance'])/len(f['compliance'])
    temp = sum(f['compliance']/len(f['compliance']))
    disposition_dict[g] = sum(f['compliance']/len(f['compliance']))

In [221]:
disposition_dict

{'Responsible (Fine Waived) by Deter': 0.99999999999999756,
 'Responsible by Admission': 0.27501642215899197,
 'Responsible by Default': 0.038202978169725989,
 'Responsible by Determination': 0.30729984301411917}

In [304]:
train_clean['disposition_groups'] = train_clean.disposition.apply(disposition_groups_b)
#train_clean.disposition_groups[:10]
#to_categorical(train_clean, 'disposition_groups')

In [305]:
test_df['disposition_groups'] = test_df['disposition'].apply(disposition_groups_b)

### Feature Exploration - Violation Description

In [306]:
violation_descript_compliance = {}
for g,f in train_clean.groupby('violation_description'):
    total = f.shape[0]
    compliant = sum(f['compliance'])
    rate = compliant / total
    violation_descript_compliance[g] = [compliant, total, rate]

In [307]:
helpful_descriptions = {}
for k,v in violation_descript_compliance.items():
    if v[2] > 0.1:
        if v[1] > 100:
#            print(k,v)
#            print()
            helpful_descriptions[k] = v[2]

In [308]:
len(helpful_descriptions)

12

In [309]:
for k,v in helpful_descriptions.items():
    print(v)

0.101433296582
0.188976377953
0.230769230769
0.12030075188
0.1015625
0.130434782609
0.26079447323
0.132424537488
0.122974261201
0.166287015945
0.146156758803
0.131904761905


In [310]:
def viol_descr_groups(description):
    if name not in helpful_descriptions.keys():
        return 0
    else:
        rate = helpful_descriptions[description]
        if rate <= 0.15:
            return 1
        elif  .15 < rate <= .2:
            return 2
        elif .2 < rate:
            return 3

In [311]:
violation_descript_compliance_test = []
for g,f in test_df.groupby('violation_description'):
    total = f.shape[0]
    if g in helpful_descriptions:
        print(g)
        violation_descript_compliance_test.append(g)

Bulk solid waste deposited more than 24 hours before designated time
Failing to secure City or Private solid waste collection containers and services
Failure of owner to remove graffiti or maintain or restore property free of graffiti.
Failure to maintain accessory structure(s) one-or two- family dwelling or commercial building
Failure to maintain exterior of one- or two-family dwelling, building, premises or commercial structure in good repair, structurally sound or in a sanitary condition to prevent threat to the public health, safety or welfare
Failure to remove animal waste on public/private property
Failure to secure City or Private solid waste collection containers and services
Improper placement of Courville container between collections
Inoperable motor vehicle(s) one- or two-family dwelling or commercial building
Open Storage/ Residential/ Inoperable Vehicles (R1)
Removal of snow and ice from sidewalks
Violation of time limit for approved containers to remain at curbside - ear

In [312]:
len(violation_descript_compliance_test)

12

In [313]:
train_clean['description_group'] = train_clean['violation_description'].apply(viol_descr_groups)
test_df['description_group'] = test_df['violation_description'].apply(viol_descr_groups)

### Convert Features to Categorical Values

In [314]:
to_categorical(train_clean,'agency_name')
to_categorical(test_df,'agency_name')

to_categorical(train_clean,'month')
to_categorical(test_df,'month')

to_categorical(train_clean,'time')
to_categorical(test_df,'time')

to_categorical(train_clean,'weekday')
to_categorical(test_df,'weekday')

to_categorical(train_clean,'violator_compliance_rate')
to_categorical(test_df,'violator_compliance_rate')

to_categorical(train_clean,'discount')
to_categorical(test_df,'discount')

to_categorical(train_clean, 'disposition_groups')
to_categorical(test_df,'disposition_groups')

to_categorical(train_clean, 'description_group')
to_categorical(test_df,'description_group')

### Select Features for Analysis in Model

In [315]:
model_cols = ['fine_amount', 
              'month', 
              'time',
              'zip_buckets', 
              'agency_name',
              'weekday',
              'time_diff', 
              'violator_compliance_rate',
              'discount',
              'disposition_groups']
#              'description_group']

### Create Train/Test Splits

In [316]:
X = train_clean[model_cols]
y = train_clean['compliance']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

### Instantiate and Fit Model and Calculate AUC

In [317]:
# 0.87015480691633051 - a 
# 0.87023936847423511 - b
#model_list = [GradientBoostinagClassifier(random_state = 0),
#              RandomForestClassifier(random_state = 0)]
#              SGDClassifier(random_state = 0),
#              SVC()]
#for model in model_list:
m = GradientBoostingClassifier(random_state = 0)
y_score_m = m.fit(X_train, y_train).decision_function(X_test)
fpr_m, tpr_m, _ = roc_curve(y_test, y_score_m)
auc(fpr_m, tpr_m)

0.87023936847423511

### Fit Model with Entire TRAIN set data and Predict Test Labels

In [318]:
m.fit(X, y)
test_features = test_df[model_cols]
y_pred_proba = m.predict_proba(test_features)

In [319]:
#m = RandomForestClassifier(random_state = 0, )
#m.fit(X, y)
#test_features = test_df[model_cols]
#y_pred_proba = m.predict_proba(test_features)

### Run predict_proba for submission

In [320]:
def blight_model():    
    y_pred = []
    for pred in y_pred_proba:
        y_pred.append(pred[1])
    y_pred = pd.Series(y_pred)
    y_pred.index = test_df.ticket_id
    print(sum([1 for each in y_pred if each > .5]))
    return y_pred
blight_model()

2071


ticket_id
284932    0.042370
285362    0.012356
285361    0.051552
285338    0.042036
285346    0.053238
285345    0.042036
285347    0.053238
285342    0.519342
285530    0.012356
284989    0.004115
285344    0.051552
285343    0.012356
285340    0.023020
285341    0.089090
285349    0.089090
285348    0.077451
284991    0.004115
285532    0.017632
285406    0.017632
285001    0.026561
285006    0.019958
285405    0.002983
285337    0.017632
285496    0.006359
285497    0.005141
285378    0.012356
285589    0.004081
285585    0.040690
285501    0.053238
285581    0.012356
            ...   
376367    0.012798
376366    0.018855
376362    0.164588
376363    0.171878
376365    0.012798
376364    0.018855
376228    0.033962
376265    0.004458
376286    0.100823
376320    0.019493
376314    0.018855
376327    0.169311
376385    0.164588
376435    0.134819
376370    0.226102
376434    0.042347
376459    0.072427
376478    0.020145
376473    0.033962
376484    0.012903
376482    0.003168
37