In [30]:
from __future__ import division

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
%matplotlib inline
# from tqdm import tqdm

# Default figure size (width, height) applied to every plot in the notebook.
rcParams['figure.figsize'] = 20, 5
# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# seaborn look: "whitegrid" style with the "poster" scaling context.
sns.set_style("whitegrid")
sns.set_context("poster")

from scipy.sparse import csr_matrix
from zipcode_mapping import zipcode_mapping

In [31]:
# Read the pre-processed frame from disk; the trailing expression shows its
# (rows, columns) shape as a quick sanity check.
data_path = '../data/data_w_transformed_census_and_removed_invalid_rows_and_cols_and_fixed_zips_and_descs_and_dropped_latlongs.pkl'
df = pd.read_pickle(data_path)
df.shape

(744372, 37)

# Preprocessing

In [17]:
# Replace missing values in the two property columns with the literal level 'other'.
for col in ('Property_Type', 'Property_ID'):
    df[col] = df[col].fillna('other')

In [18]:
# Columns removed from the frame before modelling (order kept as originally listed).
cols_to_drop = [
    # case identifier and date columns
    'CASE_ENQUIRY_ID', 'OPEN_DT', 'TARGET_DT', 'CLOSED_DT',
    # case title / subject text
    'CASE_TITLE', 'SUBJECT',
    # location columns
    'neighborhood', 'LOCATION_STREET_NAME', 'LATITUDE', 'LONGITUDE',
    'tract_and_block_group',
    # resolution flag
    'is_issue_unresolved',
]

In [19]:
# Modelling frame: a copy of df without the columns in cols_to_drop (df itself is unchanged).
new_df = df.drop(cols_to_drop, axis='columns')

In [20]:
# Show the first row transposed, so each remaining column reads as one line.
new_df.iloc[:1].T

Unnamed: 0,905223
REASON,Enforcement & Abandoned Vehicles
TYPE,Parking Enforcement
Department,BTDT
SubmittedPhoto,False
Property_Type,Address
Property_ID,43071
Source,Citizens Connect App
race_white,0.550562
race_black,0.152118
race_asian,0


## Feature Engineering

## Dummifying

In [21]:
def dummify_cols_and_baselines(df, cols):
    """One-hot encode several columns, dropping one level per column as baseline.

    Each column named in ``cols`` is replaced by indicator columns named
    ``<column>_<level>``. The last level in ``pd.get_dummies`` column order is
    left out and recorded as that column's baseline. One progress line is
    printed per column: ``<baseline> is baseline <position> <total>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    cols : sequence of column labels
        Columns of ``df`` to encode, processed in the given order.

    Returns
    -------
    (pandas.DataFrame, list)
        The encoded frame (indicator columns appended on the right) and the
        baseline level of each column, in the order of ``cols``. A baseline is
        ``None`` when the column holds no non-missing value.

    Notes
    -----
    Missing values get no indicator column, so those rows are all-zero across
    the column's indicators, exactly like rows holding the baseline level.
    """
    baseline_cols = []
    total = len(cols)

    for i, column in enumerate(cols):
        dummy = pd.get_dummies(df[column])
        # Take the baseline from the indicator columns themselves instead of
        # from sorted(unique()): unique() counts NaN but get_dummies skips it,
        # so a count-based slice kept every level whenever values were missing
        # (and sorting mixed str/NaN values raises on Python 3).
        levels = list(dummy.columns)
        baseline = levels[-1] if levels else None
        # A single-argument print() behaves the same on Python 2 and Python 3.
        print('%s is baseline %s %s' % (baseline, i, total))
        baseline_cols.append(baseline)

        dummy = dummy.iloc[:, :-1].rename(
            columns=lambda level: '%s_%s' % (column, level)
        )
        # Rebind rather than drop in place so the caller's frame stays untouched.
        df = pd.concat([df.drop(column, axis=1), dummy], axis=1)

    return df, baseline_cols

In [25]:
def dummify(df, column):
    """One-hot encode a single column, leaving out one level as the baseline.

    ``column`` is replaced by indicator columns named ``<column>_<level>``.
    The last level in ``pd.get_dummies`` column order is omitted and announced
    on stdout as ``<level> is your baseline``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    column : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``column``, with the indicator columns appended.

    Notes
    -----
    Missing values get no indicator column, so those rows are all-zero across
    the indicators, exactly like rows holding the baseline level.
    """
    # from Darren's linear regression slides
    dummy = pd.get_dummies(df[column])
    # Read the baseline off the indicator columns: unique() counts NaN while
    # get_dummies skips it, so the former count-based slice dropped nothing
    # when the column had missing values (and sorting str/NaN fails on Python 3).
    levels = list(dummy.columns)
    baseline = levels[-1] if levels else None
    # A single-argument print() behaves the same on Python 2 and Python 3.
    print('{} is your baseline'.format(baseline))
    dummy = dummy.iloc[:, :-1].rename(columns=lambda level: column + '_' + str(level))
    # drop() returns a new frame, so the caller's df is left as it was.
    return pd.concat([df.drop(column, axis=1), dummy], axis=1)

In [32]:
# List each distinct education level once (first occurrence kept).
df['school'].drop_duplicates()

905223                 20_bachelors
905346                15_hs_diploma
905604    18_some_college_no_degree
905264                19_associates
905398                   21_masters
905369                13_11th_grade
905117     14_12th_grade_no_diploma
903543                  8_6th_grade
903909                       0_none
902859       22_professional_school
902313                 11_9th_grade
Name: school, dtype: object

In [28]:
# Try the single-column encoder on 'school'; the result is displayed, not stored.
dummify(df, column='school')

8_6th_grade is your baseline


Unnamed: 0,CASE_ENQUIRY_ID,OPEN_DT,TARGET_DT,CLOSED_DT,CASE_TITLE,SUBJECT,REASON,TYPE,Department,SubmittedPhoto,...,school_0_none,school_11_9th_grade,school_13_11th_grade,school_14_12th_grade_no_diploma,school_15_hs_diploma,school_18_some_college_no_degree,school_19_associates,school_20_bachelors,school_21_masters,school_22_professional_school
905223,101001983804,2017-01-08 12:00:16,2017-01-10 08:30:00,NaT,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT,False,...,0,0,0,0,0,0,0,1,0,0
905346,101001983803,2017-01-07 11:50:00,NaT,NaT,Police: Other,Mayor's 24 Hour Hotline,Notification,Notification,INFO,False,...,0,0,0,0,1,0,0,0,0,0
905302,101001983802,2017-01-07 11:45:00,2017-01-11 08:30:00,NaT,Request for Snow Plowing,Public Works Department,Street Cleaning,Request for Snow Plowing,PWDx,False,...,0,0,0,0,1,0,0,0,0,0
905507,101001983801,2017-01-07 11:40:33,2017-01-11 08:30:00,NaT,Request for Snow Plowing,Public Works Department,Street Cleaning,Request for Snow Plowing,PWDx,False,...,0,0,0,0,0,0,0,1,0,0
905217,101001983800,2017-01-07 11:27:00,2017-01-11 08:30:00,NaT,Request for Snow Plowing,Public Works Department,Street Cleaning,Request for Snow Plowing,PWDx,False,...,0,0,0,0,0,0,0,1,0,0
905594,101001983798,2017-01-07 11:20:00,NaT,NaT,Misc. Snow Complaint,Public Works Department,Administrative & General Requests,Misc. Snow Complaint,PARK,False,...,0,0,0,0,0,0,0,1,0,0
905458,101001983797,2017-01-07 11:14:02,2017-01-10 08:30:00,NaT,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT,True,...,0,0,0,0,0,0,0,1,0,0
905543,101001983795,2017-01-07 11:12:10,2017-01-11 08:30:00,NaT,Request for Snow Plowing,Public Works Department,Street Cleaning,Request for Snow Plowing,PWDx,False,...,0,0,0,0,0,0,0,1,0,0
905604,101001983793,2017-01-07 11:10:15,2017-01-26 08:30:00,NaT,Abandoned Vehicles,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Abandoned Vehicles,BTDT,True,...,0,0,0,0,0,1,0,0,0,0
905488,101001983792,2017-01-07 11:08:05,2017-01-10 08:30:00,NaT,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT,True,...,0,0,0,0,1,0,0,0,0,0


In [23]:
# Columns of the modelling frame whose dtype is object are the ones to one-hot encode.
is_object_col = new_df.dtypes == object
cols_to_dummify = new_df.columns[is_object_col.values]
cols_to_dummify

Index([u'REASON', u'TYPE', u'Department', u'Property_Type', u'Property_ID',
       u'Source', u'school', u'housing', u'neighborhood_from_zip'],
      dtype='object')

In [None]:
# Encode the trimmed frame (new_df), not the raw df: cols_to_dummify was derived
# from new_df, and df still carries every column listed in cols_to_drop, which
# would otherwise end up in the model matrix.
df_dummified, baseline_cols = dummify_cols_and_baselines(new_df, cols_to_dummify)

Weights and Measures is baseline 0 9
Zoning is baseline 1 9
PWDx is baseline 2 9
other is baseline 3 9
other is baseline 4 9


## Run model

In [None]:
# train_test_split and ShuffleSplit are taken from sklearn.model_selection (the
# module this cell already used for GridSearchCV) because the older
# sklearn.cross_validation module is deprecated and absent from newer releases.
# Note: model_selection.ShuffleSplit is configured with n_splits=... and takes
# no sample count, unlike the cross_validation version.
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

In [None]:
# Transformer applying a base-10 log element-wise. numpy is referenced directly
# because the pd.np alias is deprecated and missing from newer pandas releases.
log_transform = FunctionTransformer(np.log10)

In [None]:
# Features: every encoded column except the target.
features = df_dummified.drop('COMPLETION_TIME', axis=1)
# Target: base-10 log of COMPLETION_TIME. numpy is called directly because the
# pd.np alias is deprecated and missing from newer pandas releases.
# NOTE(review): log10 yields -inf / NaN for zero or negative values - confirm
# COMPLETION_TIME is strictly positive.
log_target = np.log10(df_dummified.COMPLETION_TIME)

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    log_target,
    test_size=0.2,
    random_state=300
)

In [None]:
# Single-step pipeline: a cross-validated Lasso with verbose progress output.
lasso_cv = LassoCV(verbose=100)
pipe = make_pipeline(lasso_cv)
# cv = ShuffleSplit(X_train.shape[0], n_iter=5, test_size=0.2, random_state=300)

In [None]:
# Empty grid: no hyper-parameters are searched, so GridSearchCV only runs the
# pipeline through 5-fold cross-validation (using all cores).
params = {}
model = GridSearchCV(pipe, param_grid=params, n_jobs=-1, cv=5, verbose=100)
# Fit on the training split. The previous argument `aa` is not assigned by any
# cell above, so a fresh Restart & Run All failed here with a NameError.
model.fit(X_train, y_train);