## Objective

Use our subsetted features, make model w sklearn and pickle it.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
sys.path.append(os.path.join(os.path.dirname('.'), "../preprocessing"))

In [5]:
from __future__ import division
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
import string
from StringIO import StringIO

warnings.filterwarnings("ignore", category=DeprecationWarning)
sns.set_style("whitegrid")
sns.set_context("poster")
rcParams['figure.figsize'] = 20, 5

from helper_functions import dummify_cols_and_baselines, make_alphas, remove_outliers_by_type

In [6]:
df_orig = pd.read_pickle('../data/data_from_remove_from_dataset.pkl')
df_orig.shape

(516406, 40)

## Removing outliers

A standard procedure is to remove values further than 3 standard deviations from the mean. Since I have so many low values and some very high values, I anecdotally think that the low values are very likely to be true, and the high values not so much.

So, I will remove values further than 3 SDs from the median, by type.

Ideally, I would take into account the time dimension. I would like to do so given more time.

In [7]:
df_outliers_removed = remove_outliers_by_type(df_orig, y_col='COMPLETION_HOURS_LOG_10')
df_outliers_removed.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._where(-key, value, inplace=True)


(508653, 40)

I'm removing ~1.5% of my rows.

## Choosing columns

In [8]:
cols_orig_dataset = ['COMPLETION_HOURS_LOG_10', 'TYPE', 'SubmittedPhoto', 'Property_Type', 'Source', 'neighborhood_from_zip']
cols_census = ['race_white',
     'race_black',
     'race_asian',
     'race_hispanic',
     'race_other',
     'poverty_pop_below_poverty_level',
     'earned_income_per_capita',
     'poverty_pop_w_public_assistance',
     'poverty_pop_w_food_stamps',
     'poverty_pop_w_ssi',
     'school',
     'school_std_dev',
     'housing',
     'housing_std_dev',
     'bedroom',
     'bedroom_std_dev',
     'value',
     'value_std_dev',
     'rent',
     'rent_std_dev',
     'income',
     'income_std_dev']
cols_engineered = ['queue_wk', 'queue_wk_open', 'is_description']

In [9]:
df = df_outliers_removed[cols_orig_dataset + cols_census + cols_engineered]

## Dummify

In [10]:
cols_to_dummify = df.dtypes[df.dtypes == object].index
cols_to_dummify

Index([u'TYPE', u'Property_Type', u'Source', u'neighborhood_from_zip',
       u'school', u'housing'],
      dtype='object')

In [11]:
df_dummified, baseline_cols = dummify_cols_and_baselines(df, cols_to_dummify)

Zoning is baseline 0 6
other is baseline 1 6
Twitter is baseline 2 6
West Roxbury is baseline 3 6
8_6th_grade is baseline 4 6
rent is baseline 5 6


In [12]:
df_dummified.shape

(508653, 253)

## Removing columns as per L2 results

In [13]:
col_blacklist = ['Property_Type_Address',
 'Property_Type_Intersection',
 'Source_Constituent Call',
 'SubmittedPhoto',
 'TYPE_ADA',
 'TYPE_Alert Boston',
 'TYPE_Animal Noise Disturbances',
 'TYPE_Automotive Noise Disturbance',
 'TYPE_BWSC General Request',
 'TYPE_BWSC Pothole',
 'TYPE_Big Buildings Online Request',
 'TYPE_Billing Complaint',
 'TYPE_Bridge Maintenance',
 'TYPE_CE Collection',
 'TYPE_Cemetery Maintenance Request',
 'TYPE_City/State Snow Issues',
 'TYPE_Contractor Complaints',
 'TYPE_Corporate or Community Group Service Day Clean Up',
 'TYPE_Downed Wire',
 'TYPE_Dumpster & Loading Noise Disturbances',
 'TYPE_Fire Department Request',
 'TYPE_Fire Hydrant',
 'TYPE_Fire in Food Establishment',
 'TYPE_Follow-Up',
 'TYPE_Food Alert - Confirmed',
 'TYPE_Food Alert - Unconfirmed',
 'TYPE_General Traffic Engineering Request',
 'TYPE_Ground Maintenance',
 'TYPE_HP Sign Application New',
 'TYPE_HP Sign Application Renewal',
 'TYPE_Heat/Fuel Assistance',
 'TYPE_Idea Collection',
 'TYPE_Knockdown Replacement',
 'TYPE_Loud Parties/Music/People',
 'TYPE_Mechanical',
 'TYPE_Misc. Snow Complaint',
 'TYPE_Mosquitoes (West Nile)',
 'TYPE_Municipal Parking Lot Complaints',
 'TYPE_New Tree Warrantee Inspection',
 'TYPE_News Boxes',
 'TYPE_No Utilities - Food Establishment - Electricity',
 'TYPE_No Utilities - Food Establishment - Flood',
 'TYPE_No Utilities - Food Establishment - Sewer',
 'TYPE_No Utilities - Food Establishment - Water',
 'TYPE_No Utilities Residential - Electricity',
 'TYPE_No Utilities Residential - Gas',
 'TYPE_No Utilities Residential - Water',
 'TYPE_OCR Metrolist',
 'TYPE_Occupying W/Out A Valid CO/CI',
 'TYPE_One Boston Day',
 'TYPE_PWD Graffiti',
 'TYPE_Parking Meter Repairs',
 'TYPE_Parks General Request',
 'TYPE_Pavement Marking Inspection',
 'TYPE_Phone Bank Service Inquiry',
 'TYPE_Planting',
 'TYPE_Poor Ventilation',
 'TYPE_Private Parking Lot Complaints',
 'TYPE_Public Events Noise Disturbances',
 'TYPE_Rat Bite',
 'TYPE_Rental Unit Delivery Conditions',
 'TYPE_Request for Litter Basket Installation',
 'TYPE_Roadway Flooding',
 'TYPE_Rooftop & Mechanical Disturbances',
 'TYPE_Schedule a Bulk Item Pickup SS',
 'TYPE_Senior Shoveling',
 'TYPE_Sewage/Septic Back-Up',
 'TYPE_Sidewalk Cover / Manhole',
 'TYPE_Sidewalk Repair (Make Safe)',
 'TYPE_Sign Shop WO',
 'TYPE_Snow Removal',
 'TYPE_Snow/Ice Control',
 'TYPE_Student Overcrowding',
 'TYPE_Transfer Not Completed',
 'TYPE_Undefined Noise Disturbance',
 'TYPE_Unit Pricing Wrong/Missing',
 'TYPE_Unsanitary Conditions - Employees',
 'TYPE_Unsanitary Conditions - Establishment',
 'TYPE_Unsanitary Conditions - Food',
 'TYPE_Utility Casting Repair',
 'TYPE_Valet Parking Problems',
 'TYPE_Walk-In Service Inquiry',
 'TYPE_Watermain Break',
 'TYPE_Work Hours-Loud Noise Complaints',
 'TYPE_Yardwaste Asian Longhorned Beetle Affected Area',
 'bedroom',
 'bedroom_std_dev',
 'earned_income_per_capita',
 'housing_own',
 'housing_std_dev',
 'income',
 'income_std_dev',
 'is_description',
 'neighborhood_from_zip_Allston / Brighton',
 'neighborhood_from_zip_Back Bay',
 'neighborhood_from_zip_Beacon Hill',
 'neighborhood_from_zip_Brookline',
 'neighborhood_from_zip_Charlestown',
 'neighborhood_from_zip_Chestnut Hill',
 'neighborhood_from_zip_Dorchester',
 'neighborhood_from_zip_Downtown / Financial District',
 'neighborhood_from_zip_Fenway / Kenmore / Audubon Circle / Longwood',
 'neighborhood_from_zip_Hyde Park',
 'neighborhood_from_zip_Jamaica Plain',
 'neighborhood_from_zip_Mattapan',
 'neighborhood_from_zip_Mission Hill',
 'neighborhood_from_zip_Roslindale',
 'neighborhood_from_zip_Roxbury',
 'neighborhood_from_zip_South Boston',
 'neighborhood_from_zip_South Boston / South Boston Waterfront',
 'neighborhood_from_zip_South End',
 'neighborhood_from_zip_West End',
 'poverty_pop_below_poverty_level',
 'poverty_pop_w_food_stamps',
 'poverty_pop_w_public_assistance',
 'poverty_pop_w_ssi',
 'queue_wk',
 'race_asian',
 'race_black',
 'race_hispanic',
 'race_other',
 'race_white',
 'rent',
 'rent_std_dev',
 'school_0_none',
 'school_11_9th_grade',
 'school_13_11th_grade',
 'school_14_12th_grade_no_diploma',
 'school_15_hs_diploma',
 'school_18_some_college_no_degree',
 'school_19_associates',
 'school_20_bachelors',
 'school_21_masters',
 'school_22_professional_school',
 'school_std_dev',
 'value',
 'value_std_dev']

In [14]:
df_dummified_and_filtered = df_dummified.drop(col_blacklist, axis=1)

In [33]:
df_dummified_and_filtered.shape

(508653, 116)

## Running a model

In [15]:
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score




In [16]:
pipe = make_pipeline(StandardScaler(), LinearRegression())

In [34]:
X = df_dummified_and_filtered.drop('COMPLETION_HOURS_LOG_10', axis=1)
y = df_dummified_and_filtered.COMPLETION_HOURS_LOG_10

In [35]:
pipe.fit(X, y);

## Pickling model

In [36]:
from sklearn.externals import joblib

In [37]:
joblib.dump(pipe, '../data/model_completion_time.pkl')

['../data/model_completion_time.pkl']

In [38]:
aa = joblib.load('../data/model_completion_time.pkl')

In [32]:
X.head(1)

Unnamed: 0,SubmittedPhoto,race_white,race_black,race_asian,race_hispanic,race_other,poverty_pop_below_poverty_level,earned_income_per_capita,poverty_pop_w_public_assistance,poverty_pop_w_food_stamps,...,school_11_9th_grade,school_13_11th_grade,school_14_12th_grade_no_diploma,school_15_hs_diploma,school_18_some_college_no_degree,school_19_associates,school_20_bachelors,school_21_masters,school_22_professional_school,housing_own
905425,True,0.242399,0.514358,0.035473,0.067568,0.140203,0.0,34340.0,0.059748,0.138365,...,0,0,0,1,0,0,0,0,0,1


In [31]:
pd.DataFrame(sample_row)

Unnamed: 0,SourceCitizensConnectApp,SourceSelfService,TYPEAbandonedBicycle,TYPEAbandonedBuilding,TYPEAbandonedVehicles,TYPEAnimalFound,TYPEAnimalGenericRequest,TYPEAnimalLost,TYPEBedBugs,TYPEBicycleIssues,...,TYPEUnshoveledSidewalk,TYPEUpgradeExistingLighting,TYPEUtilityCallIn,TYPEWCCallLog,TYPEWaterinGasHighPriority,TYPEWorkingBeyondHours,TYPEWorkwoutPermit,neighborhoodfromzipEastBoston,neighborhoodfromzipNorthEnd,queuewkopen
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,351


In [45]:
aa.predict(_31)

array([-21.81364741])

In [47]:
X.head(1).to_dict('records')

[{'Source_Citizens Connect App': 1,
  'Source_Self Service': 0,
  'TYPE_Abandoned Bicycle': 0,
  'TYPE_Abandoned Building': 0,
  'TYPE_Abandoned Vehicles': 0,
  'TYPE_Animal Found': 0,
  'TYPE_Animal Generic Request': 0,
  'TYPE_Animal Lost': 0,
  'TYPE_Bed Bugs': 0,
  'TYPE_Bicycle Issues': 0,
  'TYPE_Breathe Easy': 0,
  'TYPE_Building Inspection Request': 0,
  'TYPE_Call Log': 0,
  'TYPE_Carbon Monoxide': 0,
  'TYPE_Catchbasin': 0,
  'TYPE_Checkin': 0,
  'TYPE_Chronic Dampness/Mold': 0,
  'TYPE_Construction Debris': 0,
  'TYPE_Contractors Complaint': 0,
  'TYPE_Cross Metering - Sub-Metering': 0,
  'TYPE_Egress': 0,
  'TYPE_Electrical': 0,
  'TYPE_Empty Litter Basket': 0,
  'TYPE_Equipment Repair': 0,
  'TYPE_Exceeding Terms of Permit': 0,
  'TYPE_General Comments For An Employee': 0,
  'TYPE_General Comments For a Program or Policy': 0,
  'TYPE_General Lighting Request': 0,
  'TYPE_Graffiti Removal': 0,
  'TYPE_Heat - Excessive  Insufficient': 0,
  'TYPE_Highway Maintenance': 0,
  'T

In [44]:
aa.predict(X.head(1))

array([ 1.25687211])

In [21]:
pipe.predict(X.head(1))

array([ 1.28505512])

In [28]:
sample_row = [{'SourceCitizensConnectApp': 1,
  'SourceSelfService': 0,
  'TYPEAbandonedBicycle': 0,
  'TYPEAbandonedBuilding': 0,
  'TYPEAbandonedVehicles': 0,
  'TYPEAnimalFound': 0,
  'TYPEAnimalGenericRequest': 0,
  'TYPEAnimalLost': 0,
  'TYPEBedBugs': 0,
  'TYPEBicycleIssues': 0,
  'TYPEBreatheEasy': 0,
  'TYPEBuildingInspectionRequest': 0,
  'TYPECallLog': 0,
  'TYPECarbonMonoxide': 0,
  'TYPECatchbasin': 0,
  'TYPECheckin': 0,
  'TYPEChronicDampnessMold': 0,
  'TYPEConstructionDebris': 0,
  'TYPEContractorsComplaint': 0,
  'TYPECrossMeteringSubMetering': 0,
  'TYPEEgress': 0,
  'TYPEElectrical': 0,
  'TYPEEmptyLitterBasket': 0,
  'TYPEEquipmentRepair': 0,
  'TYPEExceedingTermsofPermit': 0,
  'TYPEGeneralCommentsForAnEmployee': 0,
  'TYPEGeneralCommentsForaProgramorPolicy': 0,
  'TYPEGeneralLightingRequest': 0,
  'TYPEGraffitiRemoval': 0,
  'TYPEHeatExcessiveInsufficient': 0,
  'TYPEHighwayMaintenance': 0,
  'TYPEHousingDiscriminationIntakeForm': 0,
  'TYPEIllegalAutoBodyShop': 0,
  'TYPEIllegalDumping': 0,
  'TYPEIllegalOccupancy': 0,
  'TYPEIllegalPostingofSigns': 0,
  'TYPEIllegalRoomingHouse': 0,
  'TYPEIllegalUse': 0,
  'TYPEIllegalVending': 0,
  'TYPEImproperStorageofTrashBarrels': 0,
  'TYPEInstallNewLighting': 0,
  'TYPEItemPriceMissing': 0,
  'TYPELead': 0,
  'TYPELitterBasketMaintenance': 0,
  'TYPEMaintenanceComplaintResidential': 0,
  'TYPEMaintenanceHomeowner': 0,
  'TYPEMajorSystemFailure': 0,
  'TYPEMiceInfestationResidential': 0,
  'TYPEMissedTrashRecyclingYardWasteBulkItem': 0,
  'TYPEMissingSign': 0,
  'TYPENeedlePickup': 0,
  'TYPENewSignCrosswalkorPavementMarking': 0,
  'TYPENewTreeRequests': 0,
  'TYPENoPriceonGasWrongPrice': 0,
  'TYPENoTowComplaintConfirmation': 0,
  'TYPENotification': 0,
  'TYPEOCRFrontDeskInteractions': 0,
  'TYPEOvercrowding': 0,
  'TYPEOverflowingorUnkeptDumpster': 0,
  'TYPEParkImprovementRequests': 0,
  'TYPEParkMaintenanceRequests': 0,
  'TYPEParkingEnforcement': 1,
  'TYPEParkingonFrontBackYardsIllegalParking': 0,
  'TYPEParksLightingElectricalIssues': 0,
  'TYPEPavementMarkingMaintenance': 0,
  'TYPEPestInfestationResidential': 0,
  'TYPEPickupDeadAnimal': 0,
  'TYPEPigeonInfestation': 0,
  'TYPEPlumbing': 0,
  'TYPEPoorConditionsofProperty': 0,
  'TYPEProductShortMeasure': 0,
  'TYPEProtectionofAdjoiningProperty': 0,
  'TYPEPublicWorksGeneralRequest': 0,
  'TYPERecyclingCartInquiry': 0,
  'TYPERecyclingCartReturn': 0,
  'TYPERequestforPotholeRepair': 0,
  'TYPERequestforRecyclingCart': 0,
  'TYPERequestforSnowPlowing': 0,
  'TYPERequestforSnowPlowingEmergencyResponder': 0,
  'TYPERequestsforStreetCleaning': 0,
  'TYPERequestsforTrafficSignalStudiesorReviews': 0,
  'TYPERoadwayRepair': 0,
  'TYPERodentActivity': 0,
  'TYPEScaleNotVisible': 0,
  'TYPEScanningOvercharge': 0,
  'TYPEScheduleaBulkItemPickup': 0,
  'TYPEShortMeasureGas': 0,
  'TYPESidewalkRepair': 0,
  'TYPESignRepair': 0,
  'TYPESpaceSavers': 0,
  'TYPESqualidLivingConditions': 0,
  'TYPEStickerRequest': 0,
  'TYPEStreetLightKnockDowns': 0,
  'TYPEStreetLightOutages': 0,
  'TYPEStudentMoveinIssues': 0,
  'TYPETrafficSignalInspection': 0,
  'TYPETrafficSignalRepair': 0,
  'TYPETransportationGeneralRequest': 0,
  'TYPETrashonVacantLot': 0,
  'TYPETreeEmergencies': 0,
  'TYPETreeMaintenanceRequests': 0,
  'TYPETreeinPark': 0,
  'TYPEUnsafeDangerousConditions': 0,
  'TYPEUnsatisfactoryLivingConditions': 0,
  'TYPEUnsatisfactoryUtilitiesElectricalPlumbing': 0,
  'TYPEUnshoveledSidewalk': 0,
  'TYPEUpgradeExistingLighting': 0,
  'TYPEUtilityCallIn': 0,
  'TYPEWCCallLog': 0,
  'TYPEWaterinGasHighPriority': 0,
  'TYPEWorkingBeyondHours': 0,
  'TYPEWorkwoutPermit': 0,
  'neighborhoodfromzipEastBoston': 0,
  'neighborhoodfromzipNorthEnd': 0,
  'queuewkopen': 351}]