# Simple regression models for the Miami Project

## Data Setup

First, import the libraries.

In [1]:
import numpy as np
import pandas as pd

Loading data and taking a look

In [46]:
# Load the raw permit data.
data = pd.read_csv("miami_permit_data.csv")

#using some data-cleaning code from Ivy
# Log-transform the review duration. Non-positive durations have no finite
# log, so they are masked to NaN first (avoids the divide-by-zero /
# invalid-value RuntimeWarnings) and dropped together with other missing rows.
data['TotalDaysInPlanReviewNumeric'] = np.log(
    data['TotalDaysInPlanReviewNumeric'].where(data['TotalDaysInPlanReviewNumeric'] > 0)
)

# Treat any remaining infinities anywhere in the frame as missing values.
data = data.replace([np.inf, -np.inf], np.nan)

# Keep only rows where the target and every model feature are present.
data = data.dropna(
    subset=[
        'TotalDaysInPlanReviewNumeric',
        'TotalSQFT',
        'ScopeofWork',
        'PropertyType',
        'IsPrivateProvider',
    ]
)

# Last expression of the cell, so the cleaned frame is actually displayed.
data.head()

  """


Restricting data to the variables decided on by the ML team:
* PropertyType
* ScopeofWork
* TotalSQFT
* IsPrivateProvider

The dependent variable (log-transformed in the cleaning step above) is
* TotalDaysInPlanReviewNumeric

In [47]:
# Target: the review-duration column (already log-transformed during cleaning).
# Features: the four variables selected for modelling.
feature_columns = ['PropertyType', 'ScopeofWork', 'TotalSQFT', 'IsPrivateProvider']

y = data.loc[:, 'TotalDaysInPlanReviewNumeric']
X = data.loc[:, feature_columns]

In [48]:
# Preview the first rows of the target series.
y.head()

0    3.044522
1    5.398163
2    1.609438
5    5.087596
6    0.000000
Name: TotalDaysInPlanReviewNumeric, dtype: float64

In [49]:
# Preview the first rows of the selected feature columns.
X.head()

Unnamed: 0,PropertyType,ScopeofWork,TotalSQFT,IsPrivateProvider
0,Commercial,NEW CONSTRUCTION,0.0,True
1,Residential,ADDITION AND REMODELING,880.0,False
2,Commercial,NEW CONSTRUCTION,0.0,True
5,Commercial,REMODELING/REPAIRS,800.0,False
6,Commercial,PLUMBING,0.0,False


Splitting the dataset into training and testing sets, then one-hot encoding the categorical columns.

*PS: if someone knows how to do this better please let me know! I'm still a little new to one-hot encoding*

In [50]:
from sklearn.model_selection import train_test_split
import numpy as np

#Choosing columns that are categorical
cat_columns = ["PropertyType", "ScopeofWork","IsPrivateProvider"]

#Creating training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#One-hot encoding:
# The training split defines the dummy columns the models are fitted on.
X_train_processed = pd.get_dummies(X_train, prefix_sep="__",
                              columns=cat_columns)

# Names of the dummy columns generated from the categorical features.
cat_dummies = [col for col in X_train_processed
               if "__" in col
               and col.split("__")[0] in cat_columns]

# Full, ordered list of columns in the training design matrix.
processed_columns = list(X_train_processed.columns)

X_test_processed = pd.get_dummies(X_test, prefix_sep="__",
                                   columns=cat_columns)

# Align the test matrix with the training one: categories that only occur in
# the test split are dropped, categories that only occur in the training split
# are added as all-zero columns, and the column order is made identical.
# Without this, a rare category landing in only one split gives the two
# matrices different columns and breaks predict()/score().
X_test_processed = X_test_processed.reindex(columns=processed_columns,
                                            fill_value=0)

In [51]:
# Preview the one-hot encoded training features.
X_train_processed.head()

Unnamed: 0,TotalSQFT,PropertyType__Commercial,PropertyType__Residential,ScopeofWork__ADDITION AND REMODELING,ScopeofWork__ANNUAL FACILITY,ScopeofWork__BOILER,ScopeofWork__BUILDING ROOFING,ScopeofWork__COOKIE CUTTER,ScopeofWork__DEMOLITION,ScopeofWork__ELECTRICAL,...,ScopeofWork__NEW CONSTRUCTION,ScopeofWork__PHASED PERMIT,ScopeofWork__PLUMBING,ScopeofWork__PLUMBING GAS,ScopeofWork__REMODELING/REPAIRS,ScopeofWork__SIGN,ScopeofWork__SPECIAL EVENTS,ScopeofWork__TREE PERMIT,IsPrivateProvider__False,IsPrivateProvider__True
96955,1500.0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
79448,0.0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
15733,0.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
58394,0.0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
52950,0.0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [52]:
# Preview the one-hot encoded test features.
X_test_processed.head()

Unnamed: 0,TotalSQFT,PropertyType__Commercial,PropertyType__Residential,ScopeofWork__ADDITION AND REMODELING,ScopeofWork__ANNUAL FACILITY,ScopeofWork__BOILER,ScopeofWork__BUILDING ROOFING,ScopeofWork__COOKIE CUTTER,ScopeofWork__DEMOLITION,ScopeofWork__ELECTRICAL,...,ScopeofWork__NEW CONSTRUCTION,ScopeofWork__PHASED PERMIT,ScopeofWork__PLUMBING,ScopeofWork__PLUMBING GAS,ScopeofWork__REMODELING/REPAIRS,ScopeofWork__SIGN,ScopeofWork__SPECIAL EVENTS,ScopeofWork__TREE PERMIT,IsPrivateProvider__False,IsPrivateProvider__True
4849,0.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
51550,0.0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
119222,0.0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
108255,0.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
66685,0.0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


### Trying out some models:

First, a ridge regression model:

In [59]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Fit a ridge regression with default settings on the training split.
ridge = Ridge()
ridge.fit(X_train_processed, y_train)

# R^2 on the training and held-out test splits.
ridge_train_r2 = ridge.score(X_train_processed, y_train)
ridge_test_r2 = ridge.score(X_test_processed, y_test)
print("Training set score: {:.2f}".format(ridge_train_r2))
print("Test set score: {:.2f}".format(ridge_test_r2))

# 10-fold cross-validated R^2 of a fresh default Ridge on the training split.
ridge_cv_r2 = cross_val_score(
    Ridge(),
    X_train_processed,
    y_train,
    cv=10,
    scoring="r2",
)
print("Cross Validation Score:", np.mean(ridge_cv_r2))

# The "slope" parameters (w), also called weights or coefficients, are stored
# in coef_, while the offset or intercept (b) is stored in intercept_.
# The "lr." prefix in the labels is kept as-is; the values are from `ridge`.
print("lr.coef_: {}".format(ridge.coef_))
print("lr.intercept_: {}".format(ridge.intercept_))

Training set score: 0.12
Test set score: 0.13
Cross Validation Score: 0.12302776278019205
lr.coef_: [ 3.61795387e-06 -4.17439058e-02  4.17439058e-02  1.32223627e+00
 -6.21354242e-03  2.66484724e-01 -1.21279877e+00  1.29470144e+00
  5.12542908e-01 -6.16987028e-01 -3.45164091e-01 -6.16133339e-01
  5.75854361e-01 -5.60495778e-01  7.61273056e-01  1.40305860e-01
 -7.61524469e-01 -7.86668200e-01  3.33762294e-01  2.57249766e-01
 -1.55268883e+00  9.94263364e-01  1.55462969e-01 -1.55462969e-01]
lr.intercept_: 3.080982036295839


Just messing with the alpha parameter a bit:

In [64]:
#tuning ridge with GridSearchCV
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths, from strong to weak.
param_grid = dict(
    alpha=[25, 10, 4, 2, 1.0, 0.8, 0.5, 0.3, 0.2, 0.1, 0.05, 0.02, 0.01],
)

# Search over the grid with GridSearchCV's default settings.
grid = GridSearchCV(estimator=Ridge(), param_grid=param_grid)
grid.fit(X_train_processed, y_train)

best_ridge_params = grid.best_params_
best_ridge_score = grid.best_score_
print("best parameters: {}".format(best_ridge_params))
print("best mean cross-validation score for ridge: {:.3f}".format(best_ridge_score))

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


best parameters: {'alpha': 10}
best mean cross-validation score for ridge: 0.123


Printing R^2 with best hyperparameter values:

In [77]:
from sklearn.metrics import r2_score

# Refit ridge with the alpha selected by the grid search above (alpha=10).
ridge = Ridge(alpha=10)
ridge.fit(X_train_processed, y_train)

# Score the refitted model on the held-out test split.
y_pred = ridge.predict(X_test_processed)
ridge_best_r2 = r2_score(y_test, y_pred)

print("R^2:", ridge_best_r2)

R^2: 0.1269584797038199


Now, a Lasso regression model:

In [61]:
from sklearn.linear_model import Lasso

# Fit a lasso regression with default settings on the training split.
lasso = Lasso()
lasso.fit(X_train_processed, y_train)

# R^2 on the training and held-out test splits.
lasso_train_r2 = lasso.score(X_train_processed, y_train)
lasso_test_r2 = lasso.score(X_test_processed, y_test)
print("Training set score: {:.2f}".format(lasso_train_r2))
print("Test set score: {:.2f}".format(lasso_test_r2))

# Count the coefficients that lasso did not shrink to zero.
n_features_used = (lasso.coef_ != 0).sum()
print("Number of features used: {}".format(n_features_used))

print("lasso.coef_: {}".format(lasso.coef_))

Training set score: 0.01
Test set score: 0.02
Number of features used: 1
lasso.coef_: [ 4.03812558e-06 -0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00]


Seeing if I can mess with the hyperparameters:

In [65]:
#tuning lasso with GridSearchCV
# Same alpha candidates as for ridge; max_iter is fixed at a single large value.
param_grid = dict(
    alpha=[25, 10, 4, 2, 1.0, 0.8, 0.5, 0.3, 0.2, 0.1, 0.05, 0.02, 0.01],
    max_iter=[100000],
)

# Search over the grid with GridSearchCV's default settings.
grid = GridSearchCV(estimator=Lasso(), param_grid=param_grid)
grid.fit(X_train_processed, y_train)

best_lasso_params = grid.best_params_
best_lasso_score = grid.best_score_
print("best parameters: {}".format(best_lasso_params))
print("best mean cross-validation score for lasso: {:.3f}".format(best_lasso_score))



best parameters: {'alpha': 0.01, 'max_iter': 100000}
best mean cross-validation score for lasso: 0.108


Printing R^2 with best hyperparameter values:

In [80]:
# Refit lasso with the settings selected by the grid search above.
lasso = Lasso(alpha=0.01, max_iter=100000)
lasso.fit(X_train_processed, y_train)

# Score the refitted model on the held-out test split.
y_pred = lasso.predict(X_test_processed)
lasso_best_r2 = r2_score(y_test, y_pred)

print("R^2:", lasso_best_r2)

R^2: 0.11100299473436359
