# Chapter 15 - Regularization

In [1]:
# A technique to improve performance on test data
import pandas as pd


In [2]:
# let's import a dataset

acs = pd.read_csv("C:/Users/adri_/Documents/GitHub- Adriana/Pandas for everyone/data/acs_ny.csv")
acs

Unnamed: 0,Acres,FamilyIncome,FamilyType,NumBedrooms,NumChildren,NumPeople,NumRooms,NumUnits,NumVehicles,NumWorkers,OwnRent,YearBuilt,HouseCosts,ElectricBill,FoodStamp,HeatingFuel,Insurance,Language
0,1-10,150,Married,4,1,3,9,Single detached,1,0,Mortgage,1950-1959,1800,90,No,Gas,2500,English
1,1-10,180,Female Head,3,2,4,6,Single detached,2,0,Rented,Before 1939,850,90,No,Oil,0,English
2,1-10,280,Female Head,4,0,2,8,Single detached,3,1,Mortgage,2000-2004,2600,260,No,Oil,6600,Other European
3,1-10,330,Female Head,2,1,2,4,Single detached,1,0,Rented,1950-1959,1800,140,No,Oil,0,English
4,1-10,330,Male Head,3,1,2,5,Single attached,1,0,Mortgage,Before 1939,860,150,No,Gas,660,Spanish
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22740,10+,565000,Married,5,3,5,10,Single detached,2,2,Mortgage,1990-1999,1700,370,No,Gas,1000,English
22741,10+,599000,Married,4,0,2,6,Single detached,2,2,Mortgage,Before 1939,1300,100,No,Gas,3500,English
22742,10+,611700,Married,4,1,5,9,Single detached,5,3,Mortgage,Before 1939,410,100,No,Oil,1300,Spanish
22743,10+,621430,Married,3,2,4,11,Single detached,2,3,Mortgage,1970-1979,1600,80,No,Gas,800,Spanish


In [3]:
#let's see the columns

acs.columns

Index(['Acres', 'FamilyIncome', 'FamilyType', 'NumBedrooms', 'NumChildren',
       'NumPeople', 'NumRooms', 'NumUnits', 'NumVehicles', 'NumWorkers',
       'OwnRent', 'YearBuilt', 'HouseCosts', 'ElectricBill', 'FoodStamp',
       'HeatingFuel', 'Insurance', 'Language'],
      dtype='object')

In [4]:
#let's use patsy to create the designs
from patsy import dmatrices


In [5]:
# model

response,predictors, = dmatrices("FamilyIncome ~ NumBedrooms + NumChildren + NumPeople + NumRooms + NumUnits + NumVehicles + NumWorkers + OwnRent + YearBuilt + ElectricBill + FoodStamp +HeatingFuel + Insurance + Language", data = acs)

In [6]:
# Splitting the data ino training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(predictors, response, random_state = 0)


In [7]:
# fitting our model

from sklearn.linear_model import LinearRegression

lr = LinearRegression(normalize = True).fit(X_train, y_train)

model_coefs = pd.DataFrame(list(zip(predictors.design_info.column_names,lr.coef_[0])), columns = ["variable", "coef_lr"])

model_coefs

Unnamed: 0,variable,coef_lr
0,Intercept,3.52266e-11
1,NumUnits[T.Single attached],31356.46
2,NumUnits[T.Single detached],24183.68
3,OwnRent[T.Outright],28391.86
4,OwnRent[T.Rented],7229.586
5,YearBuilt[T.1940-1949],12921.69
6,YearBuilt[T.1950-1959],20577.93
7,YearBuilt[T.1960-1969],17648.35
8,YearBuilt[T.1970-1979],17568.81
9,YearBuilt[T.1980-1989],25525.66


In [8]:
# Now we can look at our model scores
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.2726140465638567
0.26976979568488124


# 15.3 Lasso regression (Least Absolute Shrinkage and Selection Operator)

In [9]:
from sklearn.linear_model import Lasso

In [10]:
#Create model

lasso = Lasso(normalize = True, random_state = 0).fit(X_test,y_test)

In [11]:
# Getting a dataframe of coefficients
coefs_lasso = pd.DataFrame(list(zip(predictors.design_info.column_names, lasso.coef_)), columns = ["variable", "coef_lasso"])


In [12]:
# Merging

model_coefs = pd.merge(model_coefs, coefs_lasso, on="variable")
model_coefs

Unnamed: 0,variable,coef_lr,coef_lasso
0,Intercept,3.52266e-11,0.0
1,NumUnits[T.Single attached],31356.46,23847.097905
2,NumUnits[T.Single detached],24183.68,20278.620009
3,OwnRent[T.Outright],28391.86,30153.611697
4,OwnRent[T.Rented],7229.586,1440.140884
5,YearBuilt[T.1940-1949],12921.69,-6382.312453
6,YearBuilt[T.1950-1959],20577.93,-905.14203
7,YearBuilt[T.1960-1969],17648.35,-0.0
8,YearBuilt[T.1970-1979],17568.81,-1579.827129
9,YearBuilt[T.1980-1989],25525.66,7854.066748


In [13]:
# Looking at our training and test data scores

print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))

0.26670104659430227
0.27506204638605314


# 15.4 Ridge Regression

In [14]:
from sklearn.linear_model import Ridge

ridge = Ridge(normalize = True, random_state = 0).fit(X_train, y_train)
coefs_ridge = pd.DataFrame(list(zip(predictors.design_info.column_names, ridge.coef_[0])), columns = ["variable", "coef_ridge"])

modeL_coefs = pd.merge(model_coefs, coefs_ridge, on= "variable")
model_coefs


Unnamed: 0,variable,coef_lr,coef_lasso
0,Intercept,3.52266e-11,0.0
1,NumUnits[T.Single attached],31356.46,23847.097905
2,NumUnits[T.Single detached],24183.68,20278.620009
3,OwnRent[T.Outright],28391.86,30153.611697
4,OwnRent[T.Rented],7229.586,1440.140884
5,YearBuilt[T.1940-1949],12921.69,-6382.312453
6,YearBuilt[T.1950-1959],20577.93,-905.14203
7,YearBuilt[T.1960-1969],17648.35,-0.0
8,YearBuilt[T.1970-1979],17568.81,-1579.827129
9,YearBuilt[T.1980-1989],25525.66,7854.066748


# 15.5 Elastic Net

In [15]:
from sklearn.linear_model import ElasticNet

en = ElasticNet(random_state = 42).fit(X_train, y_train)

coefs_en = pd.DataFrame(list(zip(predictors.design_info.column_names, en.coef_)), columns = ["variable", "coef_en"])

modeL_coefs = pd.merge(model_coefs, coefs_en, on= "variable")
model_coefs

Unnamed: 0,variable,coef_lr,coef_lasso
0,Intercept,3.52266e-11,0.0
1,NumUnits[T.Single attached],31356.46,23847.097905
2,NumUnits[T.Single detached],24183.68,20278.620009
3,OwnRent[T.Outright],28391.86,30153.611697
4,OwnRent[T.Rented],7229.586,1440.140884
5,YearBuilt[T.1940-1949],12921.69,-6382.312453
6,YearBuilt[T.1950-1959],20577.93,-905.14203
7,YearBuilt[T.1960-1969],17648.35,-0.0
8,YearBuilt[T.1970-1979],17568.81,-1579.827129
9,YearBuilt[T.1980-1989],25525.66,7854.066748


# 15.6 Cross-validation

In [16]:
from sklearn.linear_model import ElasticNetCV

en_cv = ElasticNetCV(cv = 5, random_state = 42).fit(X_train, y_train)

coefs_en_cv = pd.DataFrame(list(zip(predictors.design_info.column_names, en_cv.coef_)), columns = ["variable", "coef_en_cv"])

modeL_coefs = pd.merge(model_coefs, coefs_en_cv, on= "variable")
model_coefs

  y = column_or_1d(y, warn=True)


Unnamed: 0,variable,coef_lr,coef_lasso
0,Intercept,3.52266e-11,0.0
1,NumUnits[T.Single attached],31356.46,23847.097905
2,NumUnits[T.Single detached],24183.68,20278.620009
3,OwnRent[T.Outright],28391.86,30153.611697
4,OwnRent[T.Rented],7229.586,1440.140884
5,YearBuilt[T.1940-1949],12921.69,-6382.312453
6,YearBuilt[T.1950-1959],20577.93,-905.14203
7,YearBuilt[T.1960-1969],17648.35,-0.0
8,YearBuilt[T.1970-1979],17568.81,-1579.827129
9,YearBuilt[T.1980-1989],25525.66,7854.066748
