<a href="https://colab.research.google.com/github/abel-keya/machine_learning-_supervised_learning_with_python/blob/master/Python_Programming_Elastic_Net_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<font color="green">*To start working on this notebook, or any other notebook that we will use in the Moringa Data Science Course, we will need to save our own copy of it. We can do this by clicking File > Save a Copy in Drive. We will then be able to make edits to our own copy of this notebook.*</font>

# Python Programming: Elastic Net Regression 

## Example

In [0]:
# Example 1
# ---
# Use the fair dataset from the pydataset library to predict marriage satisfaction based on the given variables.
# ---
# 

In [0]:
# Importing our libraries
# 
from pydataset import data
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 10000)

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [0]:
# Data preparation
# 
df=pd.DataFrame(data('Fair'))
df.loc[df.sex== 'male', 'sex'] = 0
df.loc[df.sex== 'female','sex'] = 1
df['sex'] = df['sex'].astype(int)
df.loc[df.child== 'no', 'child'] = 0
df.loc[df.child== 'yes','child'] = 1
df['child'] = df['child'].astype(int)
X=df[['religious','age','sex','ym','education','occupation','nbaffairs']]
y=df['rate']

In [0]:
# Creating our linear regression model for the purpose of comparison
# 
regression=LinearRegression()
regression.fit(X,y)
first_model=(mean_squared_error(y_true=y,y_pred=regression.predict(X)))
print(first_model) 

# This mean standard error score of 1.05 is our benchmark for determining 
# if the elastic net model will be better or worst. 

1.0498738644696668


In [0]:
# Below are the coefficients of this first model. We use a for loop to go through 
# the model and the zip function to combine the two columns.
# 
coef_dict_baseline = {}
for coef, feat in zip(regression.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'religious': 0.04235281110639181,
 'age': -0.009059645428673794,
 'sex': 0.08882013337087086,
 'ym': -0.030458802565476582,
 'education': 0.06810255742293699,
 'occupation': -0.005979506852998183,
 'nbaffairs': -0.07882571247653962}

In [0]:
# Elastic Net Model
# Elastic net, just like ridge and lasso regression, requires normalize data. 
# This argument  is set inside the ElasticNet function. 
# The second thing we need to do is create our grid.
# 
elastic=ElasticNet(normalize=True)
search=GridSearchCV(estimator=elastic,param_grid={'alpha':np.logspace(-5,2,8),'l1_ratio':[.2,.4,.6,.8]},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)

In [0]:
# We will now fit our model and display the best parameters and the best results we can get with that setup.
# 
search.fit(X,y)
search.best_params_
abs(search.best_score_)

1.0819158709244472

In [0]:
# The best hyperparameters was an alpha set to 0.001 and a l1_ratio of 0.8. 
# With these settings we got an MSE of 1.08. This is above our baseline model of MSE 1.05  for the baseline model. 
# Which means that elastic net is doing worse than linear regression. 
# For clarity, we will set our hyperparameters to the recommended values and run on the data.
# 
elastic=ElasticNet(normalize=True,alpha=0.001,l1_ratio=0.75)
elastic.fit(X,y)
second_model=(mean_squared_error(y_true=y,y_pred=elastic.predict(X)))
print(second_model)

1.0566430678343806


In [0]:
# Below are the coefficients
# 
coef_dict_baseline = {}
for coef, feat in zip(elastic.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

# The coefficients are mostly the same. 
# Notice that occupation was completely removed from the model in the elastic net version. 
# This means that this values was no good to the algorithm. Traditional regression cannot do this.

{'religious': 0.01947541724957858,
 'age': -0.008630896492807688,
 'sex': 0.018116464568090875,
 'ym': -0.02422483127451297,
 'education': 0.04429085595448633,
 'occupation': -0.0,
 'nbaffairs': -0.06679513627963517}

## Challenges

### <font color="green">Challenge 1</font>

In [0]:
# Challenge 1
# ---
# Question: Using the given housiet, create a regression model to predict 
# the value of prices of a house using the given features. 
# ---
Dataset url = http://bit.ly/BostonHousingDataset

In [0]:
# ---
# Importing our libraries
# 
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
boston=pd.read_csv('BostonHousing.csv')
boston

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9


In [0]:
boston.columns

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat', 'medv'], dtype='object')

In [0]:
X=boston.drop(['medv'],axis=1)
y=boston[['medv']]
X.shape,y.shape

((506, 13), (506, 1))

In [0]:
# Creating our linear regression model for the purpose of comparison
# 
regression=LinearRegression()
regression.fit(X,y)
first_model=(mean_squared_error(y_true=y,y_pred=regression.predict(X)))
print(first_model) 

# This mean standard error score of 21.894 is our benchmark for determining 
# if the elastic net model will be better or worst. 

21.894831181729206


In [0]:
# Below are the coefficients of this first model. 
#for loop to go through the model and the zip function to combine the two columns.
# 
coef_dict_baseline = {}
for coef, feat in zip(regression.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'crim': array([-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,  2.68673382e+00,
        -1.77666112e+01,  3.80986521e+00,  6.92224640e-04, -1.47556685e+00,
         3.06049479e-01, -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
        -5.24758378e-01])}

In [0]:
# Elastic Net Model
#create our gridsearch.
# 
elastic=ElasticNet(normalize=True)
search=GridSearchCV(estimator=elastic,param_grid={'alpha':np.logspace(-5,2,8),'l1_ratio':[.2,.4,.6,.8]},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)

In [0]:
#fit model 
#display the best parameters and the best results from the gridsearch
# 
search.fit(X,y)
search.best_params_
abs(search.best_score_)

31.1862828126433

In [0]:
#set our hyperparameters to the recommended values and run on the data.
# 
elastic=ElasticNet(normalize=True,alpha=0.0001,l1_ratio=0.9)
elastic.fit(X,y)
second_model=(mean_squared_error(y_true=y,y_pred=elastic.predict(X)))
print("MSE for the second model: ",second_model)

MSE for the second model:  21.903757015491436


In [0]:
# elastic model coefficients
# 
coef_dict_baseline = {}
for coef, feat in zip(elastic.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

# The coefficients are mostly the same. 
# Notice that occupation was completely removed from the model in the elastic net version. 
# This means that this values was no good to the algorithm. Traditional regression cannot do this.

{'crim': -0.10499949587835906,
 'zn': 0.04443640337653871,
 'indus': 0.009570853913532906,
 'chas': 2.7206559451281493,
 'nox': -17.03921012355745,
 'rm': 3.843170802382751,
 'age': 0.0,
 'dis': -1.4392376367913107,
 'rad': 0.2822195415803222,
 'tax': -0.011179311766104769,
 'ptratio': -0.9412493159910819,
 'b': 0.009278743997347183,
 'lstat': -0.5198213919320526}

### <font color="green">Challenge 2</font>

In [0]:
# Challenge 2
# ---
# Question: Using the Ames Housing dataset, create a regression model to predict the sales price of home 
# applying elastic net regression.
# ---
# Dataset Source = http://bit.ly/HousePricesDataset
housing=pd.read_csv('train_house.csv')
housing.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [0]:
housing.shape

(1460, 81)

In [0]:
#drop collumns with 100 NaN
housing=housing.drop(['Alley','PoolQC','Fence','MiscFeature'],axis=1)
housing.shape

(1460, 77)

In [0]:
#check unique values
housing.nunique()

Id               1460
MSSubClass         15
MSZoning            5
LotFrontage       110
LotArea          1073
Street              2
LotShape            4
LandContour         4
Utilities           2
LotConfig           5
LandSlope           3
Neighborhood       25
Condition1          9
Condition2          8
BldgType            5
HouseStyle          8
OverallQual        10
OverallCond         9
YearBuilt         112
YearRemodAdd       61
RoofStyle           6
RoofMatl            8
Exterior1st        15
Exterior2nd        16
MasVnrType          4
MasVnrArea        327
ExterQual           4
ExterCond           5
Foundation          6
BsmtQual            4
BsmtCond            4
BsmtExposure        4
BsmtFinType1        6
BsmtFinSF1        637
BsmtFinType2        6
BsmtFinSF2        144
BsmtUnfSF         780
TotalBsmtSF       721
Heating             6
HeatingQC           5
CentralAir          2
Electrical          5
1stFlrSF          753
2ndFlrSF          417
LowQualFinSF       24
GrLivArea 

In [0]:
#housing=housing.apply(lambda x: x.str.replace('[^a-zA-Z0-9]', '').lower(),axis=0)
cols = housing.select_dtypes('object').columns
print (cols)

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition'], dtype='object')


In [0]:
#removing symbols in the dataset #Index(['col'], dtype='object')

housing[cols] = housing[cols].astype(str).replace('[^a-zA-Z0-9 ]', '', regex=True)
housing.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,0,12,2008,WD,Normal,250000


In [0]:
# Importing label encoder
# 
from sklearn.preprocessing import LabelEncoder
# Categorical boolean mask
categorical_feature_mask =housing.dtypes==object
# filter categorical columns using mask and turn it into a list
categorical_features = housing.columns[categorical_feature_mask].tolist()
categorical_features

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [0]:
len(categorical_features)

39

In [0]:
#initializing the label encoder
le = LabelEncoder()
# Converting the variables to numerical
#
for i in range(39):
    new = le.fit_transform(housing[categorical_features[i]])
    housing[categorical_features[i]] = new
housing.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,3,3,0,4,0,5,2,2,0,5,7,5,2003,2003,1,1,12,13,1,196.0,2,4,2,2,3,3,2,706,5,0,150,856,1,0,1,4,856,854,0,1710,1,0,2,1,3,1,2,8,6,0,5,1,2003.0,1,2,548,4,4,2,0,61,0,0,0,0,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,3,3,0,2,0,24,1,2,0,2,6,8,1976,1976,1,1,8,8,2,0.0,3,4,1,2,3,1,0,978,5,0,284,1262,1,0,1,4,1262,0,0,1262,0,1,2,0,3,1,3,6,6,1,4,1,1976.0,1,2,460,4,4,2,298,0,0,0,0,0,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,0,3,0,4,0,5,2,2,0,5,7,5,2001,2002,1,1,12,13,1,162.0,2,4,2,2,3,2,2,486,5,0,434,920,1,0,1,4,920,866,0,1786,1,0,2,1,3,1,2,6,6,1,4,1,2001.0,1,2,608,4,4,2,0,42,0,0,0,0,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,0,3,0,0,0,6,2,2,0,5,7,5,1915,1970,1,1,13,15,2,0.0,3,4,0,3,1,3,0,216,5,0,540,756,1,2,1,4,961,756,0,1717,1,0,1,0,3,1,2,7,6,1,2,5,1998.0,2,3,642,4,4,2,0,35,272,0,0,0,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,0,3,0,2,0,15,2,2,0,5,8,5,2000,2000,1,1,12,13,1,350.0,2,4,2,2,3,0,2,655,5,0,490,1145,1,0,1,4,1145,1053,0,2198,1,0,2,1,4,1,2,9,6,1,4,1,2000.0,1,3,836,4,4,2,192,84,0,0,0,0,0,12,2008,8,4,250000


In [0]:
housing.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
LotShape           0
LandContour        0
Utilities          0
LotConfig          0
LandSlope          0
Neighborhood       0
Condition1         0
Condition2         0
BldgType           0
HouseStyle         0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
RoofStyle          0
RoofMatl           0
Exterior1st        0
Exterior2nd        0
MasVnrType         0
MasVnrArea         8
ExterQual          0
ExterCond          0
Foundation         0
BsmtQual           0
BsmtCond           0
BsmtExposure       0
BsmtFinType1       0
BsmtFinSF1         0
BsmtFinType2       0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
Heating            0
HeatingQC          0
CentralAir         0
Electrical         0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath 

In [0]:
#droping columns with NaN
housing.dropna(inplace=True)

In [0]:
#check for NaN
housing.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual 

In [0]:
# Importing our libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [0]:
#Creating dependent and independent variables
X=housing.drop(['SalePrice'],axis=1)
y=housing[['SalePrice']]
X.shape,y.shape

((1121, 76), (1121, 1))

In [0]:
# Creating our linear regression model for the purpose of comparison
# 
regression=LinearRegression()
regression.fit(X,y)
first_model=(mean_squared_error(y_true=y,y_pred=regression.predict(X)))
print(first_model) 

# This mean standard error score of 1063904412.1366453 is our benchmark for determining 
# if the elastic net model will be better or worst. 

1063904412.1366453


In [0]:

# model coefficients. 
#  use a for loop to go through the model and the zip function to combine the two columns.
# 
coef_dict_baseline = {}
for coef, feat in zip(regression.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'Id': array([-3.01279417e+00, -1.52689732e+02, -6.83715533e+02, -1.76858366e+02,
         4.89226076e-01,  3.97500464e+04, -1.29868540e+03,  4.03291172e+03,
         1.07796438e-10, -2.32145367e+02,  5.49369277e+03,  4.73180330e+02,
        -3.86469547e+02, -1.16598536e+04, -2.22837665e+03, -1.12051271e+03,
         1.13968614e+04,  4.35344586e+03,  1.94854284e+02,  1.97716840e+01,
         2.57630847e+03,  6.29462239e+03, -5.78547961e+02,  5.54984582e-02,
         4.45815220e+03,  3.06067008e+01, -1.00247518e+04,  3.14825974e+02,
         1.58048812e+03, -8.27373543e+03,  2.92742797e+03, -3.81128705e+03,
        -1.34689467e+03,  2.36389822e+00,  2.01137273e+02,  4.88100465e+00,
        -6.54891022e+00,  6.95992635e-01,  6.98826065e+02, -4.41443262e+02,
         3.60072562e+03, -2.90456556e+02,  1.81588839e+01,  1.48391457e+01,
        -1.34355453e+00,  3.16544751e+01,  6.57871056e+03, -7.47443671e+02,
         6.14113833e+03,  8.76751970e+02, -4.54533210e+03, -1.70132494e+04,
      

In [0]:
# Elastic Net Model
#  normalize data. 
#
# create our gridsearch.
# 
elastic=ElasticNet(normalize=True)
search=GridSearchCV(estimator=elastic,param_grid={'alpha':np.logspace(-5,2,8),'l1_ratio':[.2,.4,.6,.8]},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)

In [0]:
# fit the model 
#display the best parameters and the best results 
# 
search.fit(X,y)
search.best_params_
abs(search.best_score_)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


1332857235.9702218

In [0]:
# with the best hyperparameters 
# 
elastic=ElasticNet(normalize=True,alpha=0.001,l1_ratio=0.75)
elastic.fit(X,y)
second_model=(mean_squared_error(y_true=y,y_pred=elastic.predict(X)))
print(second_model)

1111776992.115193


In [0]:
# Below are the coefficients
# 
coef_dict_baseline = {}
for coef, feat in zip(elastic.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'Id': -2.9651444794489796,
 'MSSubClass': -80.27677952485192,
 'MSZoning': -385.30205138294974,
 'LotFrontage': -65.12085681555386,
 'LotArea': 0.41688792466736824,
 'Street': 35004.311760416116,
 'LotShape': -1471.0027108771865,
 'LandContour': 2332.442610777802,
 'Utilities': 0.0,
 'LotConfig': -278.8054153062683,
 'LandSlope': 4503.199397929825,
 'Neighborhood': 462.8750497594924,
 'Condition1': -238.67090981722882,
 'Condition2': -7404.91365659976,
 'BldgType': -2150.81633541398,
 'HouseStyle': -647.1664779804723,
 'OverallQual': 8148.69463286644,
 'OverallCond': 3025.202159415981,
 'YearBuilt': 74.00180076878148,
 'YearRemodAdd': 112.1202624831934,
 'RoofStyle': 2976.6295622996804,
 'RoofMatl': 6227.66669731974,
 'Exterior1st': -390.82901203505673,
 'Exterior2nd': -4.206037447387061,
 'MasVnrType': 3448.5520949714614,
 'MasVnrArea': 28.499730555766508,
 'ExterQual': -9580.851687900504,
 'ExterCond': 277.6107725272629,
 'Foundation': 2076.5666069312356,
 'BsmtQual': -7218.35082488

### <font color="green">Challenge 3</font>

In [0]:
# Challenge 3
# ---
# Question: Given the medical cost personal dataset, accurately predict insurance cost using a regression model.
# ---
# Dataset Source = http://bit.ly/MedicalInsuranceDataset
# 
insurance=pd.read_csv('insurance.csv')
insurance

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [0]:
#checking the shape of the dataset
insurance.shape

(1338, 7)

In [0]:
# Importing label encoder
# 
from sklearn.preprocessing import LabelEncoder
# Categorical boolean mask
categorical_feature_mask =insurance.dtypes==object
# filter categorical columns using mask and turn it into a list
categorical_features = insurance.columns[categorical_feature_mask].tolist()
categorical_features

['sex', 'smoker', 'region']

In [0]:

#initializing the label encoder
le = LabelEncoder()
# Converting the variables to numerical
#
for i in range(3):
    new = le.fit_transform(insurance[categorical_features[i]])
    insurance[categorical_features[i]] = new
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [0]:
# Importing our libraries
# 
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
X=insurance.drop(['charges'],axis=1)
y=insurance[['charges']]
X.shape,y.shape

((1338, 6), (1338, 1))

In [0]:
# Creating the baseline linear regression model for the purpose of comparison
# 
regression=LinearRegression()
regression.fit(X,y)
first_model=(mean_squared_error(y_true=y,y_pred=regression.predict(X)))
print(first_model) 

# This mean standard error score of 36527659.88568238 as the benchmark for determining 
# if the elastic net model will be better or worst. 

36527659.88568238


In [0]:
# Below are the coefficients of the first model/baseline . 
coef_dict_baseline = {}
for coef, feat in zip(regression.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'age': array([  257.28807486,  -131.11057962,   332.57013224,   479.36939355,
        23820.43412267,  -353.64001656])}

In [0]:
# Elastic Net Model
#  normalize the data. 
#  
#  create the gridsearch.
# 
elastic=ElasticNet(normalize=True)
search=GridSearchCV(estimator=elastic,param_grid={'alpha':np.logspace(-5,2,8),'l1_ratio':[.2,.4,.6,.8]},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)

In [0]:
# fit the model 
#display the best parameters and the best results 
# 
search.fit(X,y)
search.best_params_
abs(search.best_score_)

36957169.86538891

In [0]:
# The best hyperparameters  
# 
elastic=ElasticNet(normalize=True,alpha=0.001,l1_ratio=0.75)
elastic.fit(X,y)
second_model=(mean_squared_error(y_true=y,y_pred=elastic.predict(X)))
print(second_model)

43454295.14222847


In [0]:
# Below are the coefficients
# 
coef_dict_baseline = {}
for coef, feat in zip(elastic.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'age': 192.5538439388151,
 'sex': 183.9408187842989,
 'bmi': 258.9131242175576,
 'children': 395.16164510736604,
 'smoker': 17790.31108888592,
 'region': -218.61719166086226}

### <font color="green">Challenge 4</font>

In [0]:
# Challenge 4
# ---
# Question: Use ElasticNet regression to build a model that is able to accurately predict the profits of a startup.
# ---
# Dataset Source = http://bit.ly/StartupsDataset
# ---
# 
startups=pd.read_csv('Startups.csv')
startups.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [0]:
#checking the shape
startups.shape

(50, 5)

In [0]:
#checking the datatypes
startups.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [0]:
# dummy conversions
#pd.get_dummies(startups, columns=["State"], prefix=["State"],drop_first=False).head()
#create dummy variables for result
startups_dummy = pd.get_dummies(startups['State'],drop_first=False)
#previewing the top 
startups_dummy.head()

Unnamed: 0,California,Florida,New York
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


In [0]:
#combining the two datasets
startups = pd.concat([startups,startups_dummy],axis=1) 
startups=startups.drop(['State'],axis=1)
startups.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,California,Florida,New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


In [0]:
startups.shape

(50, 7)

In [0]:
#determine the number of unique values in every column
startups.nunique()

R&D Spend          49
Administration     50
Marketing Spend    48
Profit             50
California          2
Florida             2
New York            2
dtype: int64

In [0]:
# Importing our libraries
# 
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# 
X=startups.drop(['Profit'],axis=1)
y=startups[['Profit']]
X.shape,y.shape

((50, 6), (50, 1))

In [0]:
# Creating  linear regression model as a baseline 
regression=LinearRegression()
regression.fit(X,y)
first_model=(mean_squared_error(y_true=y,y_pred=regression.predict(X)))
print(first_model) 

#  mean standard error score of 78416791.01666646(78406792.88803762) is the  benchmark for determining ,
# if the elastic net model will be better or worst. 

78406792.88803762


In [0]:
#  coefficients of thefirst model. 
#for loop to goes through the model and the zip function to combine the two columns.
# 
coef_dict_baseline = {}
for coef, feat in zip(regression.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'R&D Spend': array([ 8.06023114e-01, -2.70043196e-02,  2.69798610e-02, -5.23005912e+01,
         1.46488202e+02, -9.41876104e+01])}

In [0]:
# Elastic Net Model
# normalize data.  
# create the gridsearch.
# 
elastic=ElasticNet(normalize=True)
search=GridSearchCV(estimator=elastic,param_grid={'alpha':np.logspace(-5,2,8),'l1_ratio':[.2,.4,.6,.8]},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)

In [0]:
# fit model 
#display the best parameters and the best results.
# 102929988.21892013(103359804.47611961)
search.fit(X,y)
search.best_params_
abs(search.best_score_)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)


103359804.47611961

In [0]:
# Elastic model performance
elastic=ElasticNet(normalize=True,alpha=0.001,l1_ratio=0.75)
elastic.fit(X,y)
second_model=(mean_squared_error(y_true=y,y_pred=elastic.predict(X)))
print(second_model)

78828733.04629785


In [0]:
#  the coefficients
# {'R&D Spend': 0.7833652863082342,'Administration': -0.01695518117671228,'Marketing Spend': 0.03294491814412594,'State': 37.26905179033893}
coef_dict_baseline = {}
for coef, feat in zip(elastic.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'R&D Spend': 0.7835108316981487,
 'Administration': -0.017055854989690047,
 'Marketing Spend': 0.03280005922052856,
 'California': -81.62777110169262,
 'Florida': 90.17751858933816,
 'New York': -5.071221239340671}

### <font color="green">Challenge 5</font>

In [0]:
# Challenge 5
# ---
# Question: Build a prediction model to predict duration for any combination of country,operator, 
# services and category given the genre,language and number of units. 
# Apply ElasticNet regression while building your model. 
# ---
# Dataset Source = http://bit.ly/AudioContentConsumptionDataset
# ---
# 
consumption=pd.read_csv('consumption.csv',encoding='latin1')
consumption.head()

Unnamed: 0,language_name,genre,service,country,mobile_operator,usage_category,number_of_units,duration
0,afrikaans,pop,music on demand,chad,airtel,int,2,9
1,arabic,inspirational,music on demand,zambia,airtel,ramadanc,3,181
2,arabic,islamic,music on demand,chad,airtel,islam,61,2102
3,arabic,islamic,music on demand,chad,airtel,quran,11,179
4,arabic,world,music on demand,zambia,airtel,ramadanc,3,24


In [0]:
#check the shape
consumption.shape

(215, 8)

In [0]:
consumption.nunique()

language_name       17
genre               64
service              4
country              3
mobile_operator      2
usage_category      32
number_of_units    104
duration           192
dtype: int64

In [0]:
consumption.dtypes

language_name      object
genre              object
service            object
country            object
mobile_operator    object
usage_category     object
number_of_units     int64
duration            int64
dtype: object

In [0]:
#
#data['Status'].unique()
#pd.get_dummies(data, columns=["Status"], prefix=["Status"],drop_first=True).head()

In [0]:
# Importing label encoder
# 
from sklearn.preprocessing import LabelEncoder
# Categorical boolean mask
categorical_feature_mask = consumption.dtypes==object
# filter categorical columns using mask and turn it into a list
categorical_features = consumption.columns[categorical_feature_mask].tolist()
categorical_features

['language_name',
 'genre',
 'service',
 'country',
 'mobile_operator',
 'usage_category']

In [0]:
#initializing the label encoder
le = LabelEncoder()
# Converting the variables to numerical
for i in range(6):
    new = le.fit_transform(consumption[categorical_features[i]])
    consumption[categorical_features[i]] = new
consumption.head()

Unnamed: 0,language_name,genre,service,country,mobile_operator,usage_category,number_of_units,duration
0,0,48,1,0,0,10,2,9
1,1,31,1,2,0,21,3,181
2,1,34,1,0,0,11,61,2102
3,1,34,1,0,0,20,11,179
4,1,62,1,2,0,21,3,24


In [0]:
consumption.shape

(215, 8)

In [0]:
# Importing our libraries
# 
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
X=consumption.drop(['duration'],axis=1)
y=consumption[['duration']]
X.shape,y.shape

((215, 7), (215, 1))

In [0]:
# Creating the linear regression model 
# 
regression=LinearRegression()
regression.fit(X,y)
first_model=(mean_squared_error(y_true=y,y_pred=regression.predict(X)))
print(first_model) 

993723097.0287952


In [0]:
# 
coef_dict_baseline = {}
for coef, feat in zip(regression.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'language_name': array([   -55.66772427,    100.4162884 , -39931.18481794,   -346.72512191,
        -39644.49439572,    150.23883256,     41.84253902])}

In [0]:
# 
elastic=ElasticNet(normalize=True)
search=GridSearchCV(estimator=elastic,param_grid={'alpha':np.logspace(-5,2,8),'l1_ratio':[.2,.4,.6,.8]},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)

In [0]:
#  fit the model and display the best parameters and the best results we can get with that setup.
# 
search.fit(X,y)
search.best_params_
abs(search.best_score_)

3779822714.381112

In [0]:
#set our hyperparameters to the recommended values and run on the data.
# 
elastic=ElasticNet(normalize=True,alpha=0.001,l1_ratio=0.75)
elastic.fit(X,y)
second_model=(mean_squared_error(y_true=y,y_pred=elastic.predict(X)))
print(second_model)

1101444543.2339344


In [0]:
#the coefficients
# 
coef_dict_baseline = {}
for coef, feat in zip(elastic.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'language_name': -125.77597054653384,
 'genre': 71.81959388252378,
 'service': -20330.873448440994,
 'country': -917.9311024990088,
 'mobile_operator': -19302.45686560342,
 'usage_category': 134.77227293241054,
 'number_of_units': 39.234502101964694}