In [1]:
# Supress Warnings

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import package
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model, metrics
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Read the given CSV file, and view some sample records

df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
df.shape

(1460, 81)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [6]:
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


## EDA

few columns do not match the correct datatype according to there description given in the assignment. so converting them to correct types

In [7]:
df[['MSSubClass', 'OverallQual', 'OverallCond']] = df[['MSSubClass', 'OverallQual', 'OverallCond']].astype('object')
df['LotFrontage'] = pd.to_numeric(df['LotFrontage'], errors='coerce')
df['MasVnrArea'] = pd.to_numeric(df['MasVnrArea'], errors='coerce')
df['GarageYrBlt'] = pd.to_numeric(df['GarageYrBlt'], errors='coerce')

### droping any column that has majority of the values as NaN.
here majority means 90% of the records have NaN values for column under consideration. such columns are mostly useless and can't have any reliable relation to the target variable

In [8]:
df = df.drop(['Id'], axis=1)

In [9]:
na_counts = df.isna().sum(axis=0)
df = df.drop(na_counts[na_counts >= (df.shape[0] * 0.1)].index, axis=1)
df.shape

(1460, 74)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   object 
 1   MSZoning       1460 non-null   object 
 2   LotArea        1460 non-null   int64  
 3   Street         1460 non-null   object 
 4   LotShape       1460 non-null   object 
 5   LandContour    1460 non-null   object 
 6   Utilities      1460 non-null   object 
 7   LotConfig      1460 non-null   object 
 8   LandSlope      1460 non-null   object 
 9   Neighborhood   1460 non-null   object 
 10  Condition1     1460 non-null   object 
 11  Condition2     1460 non-null   object 
 12  BldgType       1460 non-null   object 
 13  HouseStyle     1460 non-null   object 
 14  OverallQual    1460 non-null   object 
 15  OverallCond    1460 non-null   object 
 16  YearBuilt      1460 non-null   int64  
 17  YearRemodAdd   1460 non-null   int64  
 18  RoofStyl

#### taking care of all the remaing columns that have few missing values individually

I roughly investigated and figured out correct missing values for below remaining NaN values and filled it up using fillna

In [11]:
fillna_values = {'MasVnrType': 'None', 'MasVnrArea': 0.0, 'BsmtQual': 'None', 'BsmtCond': 'None',
            'BsmtExposure': 'None', 'BsmtFinType1': 'None', 'BsmtFinType2': 'None', 'Electrical': 'SBrkr',
                'GarageType': 'None', 'GarageYrBlt': 'None', 'GarageFinish': 'None', 'GarageQual': 'None', 'GarageCond': 'None'}

df = df.fillna(value=fillna_values)

> after this change we are left will bit lesser columns, column count reduced from 81 to 75, dropping 6 columns that mostly have NaN values in the recores.

### Outlier Handling

In [12]:
## Capping outliers to 5% at lower bound and 95% at upper bound
num_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
for col in num_cols:
    df[col][df[col] <= df[col].quantile(0.05)] = df[col].quantile(0.05)
    df[col][df[col] >= df[col].quantile(0.95)] = df[col].quantile(0.95)

### changing categorical columns to dummy columns

easiest way to incorporate all categorical columns without going through all of them (as there are quite a lot in this dataset) is to create dummy columns from the categorical columna and remove the original column.

In [13]:
# creating dummy variables for categorical variables

# subset all categorical variables
houses_categorical = df.select_dtypes(include=['object'])
houses_categorical.head()

Unnamed: 0,MSSubClass,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,KitchenQual,Functional,GarageType,GarageYrBlt,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,60,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Gd,Typ,Attchd,2003.0,RFn,TA,TA,Y,WD,Normal
1,20,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,TA,Typ,Attchd,1976.0,RFn,TA,TA,Y,WD,Normal
2,60,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Gd,Typ,Attchd,2001.0,RFn,TA,TA,Y,WD,Normal
3,70,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Gd,Typ,Detchd,1998.0,Unf,TA,TA,Y,WD,Abnorml
4,60,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Gd,Typ,Attchd,2000.0,RFn,TA,TA,Y,WD,Normal


In [14]:
# convert into dummies - one hot encoding
houses_dummies = pd.get_dummies(houses_categorical, drop_first=True)
houses_dummies.head()

Unnamed: 0,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [15]:
# drop categorical variables 
df = df.drop(list(houses_categorical.columns), axis=1)

In [16]:
# concat dummy variables with X
df = pd.concat([df, houses_dummies], axis=1)

In [17]:
df.shape

(1460, 365)

In [18]:
df.describe()

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,9682.319212,1971.787671,1984.794521,92.047945,431.089041,32.931233,556.537671,1055.950616,1151.86137,337.731267,...,0.003425,0.003425,0.083562,0.002055,0.867808,0.00274,0.008219,0.013699,0.820548,0.085616
std,3469.967624,28.872129,20.56562,140.590668,412.214952,100.569218,414.582219,346.901853,333.684061,415.255848,...,0.05844,0.05844,0.276824,0.045299,0.338815,0.052289,0.090317,0.116277,0.383862,0.279893
min,3311.7,1916.0,1950.0,0.0,0.0,0.0,0.0,519.3,672.95,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7553.5,1954.0,1967.0,0.0,0.0,0.0,223.0,795.75,882.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
50%,9478.5,1973.0,1994.0,0.0,383.5,0.0,477.5,991.5,1087.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,11601.5,2000.0,2004.0,164.25,712.25,0.0,808.0,1298.25,1391.25,728.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,17401.15,2007.0,2007.0,456.0,1274.0,396.2,1468.0,1753.0,1831.25,1141.05,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


> we can see above that we ended up with only numeric columns which is great for analysing regression model further

## Data Preparation

In [19]:
x_columns = df.drop(['SalePrice'], axis=1).columns
X = df.loc[:, x_columns]
y = df['SalePrice']

### train-test split

In [20]:
# split into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size = 0.3, random_state=100)

### Feature Scaling

it is good practice to scale features for Advanced Regression

In [21]:
## Create a list of numerical columns to be scaled
num_cols = list(X_train.select_dtypes(include=['int64', 'float64']).columns)

## Create a scaling instance
scaler = StandardScaler()

## Scale the numerical columns 
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

## Model Building

As we are building Advanced Linear Regression models which are Ridge and Lasso. it is good practice to determine correct `alpha` value for our dataset by using GridSearchCV and then create our LR models using the evaluated alpha values

> note: if the alpha value is too low then it will not be able to handle overfitting as expected. but if we assign a value that is very high, then we will get underfitting as a side-effect.

## Ridge Model

### determining optimal alpha value

In [22]:
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}

# cross validation
folds = 5
model_cv = GridSearchCV(estimator = Ridge(), 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error',  
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
model_cv.fit(X_train, y_train) 

Fitting 5 folds for each of 28 candidates, totalling 140 fits


In [23]:
# Printing the best hyperparameter alpha
print(model_cv.best_params_)

{'alpha': 10.0}


### Fitting the Model

In [24]:
ridge = Ridge(alpha=model_cv.best_params_['alpha'])

ridge.fit(X_train, y_train)

In [25]:
ridge.coef_

array([ 4.47166337e+03,  7.23442002e+03,  4.41364452e+03,  1.65195496e+03,
        2.49818539e+03,  1.49781076e+01, -2.51839743e+03,  1.07203240e+04,
        3.51678713e+02,  5.82222937e+03,  0.00000000e+00,  1.89384987e+04,
        1.01748379e+03, -1.06948571e+03, -2.34824252e+01,  1.99812527e+02,
       -1.00566952e+03,  0.00000000e+00, -4.68628727e+02,  2.70623673e+03,
        2.20236820e+03,  4.93867377e+03,  1.86268867e+03,  7.25277575e+02,
       -1.03351988e+02,  0.00000000e+00,  4.19599783e+02,  0.00000000e+00,
        0.00000000e+00,  3.93121409e+02, -4.00221596e+02,  3.48339427e+02,
        1.39021734e+02,  2.87811944e+03,  7.95138742e+02,  3.35584395e+02,
        3.66747505e+03,  2.78574813e+03, -6.47682945e+02, -1.01830412e+03,
       -7.58106027e+03, -3.10052112e+03, -5.98989270e+03, -3.14001243e+03,
       -6.40158524e+03,  4.59315710e+03,  3.19153693e+03,  3.26641894e+03,
       -1.48799326e+03, -1.54038995e+03, -5.00864290e+02, -3.69131105e+03,
        5.94994490e+02,  

### calculate model accuracy

calculate mutliple accuracy terms such as R2 score, RSS and RMSE

In [26]:
y_pred_train = ridge.predict(X_train)
y_pred_test = ridge.predict(X_test)

metric2 = []
r2_train_lr = r2_score(y_train, y_pred_train)
print(r2_train_lr)
metric2.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)
print(r2_test_lr)
metric2.append(r2_test_lr)

rss1_lr = np.sum(np.square(y_train - y_pred_train))
print(rss1_lr)
metric2.append(rss1_lr)

rss2_lr = np.sum(np.square(y_test - y_pred_test))
print(rss2_lr)
metric2.append(rss2_lr)

mse_train_lr = mean_squared_error(y_train, y_pred_train)
print(mse_train_lr)
metric2.append(mse_train_lr**0.5)

mse_test_lr = mean_squared_error(y_test, y_pred_test)
print(mse_test_lr)
metric2.append(mse_test_lr**0.5)

0.9445577235259873
0.8949197526116583
238055942673.13965
186960104237.74423
233159591.25674793
426849553.0542106


## Lasso Model

### determining optimal alpha value

In [27]:
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}

# cross validation
folds = 5
model_cv = GridSearchCV(estimator = Lasso(), 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)             
model_cv.fit(X_train, y_train) 

Fitting 5 folds for each of 28 candidates, totalling 140 fits


In [28]:
# Printing the best hyperparameter alpha
print(model_cv.best_params_)

{'alpha': 100}


In [29]:
lasso = Lasso(alpha=model_cv.best_params_['alpha'])
        
lasso.fit(X_train, y_train) 

In [30]:
lasso.coef_

array([ 4.53939749e+03,  8.50808035e+03,  4.49935096e+03,  1.32746611e+03,
        4.22965493e+03,  0.00000000e+00, -4.97111952e+02,  9.51934608e+03,
       -0.00000000e+00,  3.05400504e+03,  0.00000000e+00,  2.05269053e+04,
        1.41304035e+03, -7.86445973e+02, -0.00000000e+00,  8.01999151e+02,
       -1.04985121e+03,  0.00000000e+00, -6.80037228e+02,  2.67405496e+03,
        1.67209118e+03,  4.18287692e+03,  1.86322993e+03,  3.85266612e+02,
        0.00000000e+00,  0.00000000e+00,  3.88800396e+02,  0.00000000e+00,
        0.00000000e+00,  9.71042567e+01, -3.22920800e+02,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -1.45491187e+04, -0.00000000e+00, -4.59272349e+03, -0.00000000e+00,
       -9.16925140e+03,  0.00000000e+00,  0.00000000e+00,  1.45339227e+03,
       -2.08442010e+03, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        1.20469848e+02,  

### calculate model accuracy

calculate mutliple accuracy terms such as R2 score, RSS and RMSE

In [31]:
y_pred_train = lasso.predict(X_train)
y_pred_test = lasso.predict(X_test)

metric3 = []
r2_train_lr = r2_score(y_train, y_pred_train)
print(r2_train_lr)
metric3.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)
print(r2_test_lr)
metric3.append(r2_test_lr)

rss1_lr = np.sum(np.square(y_train - y_pred_train))
print(rss1_lr)
metric3.append(rss1_lr)

rss2_lr = np.sum(np.square(y_test - y_pred_test))
print(rss2_lr)
metric3.append(rss2_lr)

mse_train_lr = mean_squared_error(y_train, y_pred_train)
print(mse_train_lr)
metric3.append(mse_train_lr**0.5)

mse_test_lr = mean_squared_error(y_test, y_pred_test)
print(mse_test_lr)
metric3.append(mse_test_lr**0.5)

0.9363660267112548
0.8920490727846311
273229139578.89355
192067654067.83484
267609343.36816216
438510625.7256503


## Conslusions

### compare metric results
lets look at all the metric together for both Ridge and Lasso, this will help us in determining which one is predicting better

In [32]:
# Creating a table which contain all the metrics

lr_table = {'Metric': ['R2 Score (Train)','R2 Score (Test)','RSS (Train)','RSS (Test)',
                       'MSE (Train)','MSE (Test)']
        }

lr_metric = pd.DataFrame(lr_table ,columns = ['Metric'] )

rg_metric = pd.Series(metric2, name = 'Ridge Regression')
ls_metric = pd.Series(metric3, name = 'Lasso Regression')

final_metric = pd.concat([lr_metric, rg_metric, ls_metric], axis = 1)

final_metric

Unnamed: 0,Metric,Ridge Regression,Lasso Regression
0,R2 Score (Train),0.9445577,0.936366
1,R2 Score (Test),0.8949198,0.8920491
2,RSS (Train),238055900000.0,273229100000.0
3,RSS (Test),186960100000.0,192067700000.0
4,MSE (Train),15269.56,16358.77
5,MSE (Test),20660.34,20940.65


Here we can safely say that Lasso's prediction is much more precise/accurate as compared to Ridge in this dataset. as we can see that Evaluation metric R2 Score's and better for Lasso.

### compare coef values

In [33]:
coefs = pd.DataFrame(index=X.columns)
coefs.rows = X.columns

coefs['Ridge'] = ridge.coef_
coefs['Lasso'] = lasso.coef_

In [34]:
pd.set_option('display.max_rows', None)
coefs.head(coefs.shape[0])

Unnamed: 0,Ridge,Lasso
LotArea,4471.663372,4539.397491
YearBuilt,7234.420024,8508.080349
YearRemodAdd,4413.644522,4499.350965
MasVnrArea,1651.954961,1327.466108
BsmtFinSF1,2498.185392,4229.654927
BsmtFinSF2,14.978108,0.0
BsmtUnfSF,-2518.397434,-497.111952
TotalBsmtSF,10720.323992,9519.346084
1stFlrSF,351.678713,-0.0
2ndFlrSF,5822.229372,3054.005043


> if you compare all the coefficients between _Ridge_ and _Lasso_, you can easily say that a lot of feature's coef were approaching to zero(0) in Ridge Regression Model but Lasso eliminated them. `Lasso Regression Model the features are eliminated by assigning exact zero coef to the feature, inturn making the model more precise` and use much lesser features.

## Assignment Question Work

#### question-1

In [35]:
ridge2 = Ridge(alpha=20.0)

ridge2.fit(X_train, y_train)

In [36]:
y_pred_train = ridge2.predict(X_train)
y_pred_test = ridge2.predict(X_test)

metric2 = []
r2_train_lr = r2_score(y_train, y_pred_train)
print(r2_train_lr)
metric2.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)
print(r2_test_lr)
metric2.append(r2_test_lr)

rss1_lr = np.sum(np.square(y_train - y_pred_train))
print(rss1_lr)
metric2.append(rss1_lr)

rss2_lr = np.sum(np.square(y_test - y_pred_test))
print(rss2_lr)
metric2.append(rss2_lr)

mse_train_lr = mean_squared_error(y_train, y_pred_train)
print(mse_train_lr)
metric2.append(mse_train_lr**0.5)

mse_test_lr = mean_squared_error(y_test, y_pred_test)
print(mse_test_lr)
metric2.append(mse_test_lr**0.5)

0.9372553217396762
0.8957988229843927
269410718334.87024
185396051120.21252
263869459.68155754
423278655.52559936


In [37]:
lasso2 = Lasso(alpha=200.0)
        
lasso2.fit(X_train, y_train) 

In [38]:
y_pred_train = lasso2.predict(X_train)
y_pred_test = lasso2.predict(X_test)

metric3 = []
r2_train_lr = r2_score(y_train, y_pred_train)
print(r2_train_lr)
metric3.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)
print(r2_test_lr)
metric3.append(r2_test_lr)

rss1_lr = np.sum(np.square(y_train - y_pred_train))
print(rss1_lr)
metric3.append(rss1_lr)

rss2_lr = np.sum(np.square(y_test - y_pred_test))
print(rss2_lr)
metric3.append(rss2_lr)

mse_train_lr = mean_squared_error(y_train, y_pred_train)
print(mse_train_lr)
metric3.append(mse_train_lr**0.5)

mse_test_lr = mean_squared_error(y_test, y_pred_test)
print(mse_test_lr)
metric3.append(mse_test_lr**0.5)

0.9256276795217654
0.889662443417596
319337047217.6354
196313975201.70795
312768900.3111022
448205422.8349497


In [39]:
coefs['Ridge2'] = ridge2.coef_
coefs['Lasso2'] = lasso2.coef_

In [40]:
coefs['Lasso2'].sort_values(ascending=False)[:10]

OverallQual_9           44323.280364
OverallQual_8           30809.743942
GrLivArea               20647.939795
Neighborhood_Crawfor    16599.132979
Functional_Typ          14148.727037
Exterior1st_BrkFace     12388.178154
OverallCond_9           12161.862447
Neighborhood_Somerst    11259.624975
OverallQual_7           10469.172344
BsmtExposure_Gd          9456.083538
Name: Lasso2, dtype: float64

In [41]:
coefs['Ridge2'].sort_values(ascending=False)[:10]

OverallQual_9           18321.839488
GrLivArea               17086.684738
OverallQual_8           15582.299778
Neighborhood_Crawfor    11774.195154
Functional_Typ          10700.316884
OverallCond_9           10353.411098
TotalBsmtSF             10174.599817
Exterior1st_BrkFace      9361.443768
Neighborhood_NridgHt     8789.504371
Neighborhood_Somerst     8634.782518
Name: Ridge2, dtype: float64

#### question-3

In [42]:
top5_lasso = list(coefs['Lasso'].sort_values(ascending=False)[:5].index)
top5_lasso

['OverallQual_9',
 'OverallQual_8',
 'OverallCond_9',
 'GrLivArea',
 'Neighborhood_Crawfor']

In [43]:
X_train_dropped = X_train.drop(top5_lasso, axis = 1)
X_test_dropped = X_test.drop(top5_lasso, axis = 1)

In [44]:
lasso3 = Lasso(alpha=20.0)
        
lasso3.fit(X_train_dropped, y_train) 

In [45]:
y_pred_train = lasso3.predict(X_train_dropped)
y_pred_test = lasso3.predict(X_test_dropped)

metric3 = []
r2_train_lr = r2_score(y_train, y_pred_train)
print(r2_train_lr)
metric3.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)
print(r2_test_lr)
metric3.append(r2_test_lr)

rss1_lr = np.sum(np.square(y_train - y_pred_train))
print(rss1_lr)
metric3.append(rss1_lr)

rss2_lr = np.sum(np.square(y_test - y_pred_test))
print(rss2_lr)
metric3.append(rss2_lr)

mse_train_lr = mean_squared_error(y_train, y_pred_train)
print(mse_train_lr)
metric3.append(mse_train_lr**0.5)

mse_test_lr = mean_squared_error(y_test, y_pred_test)
print(mse_test_lr)
metric3.append(mse_test_lr**0.5)

0.950696679730601
0.87388161960473
211696725496.9937
224391416379.66095
207342532.31830922
512309169.8165775


In [46]:
coefs2 = pd.DataFrame(index=X_train_dropped.columns)
coefs2.rows = X.columns
coefs2['Lasso3'] = lasso3.coef_
coefs2['Lasso3'].sort_values(ascending=False)[:10]

Condition2_PosA        89293.897629
RoofMatl_WdShngl       28765.357152
RoofMatl_CompShg       27101.515618
GarageYrBlt_1939.0     22552.574021
RoofMatl_WdShake       21547.370480
2ndFlrSF               19254.646028
Exterior1st_BrkFace    18575.643418
RoofMatl_Roll          16133.343446
SaleType_Con           15031.722192
SaleType_ConLD         14753.047099
Name: Lasso3, dtype: float64