In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [2]:
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer, StandardScaler
from sklearn.dummy import DummyRegressor

In [3]:
df_train = pd.read_csv('../datasets/complete_clean_train_extra.csv')
df_test = pd.read_csv('../datasets/complete_clean_test_extra.csv')

# Part I

The goal is to run all the features through the lasso regression model, and identify the features that have an effect on SalePrice. The lasso regression helps in getting rid of the features that do not affect the SalePrice. This will help in reducing the complexity of the model, thereby, reducing the variance. 

## Model Prep

In [4]:
features = list(df_train.select_dtypes(include=['int64','float64','uint8']).drop('SalePrice',axis=1).columns)

In [5]:
X_train = df_train[features]
y_train = df_train['SalePrice']

In [6]:
X_test = df_test[features]

## Scaling Features

In [7]:
ss = StandardScaler()

In [8]:
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

## Power Transforming the y

As mentioned before, the target variable 'SalePrice' is not normally distributed, it is skewed to the right. The 'SalePrice' variable is tehere being power transformed. This will make the distribution of the variable more Gaussian. In other words, this will stabilize the variance in target variable. 

In [9]:
pt = PowerTransformer()

In [10]:
y_train_pt = pt.fit_transform(y_train.to_frame())

## Instantiating the Model

In [11]:
l_alphas = np.logspace(-3,0,100)

In [12]:
lasso_cv = LassoCV(alphas = l_alphas)

## Model Validation

In [13]:
cross_val_score(lasso_cv,X_train_sc,y_train_pt).mean()

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.8500642289617744

## Fitting the Model

In [14]:
lasso_cv.fit(X_train_sc,y_train_pt)

  return f(*args, **kwargs)


LassoCV(alphas=array([0.001     , 0.00107227, 0.00114976, 0.00123285, 0.00132194,
       0.00141747, 0.00151991, 0.00162975, 0.00174753, 0.00187382,
       0.00200923, 0.00215443, 0.00231013, 0.00247708, 0.00265609,
       0.00284804, 0.00305386, 0.00327455, 0.00351119, 0.00376494,
       0.00403702, 0.00432876, 0.00464159, 0.00497702, 0.0053367 ,
       0.00572237, 0.00613591, 0.00657933, 0.0070548 , 0.00756463,
       0.008...
       0.09326033, 0.1       , 0.10722672, 0.1149757 , 0.12328467,
       0.13219411, 0.14174742, 0.15199111, 0.16297508, 0.17475284,
       0.18738174, 0.2009233 , 0.21544347, 0.23101297, 0.24770764,
       0.26560878, 0.28480359, 0.30538555, 0.32745492, 0.35111917,
       0.37649358, 0.40370173, 0.43287613, 0.46415888, 0.49770236,
       0.53366992, 0.57223677, 0.61359073, 0.65793322, 0.70548023,
       0.75646333, 0.81113083, 0.869749  , 0.93260335, 1.        ]))

In [15]:
pred = lasso_cv.predict(X_test_sc)

In [16]:
pred_reversed = pt.inverse_transform(pred.reshape(-1,1))

In [17]:
pred_df = pd.DataFrame(data = pred_reversed,index = df_test['Id'],columns=['SalePrice'])
pred_df

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,138884.547901
2718,159203.578994
2414,213464.720838
1989,111828.713648
625,179096.194869
...,...
1662,193154.085649
1234,204011.658234
1373,126963.185073
1672,113765.831851


In [19]:
pred_df.to_csv('../datasets/kaggle_submission_recreate.csv')

In [61]:
kag_test = test[features_from_lasso]

In [65]:
kag_test_sc = ss.transform(kag_test)

In [67]:
kag_pred_lasso = lasso_cv.predict(kag_test_sc)

In [68]:
kag_pred_lasso_inverse = pt.inverse_transform(kag_pred_lasso.reshape(-1,1))

In [70]:
kag_subm = pd.DataFrame(kag_pred_lasso_inverse,index=test['Id'],columns=['SalePrice'])
kag_subm

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,137592.167464
2718,153702.970924
2414,221976.705894
1989,111085.292561
625,182102.671807
...,...
1662,194879.244946
1234,209231.124536
1373,124034.621163
1672,112485.886708


In [71]:
kag_subm.to_csv('../datasets/kaggle_submission.csv')

In [72]:
model_lasso = LassoCV(alphas = l_alphas)

In [76]:
X_train = df_train[features_from_lasso]
y_train = df_train['SalePrice']
X_test = test[features_from_lasso]

In [77]:
ss = StandardScaler()

In [79]:
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [80]:
pt = PowerTransformer()
y_train_pt = pt.fit_transform(y_train.to_frame())

In [81]:
model_lasso.fit(X_train_sc,y_train_pt)

  return f(*args, **kwargs)


LassoCV(alphas=array([0.001     , 0.00107227, 0.00114976, 0.00123285, 0.00132194,
       0.00141747, 0.00151991, 0.00162975, 0.00174753, 0.00187382,
       0.00200923, 0.00215443, 0.00231013, 0.00247708, 0.00265609,
       0.00284804, 0.00305386, 0.00327455, 0.00351119, 0.00376494,
       0.00403702, 0.00432876, 0.00464159, 0.00497702, 0.0053367 ,
       0.00572237, 0.00613591, 0.00657933, 0.0070548 , 0.00756463,
       0.008...
       0.09326033, 0.1       , 0.10722672, 0.1149757 , 0.12328467,
       0.13219411, 0.14174742, 0.15199111, 0.16297508, 0.17475284,
       0.18738174, 0.2009233 , 0.21544347, 0.23101297, 0.24770764,
       0.26560878, 0.28480359, 0.30538555, 0.32745492, 0.35111917,
       0.37649358, 0.40370173, 0.43287613, 0.46415888, 0.49770236,
       0.53366992, 0.57223677, 0.61359073, 0.65793322, 0.70548023,
       0.75646333, 0.81113083, 0.869749  , 0.93260335, 1.        ]))

In [83]:
cross_val_score(model_lasso, X_train_sc,y_train_pt).mean()

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.8504705208402787

In [84]:
kag_pred = lasso_cv.predict(X_test_sc)

In [85]:
kag_pred_inverse =pt.inverse_transform(kag_pred.reshape(-1,1))

In [86]:
kag_subm = pd.DataFrame(kag_pred_inverse,index=test['Id'],columns=['SalePrice'])
kag_subm

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,137943.694223
2718,154568.710372
2414,222930.797561
1989,110343.091504
625,183005.159648
...,...
1662,196141.562393
1234,210463.987748
1373,123839.361877
1672,111896.319699


In [87]:
kag_subm.to_csv('../datasets/kaggle_submission_final.csv')