In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [2]:
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer, StandardScaler
from sklearn.dummy import DummyRegressor

In [3]:
# Load the pre-cleaned training and test sets produced by the earlier
# cleaning notebooks; paths are relative to this notebook's directory.
train_csv = '../datasets/complete_clean_train_extra.csv'
test_csv = '../datasets/complete_clean_test_extra.csv'

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [4]:
# Rank every column by its correlation with SalePrice (strongest positive
# first) and keep the column names in that order.
price_corr = df_train.corr()[['SalePrice']]
price_corr_ranked = price_corr.sort_values(by='SalePrice', ascending=False)
features = list(price_corr_ranked.index)

In [5]:
# Columns that must not be used as predictors. 'SalePrice' is the target
# itself; the others are excluded by choice.
# NOTE(review): 'PID' is not listed here and therefore stays in the feature
# set -- presumably it is an identifier like 'Id'; confirm whether it should
# be excluded as well.
features_not_req = [
    'Id',
    'Condition 2',
    'Central Air',
    'P',
    'Y',
    'SalePrice',
]

In [6]:
# Drop the excluded columns while preserving the correlation-based ordering.
excluded_columns = set(features_not_req)
features = [name for name in features if name not in excluded_columns]

In [7]:
# Inspect the candidate predictors that remain after the exclusions above,
# still ordered from highest to lowest correlation with SalePrice.
features

['Overall Qual',
 'Gr Liv Area',
 'Garage Area',
 'Garage Cars',
 'Total Bsmt SF',
 '1st Flr SF',
 'Year Built',
 'Year Remod/Add',
 'Full Bath',
 'TotRms AbvGrd',
 'Mas Vnr Area',
 'Fireplaces',
 'BsmtFin SF 1',
 'Lot Area',
 'Open Porch SF',
 'Wood Deck SF',
 'Lot Frontage',
 'Bsmt Full Bath',
 'Half Bath',
 'Garage Yr Blt',
 '2nd Flr SF',
 'Bsmt Unf SF',
 'Bedroom AbvGr',
 'Screen Porch',
 'Bsmt Exposure',
 'Bldg Type',
 'Condition 1',
 '3Ssn Porch',
 'Mo Sold',
 'Pool Area',
 'BsmtFin SF 2',
 'Bsmt Cond',
 'Misc Val',
 'NoSeWa',
 'Yr Sold',
 'Garage Cond',
 'Electrical',
 'NoSewr',
 'Low Qual Fin SF',
 'Bsmt Half Bath',
 'Remodel',
 'Street',
 'MS SubClass',
 'BsmtFin Type 1',
 'Overall Cond',
 'Kitchen AbvGr',
 'Enclosed Porch',
 'PID',
 'Bsmt Qual',
 'Kitchen Qual',
 'Age']

# Part I

The goal is to run all of the features through a lasso regression model and identify the ones that have an effect on SalePrice. The lasso penalty shrinks the coefficients of features that do not affect SalePrice to zero, which effectively removes them from the model. This reduces the complexity of the model and, in turn, its variance.

## Model Prep

In [8]:
# Design matrix: the selected feature columns of the training set.
X_train = df_train.loc[:, features]

# Target: the sale price of each training observation.
target_column = 'SalePrice'
y_train = df_train[target_column]

In [9]:
# Same feature columns, in the same order, taken from the test set so the
# fitted transformers and model can be applied to it unchanged.
X_test = df_test.loc[:, features]

## Scaling Features

In [10]:
# Standardise every feature (zero mean, unit variance) so the lasso penalty
# treats all coefficients on a comparable scale. The arguments below are the
# library defaults, spelled out for the reader.
ss = StandardScaler(with_mean=True, with_std=True)

In [11]:
# Learn the scaling statistics from the training features only, then apply
# the same transformation to both the training and the test features.
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

## Power Transforming the y

As mentioned before, the target variable 'SalePrice' is not normally distributed; it is skewed to the right. The 'SalePrice' variable is therefore power transformed here. This makes the distribution of the variable more Gaussian; in other words, it stabilizes the variance of the target variable.

In [12]:
# Power transformer for the target. The arguments below are the library
# defaults (Yeo-Johnson, followed by standardisation), spelled out explicitly.
pt = PowerTransformer(method='yeo-johnson', standardize=True)

In [13]:
# The transformer expects 2-D input, so the target Series is first turned
# into a single-column frame; the result is a column of transformed prices.
sale_price_frame = y_train.to_frame()
y_train_pt = pt.fit_transform(sale_price_frame)

## Instantiating the Model

In [14]:
# Candidate regularisation strengths for the lasso: 100 values spaced evenly
# on a log scale from 10**-3 to 10**0.
l_alphas = np.logspace(start=-3, stop=0, num=100)

In [15]:
# Lasso regression that picks its own penalty strength by internal
# cross-validation over the candidate grid defined in `l_alphas`.
lasso_cv = LassoCV(
    alphas=l_alphas,
)

## Model Validation

In [16]:
# Mean cross-validated score of the lasso on the scaled features and the
# power-transformed target (the estimator's default score, i.e. R^2 measured
# on the transformed scale).
#
# `y_train_pt` is an (n_samples, 1) column because PowerTransformer works on
# 2-D input; the regressor expects a 1-D target and otherwise emits a
# DataConversionWarning on every fold, so the target is flattened here.
#
# NOTE(review): the scaler and the power transformer were fitted on the full
# training set before this call, so each validation fold has already
# influenced the preprocessing; wrapping the preprocessing and the model in a
# Pipeline would give a stricter estimate.
cross_val_score(lasso_cv, X_train_sc, np.ravel(y_train_pt)).mean()

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.8467102719505715

## Fitting the Model

In [17]:
# Fit the lasso on the whole (scaled) training set.
# The transformed target is a single-column 2-D array; flatten it to the 1-D
# shape the regressor expects so no DataConversionWarning is raised.
lasso_cv.fit(X_train_sc, np.ravel(y_train_pt))

  return f(*args, **kwargs)


LassoCV(alphas=array([0.001     , 0.00107227, 0.00114976, 0.00123285, 0.00132194,
       0.00141747, 0.00151991, 0.00162975, 0.00174753, 0.00187382,
       0.00200923, 0.00215443, 0.00231013, 0.00247708, 0.00265609,
       0.00284804, 0.00305386, 0.00327455, 0.00351119, 0.00376494,
       0.00403702, 0.00432876, 0.00464159, 0.00497702, 0.0053367 ,
       0.00572237, 0.00613591, 0.00657933, 0.0070548 , 0.00756463,
       0.008...
       0.09326033, 0.1       , 0.10722672, 0.1149757 , 0.12328467,
       0.13219411, 0.14174742, 0.15199111, 0.16297508, 0.17475284,
       0.18738174, 0.2009233 , 0.21544347, 0.23101297, 0.24770764,
       0.26560878, 0.28480359, 0.30538555, 0.32745492, 0.35111917,
       0.37649358, 0.40370173, 0.43287613, 0.46415888, 0.49770236,
       0.53366992, 0.57223677, 0.61359073, 0.65793322, 0.70548023,
       0.75646333, 0.81113083, 0.869749  , 0.93260335, 1.        ]))

In [18]:
# Predict the (power-transformed) sale price for every test observation.
pred = lasso_cv.predict(X=X_test_sc)

In [19]:
# Undo the power transform so predictions are back on the original SalePrice
# scale. The transformer works on 2-D input, hence the reshape to one column.
pred_column = pred.reshape(-1, 1)
pred_reversed = pt.inverse_transform(pred_column)

In [20]:
# Assemble the submission table: a single SalePrice column indexed by the
# test set's Id values.
pred_df = pd.DataFrame(
    data=pred_reversed,
    columns=['SalePrice'],
    index=df_test['Id'],
)
pred_df

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,142424.257692
2718,156432.463906
2414,213116.155485
1989,114144.209991
625,179012.202727
...,...
1662,194069.873947
1234,202807.792013
1373,125588.136255
1672,113790.119240


In [21]:
# Write the predictions to disk; the Id index is written as the first column.
submission_path = '../datasets/kaggle_submission_copy.csv'
pred_df.to_csv(submission_path)

The Kaggle score for this model was 24492.29. This score is the root mean squared error (RMSE).