In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [None]:
# Raise pandas' display limits so wide/long frames are not truncated.
# Full option names are spelled out: abbreviated names are resolved by
# pattern matching and can become ambiguous if pandas adds similar options.
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

### Loading Original Data

In [None]:
# Read the workbook; header=3 takes the column names from row index 3
# (0-based), so the rows above it are not used as data.
data = pd.read_excel(io="Original_Data.xlsx", header=3)

In [None]:
# Preview the first five rows of the loaded sheet.
data.head()

#### Selecting features based on book description

In [None]:
# Nominal columns: no inherent order, so they are one-hot encoded later.
categorical = [
    'Neighborhood',
    'LotShape',
    'LotConfig',
    'Condition1',
    'Condition2',
    'BldgType',
]

# Quality ratings with a natural order: mapped to an integer scale later.
ordinal = ['BsmtQual', 'ExterCond', 'KitchenQual']

# Columns with missing values that are imputed later.
missing = ['LotFrontage']

In [None]:
# Numeric columns that are used as-is (apart from scaling further down).
numerical_features = [
    # lot and overall ratings
    'LotArea', 'OverallQual', 'OverallCond',
    # construction / remodel years
    'YearBuilt', 'YearRemodAdd',
    # basement areas
    'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF',
    # living areas
    '1stFlrSF', '2ndFlrSF', 'GrLivArea',
    # room counts
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd',
    'Fireplaces',
    # garage
    'GarageCars', 'GarageArea',
    # outdoor areas
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
]

In [None]:
# Name of the column the models will predict.
target = "SalePrice"

In [None]:
# Gather every selected column name into one list, in the order:
# ordinal, numeric, categorical, then the column that needs imputation.
features = [*ordinal, *numerical_features, *categorical, *missing]
features

In [None]:
# Pull the predictor frame and the target series out of the sheet.
# Both are copies, so later edits do not write back into `data`.
X_raw, y_raw = data[features].copy(), data[target].copy()


In [None]:
# Preview the first rows of the selected predictors.
X_raw.head()

In [None]:
# Preview the first values of the target series.
y_raw.head()

In [None]:
# Summary statistics of the predictors. On a frame with mixed dtypes,
# describe() reports only the numeric columns by default, so the text-valued
# categorical/ordinal columns do not appear here.
X_raw.describe()

### Notice that the categorical variables are still in their "raw" form - still text

### Checking for the null values

In [None]:
# Fraction of missing values per selected column (0 = none, 1 = all missing).
X_raw.isnull().mean()

In [None]:
# Same missing-value fraction, computed for every column of the full sheet.
data.isnull().mean()


### Converting the categorical / ordinal variables to numbers

#### Categorical columns (e.g. Neighborhood) - we need to convert them to a one-hot representation = creating dummy variables (0/1)

In [None]:
# One-hot encode the nominal columns, keeping every category as its own 0/1
# column. This version is replaced a few cells below by a drop_first=True
# encoding; it is shown here for comparison of the resulting shapes.
X_processed = pd.get_dummies(X_raw, columns=categorical)
X_processed.head()

In [None]:
# (rows, columns) after one-hot encoding with every category kept.
X_processed.shape

In [None]:
# Re-encode with drop_first=True: the first category of each nominal column is
# omitted, so the remaining dummies of a column are not perfectly collinear.
X_processed = pd.get_dummies(
    X_raw,
    columns=categorical,
    drop_first=True,
)
X_processed.shape

In [None]:
# Preview the encoded frame; the ordinal columns are still text at this point.
X_processed.head()

#### Ordinal quality ratings (BsmtQual, ExterCond, KitchenQual) - we will convert them into a scale 1 - 5 (Poor - Excellent) and null values will be 0

In [None]:
# Frequency of each distinct (BsmtQual, ExterCond, KitchenQual) combination.
# NOTE(review): DataFrame.value_counts presumably skips rows that contain a
# missing value in any of these columns by default - confirm for the pandas
# version in use.
X_processed[ordinal].value_counts()

       Ex	Excellent (100+ inches)	
       Gd	Good (90-99 inches)
       TA	Typical (80-89 inches)
       Fa	Fair (70-79 inches)
       Po	Poor (<70 inches)
       NA	No Basement


In [None]:
# Rating label -> integer score. Missing ratings (NaN) score 0.
ordinal_mappings = {
    "Ex": 5,      # excellent
    "Gd": 4,      # good
    "TA": 3,      # typical
    "Fa": 2,      # fair
    "Po": 1,      # poor
    np.nan: 0,    # no rating recorded
}

In [None]:
# Convert every ordinal rating column to the numeric scale in ordinal_mappings.
# The source is X_raw (the untouched text values), not X_processed: this keeps
# the cell idempotent. Mapping X_processed onto itself would, on a second run,
# look up the already-numeric codes in the dictionary and turn them all to NaN.
# Labels that are not keys of ordinal_mappings become NaN.
for column in ordinal:
    X_processed[column] = X_raw[column].map(ordinal_mappings)
X_processed.head()

# X_processed[ordinal].describe()

In [None]:
# Show the converted columns. X_raw is never modified and still holds the text
# labels, so the numeric codes are only visible in X_processed.
X_processed[ordinal].head() #notice that we converted strings into numerical

### Imputing missing values with k-nearest-neighbour

In [None]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)

# NOTE(review): the imputer only sees the column(s) in `missing`. With a single
# input column there are no other features to measure distance on, so a row
# with a missing value has no usable neighbours and scikit-learn falls back to
# the column mean - effectively mean imputation. Passing additional (scaled)
# numeric columns would be needed for a real nearest-neighbour estimate.
# NOTE(review): the imputer is fitted on all rows, before the train/validation/
# test split further down.
#
# The rebuilt frame carries X_processed's own index so the assignment lines up
# row by row even if the index is not the default 0..n-1 range.
X_processed[missing] = pd.DataFrame(
    imputer.fit_transform(X_processed[missing]),
    columns=missing,
    index=X_processed.index,
)

X_processed.head()

Checking null values

In [None]:
# Fraction of missing values per column after encoding and imputation.
X_processed.isnull().mean()

### Selecting features based on k-best

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

#Features as candidates to be added
featurelist = ['ExterCond','KitchenQual', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3', 'LotConfig_Inside', 'Condition1_Feedr', 'Condition1_Norm', 'Condition1_PosA', 'Condition1_PosN', 'Condition1_RRAe', 'Condition1_RRAn', 'Condition1_RRNe', 'Condition1_RRNn', 'Condition2_Feedr', 'Condition2_Norm', 'Condition2_PosA', 'Condition2_PosN', 'Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn', 'BldgType_2fmCon', 'BldgType_Duplex', 'BldgType_Twnhs', 'BldgType_TwnhsE']

searchData = X_processed[featurelist]

# Score each candidate with a univariate F-test against the sale price and
# keep the two highest-scoring columns.
# NOTE(review): the selector is fitted on every row, including the rows that
# are assigned to the validation and test sets further down, so the choice of
# features has seen held-out data.
select = SelectKBest(score_func=f_regression, k=2)
select.fit(searchData, y_raw)  # only the fitted support mask is needed
# Get columns to keep and create new dataframe with those only
cols = select.get_support(indices=True)
features_df_new = searchData.iloc[:,cols]
# A bare expression in the middle of a cell is not displayed, so print the
# outcome explicitly.
print("SelectKBest kept:", list(features_df_new.columns))


# Columns used for modelling.
# NOTE(review): this list is written out by hand and is not derived from the
# selection above - if the k-best result changes, update it manually.
selectedFeatureList = ['BsmtQual','KitchenQual','LotArea','OverallQual','OverallCond','YearBuilt','YearRemodAdd','BsmtFinSF1','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','GrLivArea','FullBath','HalfBath','BedroomAbvGr','TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','LotFrontage','Neighborhood_Blueste','Neighborhood_BrDale','Neighborhood_BrkSide','Neighborhood_ClearCr','Neighborhood_CollgCr','Neighborhood_Crawfor','Neighborhood_Edwards','Neighborhood_Gilbert','Neighborhood_IDOTRR','Neighborhood_MeadowV','Neighborhood_Mitchel','Neighborhood_NAmes','Neighborhood_NPkVill','Neighborhood_NWAmes','Neighborhood_NoRidge','Neighborhood_NridgHt','Neighborhood_OldTown','Neighborhood_SWISU','Neighborhood_Sawyer','Neighborhood_SawyerW','Neighborhood_Somerst','Neighborhood_StoneBr','Neighborhood_Timber','Neighborhood_Veenker','LotShape_IR2','LotShape_IR3','LotShape_Reg','LotConfig_CulDSac']

X_finalData = X_processed[selectedFeatureList]

X_finalData.head()

### Splitting into training, validation and test

In [None]:
# Positional split, no shuffling: rows 0-1799 are training, rows 1800-2399 are
# validation, and everything from row 2400 onward is the test set.
X_train = X_finalData.iloc[:1800, :].copy()
X_valid = X_finalData.iloc[1800:2400, :].copy()
X_test = X_finalData.iloc[2400:, :].copy()


In [None]:
# (rows, columns) of the training, validation and test predictor frames.
X_train.shape,X_valid.shape, X_test.shape

In [None]:
# Split the target at the same row positions as the predictors.
# .iloc makes the positional intent explicit, and every piece is copied
# (the test slice previously was not), so none of them aliases y_raw.
y_train = y_raw.iloc[0:1800].copy()
y_valid = y_raw.iloc[1800:2400].copy()
y_test = y_raw.iloc[2400:].copy()
y_train.shape,y_valid.shape, y_test.shape

### Standardizing the variables

In [None]:
#we will use the sklearn module called StandardScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler that centres each column on its mean and divides by its standard
# deviation; it holds no statistics until fit() is called.
scaler = StandardScaler()

In [None]:
# Learn the per-column mean and standard deviation from the training rows only,
# so validation and test data do not influence the scaling.
scaler.fit(X_train)

In [None]:
# Apply the fitted scaling to the training predictors; transform() returns a
# NumPy array, so the column names are lost at this point.
X_train_scaled = scaler.transform(X_train)
X_train_scaled

In [None]:
#saving into a dataframe
# Built from scaler.transform(X_train) rather than from the existing
# X_train_scaled value, so the cell can be re-run safely (a DataFrame has no
# .tolist(), so the self-referential form failed on a second run). The array is
# passed straight to DataFrame - no list round-trip - and the original row
# labels are kept.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_train_scaled.head()

In [None]:
#scaling validation and test
#saving into a dataframe
# The scaler was fitted on the training rows; here it is only applied.
# Row labels are preserved so these frames share their index with
# y_valid_scaled / y_test_scaled, and the array is handed to DataFrame directly
# instead of going through a Python list.
X_valid_scaled = pd.DataFrame(
    scaler.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)



In [None]:
# Standardise the target using the training-set mean and standard deviation
# only, so all three splits are expressed on the training scale.
# Series.std() is the sample standard deviation (ddof=1).
y_mean, y_std = y_train.mean(), y_train.std()
y_train_scaled = (y_train - y_mean) / y_std
y_valid_scaled = (y_valid - y_mean) / y_std
y_test_scaled = (y_test - y_mean) / y_std


### Lasso

#### Fitting a single model

In [None]:
# Penalty strength lambda = 0.1. scikit-learn's Lasso objective puts a factor
# 1/(2n) in front of the squared-error term, so lambda is halved to obtain the
# alpha that corresponds to "MSE + lambda * L1 norm".
lasso = Lasso(alpha = 0.1/2)

In [None]:
# Fit the Lasso on the standardised training predictors and target.
lasso.fit(X_train_scaled,y_train_scaled)

In [None]:
# Coefficient table: one row per predictor plus a final "constant" row holding
# the intercept. The frame is built in one step because DataFrame.append, used
# previously to add the intercept row, was removed in pandas 2.0.
# Values are on the standardised scale the model was fitted on.
lasso_coefs = pd.DataFrame({
    "variable": [*X_train.columns, "constant"],
    "beta": [*lasso.coef_, lasso.intercept_],
})
lasso_coefs

In [None]:
# Predictions of the fitted model for each split; these feed the MSE
# calculations in the following cells. Outputs are on the standardised
# target scale.
train_estimate = lasso.predict(X_train_scaled)
valid_estimate = lasso.predict(X_valid_scaled)
test_estimate = lasso.predict(X_test_scaled)


In [None]:
#importing mean squared error from sklearn
from sklearn.metrics import mean_squared_error


In [None]:
# Mean squared error on the training split (standardised target units).
train_mse = mean_squared_error(y_train_scaled, train_estimate)
train_mse


In [None]:
# Mean squared error on the validation split (standardised target units).
valid_mse = mean_squared_error(y_valid_scaled, valid_estimate)
valid_mse

#### Repeat the same process for different levels of lambda

In [None]:
# 50 evenly spaced lambda values from 0.001 to 1, halved to give the alpha
# values that are passed to Lasso.
lambdas = np.linspace(0.001, 1, num=50) / 2

In [None]:
# Sweep the penalty strength: for each value fit a Lasso on the training split
# and record [alpha, training MSE, validation MSE].
results = []

for alpha in lambdas:
    # fresh model for this penalty, trained on the training split only
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train_scaled, y_train_scaled)

    # predictions for the two splits being compared
    train_estimate = lasso.predict(X_train_scaled)
    valid_estimate = lasso.predict(X_valid_scaled)

    # errors on each split, stored next to the penalty that produced them
    mse_train = mean_squared_error(y_train_scaled, train_estimate)
    mse_valid = mean_squared_error(y_valid_scaled, valid_estimate)
    results.append([alpha, mse_train, mse_valid])
    
    


In [None]:
# Convert the list of [alpha, train MSE, validation MSE] rows into a 2-D array
# so each quantity can be sliced out as a column for plotting.
results = np.array(results)



In [None]:
# Training vs. validation error across the penalty sweep.
# Column 0 of `results` holds the alpha values passed to Lasso (the lambdas
# array, which was already halved); columns 1 and 2 hold the two MSE series.
fig, ax = plt.subplots(figsize=(15, 7.5))
ax.set_title("Lasso Performance")

penalty_values = results[:, 0]
ax.plot(penalty_values, results[:, 1], label='Training MSE', color='black')
ax.plot(penalty_values, results[:, 2], label='Validation MSE', color='red')

ax.set_xlabel("Lambda")
ax.set_ylabel("MSE")
ax.legend(loc='best')
plt.show()