In [41]:
import pandas as pd
import numpy as np
import sklearn as sk

In [42]:
bostonData = pd.read_csv("train.csv")
bostonData.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [43]:
X = bostonData.drop(["SalePrice"], axis=1)
y = bostonData[["SalePrice"]]

In [44]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X,y,
                                                    test_size=0.1,random_state=1212121)

In [45]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1314, 80), (146, 80), (1314, 1), (146, 1))

# Processing of Training Set

In [46]:
bostonDataCategorical = x_train.select_dtypes(object)
bostonDataNumerical = x_train.select_dtypes(np.number)

In [47]:
bostonDataCategorical.isna().sum()

MSZoning            0
Street              0
Alley            1234
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          7
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           32
BsmtCond           32
BsmtExposure       33
BsmtFinType1       32
BsmtFinType2       33
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
KitchenQual         0
Functional          0
FireplaceQu       621
GarageType         76
GarageFinish       76
GarageQual         76
GarageCond         76
PavedDrive          0
PoolQC           1308
Fence            1072
MiscFeature      1267
SaleType            0
SaleCondition       0
dtype: int64

In [48]:
bostonData.shape

(1460, 81)

In [49]:
bostonDataCategorical.drop(["MiscFeature", "Fence", "PoolQC", "Alley"], 
                           axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [50]:
from sklearn.impute import SimpleImputer
sImputer = SimpleImputer(strategy="most_frequent")

In [51]:
sImputer.fit(bostonDataCategorical)

SimpleImputer(strategy='most_frequent')

In [52]:
temp = sImputer.transform(bostonDataCategorical)
bostonDataCategoricalNARemoved = pd.DataFrame(temp, 
                                              columns=bostonDataCategorical.columns)

# Imputation of Numerical Columns

In [53]:
sImputerNumerical = SimpleImputer(strategy="mean")

In [54]:
sImputerNumerical.fit(bostonDataNumerical)

SimpleImputer()

In [55]:
temp = sImputerNumerical.transform(bostonDataNumerical)
bostonDataNumericalNARemoved = pd.DataFrame(temp,
                                            columns=bostonDataNumerical.columns)

# Data Processing

In [56]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [57]:
bostonDataCategoricalEncoded = bostonDataCategoricalNARemoved.apply(le.fit_transform)

In [58]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [59]:
ss.fit(bostonDataNumericalNARemoved)

StandardScaler()

In [60]:
temp = ss.transform(bostonDataNumericalNARemoved)
bostonDataNumericalEncoded = pd.DataFrame(temp,
                                          columns=bostonDataNumericalNARemoved.columns)

In [61]:
bostonDataNumericalEncoded.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,-1.601827,-0.876831,-0.323803,0.244667,-0.080784,-0.520904,1.079716,0.92271,-0.585898,1.067308,...,0.468344,-0.746343,0.027019,-0.360316,-0.1159,-0.27203,-0.066914,-0.085573,-0.47892,-1.374188
1,0.244221,0.079811,-1.185776,-0.098361,0.64627,0.378528,1.046625,0.874119,0.692495,0.073345,...,0.639498,0.77645,-0.136591,-0.360316,-0.1159,-0.27203,-0.066914,-0.085573,-1.956152,1.650406
2,-0.611848,1.514775,-1.321876,-0.508943,1.373324,-0.520904,0.947352,0.728346,-0.585898,1.620716,...,0.112159,0.015054,-0.002728,-0.360316,-0.1159,-0.27203,-0.066914,-0.085573,0.629004,-0.618039
3,-0.984884,0.079811,0.901105,-0.064097,0.64627,-0.520904,0.715714,0.38821,0.881674,1.167139,...,0.838407,1.950269,0.175756,2.048362,-0.1159,-0.27203,-0.066914,-0.085573,-0.848228,0.894258
4,0.036182,0.079811,0.0,-0.155305,1.373324,3.076827,0.153166,0.533982,1.070853,0.273006,...,1.647918,-0.746343,2.317558,-0.360316,-0.1159,-0.27203,-0.066914,-0.085573,0.259696,0.138109


In [62]:
bostonDataFinal = pd.concat([bostonDataCategoricalEncoded, bostonDataNumericalEncoded],
                            axis=1)

In [63]:
from sklearn.linear_model import LinearRegression, SGDRegressor
lr = LinearRegression()
sgdR = SGDRegressor(loss="squared_epsilon_insensitive",
                    penalty='elasticnet',
    alpha=0.005,
    l1_ratio=0.15,
    fit_intercept=True,
    max_iter=100,
    tol=0.001,
    shuffle=True,
    verbose=0,
    epsilon=0.1,
    random_state=None,
    learning_rate='optimal',
    eta0=0.01,
    power_t=0.25,
    early_stopping=False,
    validation_fraction=0.1,
    n_iter_no_change=50,
    warm_start=False,
    average=False)

In [64]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet

lasso=Lasso(alpha=0.5)
ridge=Ridge(alpha=0.5)
eNet = ElasticNet(l1_ratio=0.3)

In [65]:
lr.fit(bostonDataFinal.to_numpy(), y_train.to_numpy())

LinearRegression()

In [66]:
lasso.fit(bostonDataFinal.to_numpy(), y_train.to_numpy())

Lasso(alpha=0.5)

In [67]:
ridge.fit(bostonDataFinal.to_numpy(), y_train.to_numpy())

Ridge(alpha=0.5)

In [68]:
eNet.fit(bostonDataFinal.to_numpy(), y_train.to_numpy())

ElasticNet(l1_ratio=0.3)

In [69]:
sgdR.fit(bostonDataFinal.to_numpy(), y_train.to_numpy())

  return f(*args, **kwargs)


SGDRegressor(alpha=0.005, learning_rate='optimal',
             loss='squared_epsilon_insensitive', max_iter=100,
             n_iter_no_change=50, penalty='elasticnet')

# Preparation of Test Set

In [70]:
bostonDataCategorical = x_test.select_dtypes(object)
bostonDataNumerical = x_test.select_dtypes(np.number)

In [71]:
bostonDataCategorical.drop(["MiscFeature", "Fence", "PoolQC", "Alley"], 
                           axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [72]:
temp = sImputer.transform(bostonDataCategorical)
bostonDataCategoricalNARemoved = pd.DataFrame(temp, 
                                              columns=bostonDataCategorical.columns)

# Imputation of Numerical Columns

In [73]:
temp = sImputerNumerical.transform(bostonDataNumerical)
bostonDataNumericalNARemoved = pd.DataFrame(temp,
                                            columns=bostonDataNumerical.columns)

# Data Processing

In [74]:
bostonDataCategoricalEncoded = bostonDataCategoricalNARemoved.apply(le.fit_transform)

In [75]:
temp = ss.transform(bostonDataNumericalNARemoved)
bostonDataNumericalEncoded = pd.DataFrame(temp,
                                          columns=bostonDataNumericalNARemoved.columns)

In [76]:
bostonDataNumericalEncoded.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,0.727254,-0.637671,-0.913574,-0.414036,-0.807838,1.277961,-1.104295,-1.701199,-0.585898,-0.974874,...,-1.085918,-0.746343,-0.716662,-0.360316,-0.1159,-0.27203,-0.066914,-0.085573,-0.109612,-0.618039
1,1.050074,0.079811,2.17138,2.453129,2.827431,-0.520904,0.781896,0.533982,7.313766,2.035229,...,1.5369,2.116825,0.413734,-0.360316,-0.1159,-0.27203,-0.066914,-0.085573,0.259696,-1.374188
2,1.571367,-0.159349,-0.459904,-0.198427,-0.080784,1.277961,-1.832299,-1.701199,-0.585898,-0.610276,...,-1.196936,-0.048396,-0.493558,-0.360316,-0.1159,-0.27203,-0.066914,-0.085573,0.629004,-0.618039
3,1.136159,-0.876831,0.901105,0.40158,0.64627,1.277961,0.616441,0.291028,0.755554,0.077685,...,1.046567,-0.080121,0.324492,-0.360316,-0.1159,-0.27203,-0.066914,-0.085573,-0.109612,0.894258
4,-1.54922,-0.876831,0.356702,-0.034311,1.373324,-0.520904,1.178989,1.068482,2.097007,-0.903256,...,1.310236,0.395752,0.755827,-0.360316,-0.1159,-0.27203,-0.066914,-0.085573,1.36762,0.138109


In [77]:
bostonDataTestFinal = pd.concat([bostonDataCategoricalEncoded, bostonDataNumericalEncoded],
                            axis=1)

In [78]:
predictions_lr = lr.predict(bostonDataTestFinal)
predictions_sgd = sgdR.predict(bostonDataTestFinal)

In [79]:
from sklearn.metrics import mean_squared_error, r2_score

In [80]:
np.sqrt(mean_squared_error(predictions_lr, y_test))

44448.46883003541

In [81]:
r2_score(predictions_lr, y_test)

0.6608426838173449

In [82]:
np.sqrt(mean_squared_error(predictions_sgd, y_test))

345300045917.16034

In [83]:
r2_score(predictions_sgd, y_test)

-4.159863025795548

In [84]:
predictions_lasso = lasso.predict(bostonDataTestFinal)
predictions_ridge = ridge.predict(bostonDataTestFinal)
predictions_enet = eNet.predict(bostonDataTestFinal)

In [85]:
np.sqrt(mean_squared_error(predictions_lasso, y_test))

39693.78362627439

In [86]:
np.sqrt(mean_squared_error(predictions_ridge, y_test))

39679.22917823129

In [87]:
np.sqrt(mean_squared_error(predictions_enet, y_test))

44829.99785977043