In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the raw house-price data from the working directory.
df = pd.read_csv('houseprice.csv')
# Preview the first five rows.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# (rows, columns) of the raw frame.
df.shape

(1460, 81)

In [5]:
# Count distinct Id values to check whether Id identifies each row uniquely.
df['Id'].nunique()

1460

In [6]:
# Id has one distinct value per row, so it is only a row identifier.
# Reassign instead of dropping in place, and tolerate an already-missing
# column, so re-running this cell does not raise KeyError.
df = df.drop(columns=['Id'], errors='ignore')
df.shape

(1460, 80)

In [7]:
# Null count per column; display only the columns with at least one null.
nv = df.isnull().sum()
nv[nv>0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [8]:
# Drop the three columns that are null in at least 1369 of the 1460 rows.
# Reassign instead of dropping in place, and tolerate already-missing
# columns, so re-running this cell does not raise KeyError.
df = df.drop(columns=['PoolQC', 'Alley', 'MiscFeature'], errors='ignore')
nv = df.isnull().sum()
nv[nv>0]

LotFrontage      259
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
Fence           1179
dtype: int64

In [9]:
# Split the columns that still contain nulls by dtype: object (categorical)
# versus everything else (numeric). Column order follows df.columns.
null_counts = df.isnull().sum()
cols_with_nan = null_counts[null_counts > 0].index
cat_nan = [c for c in cols_with_nan if df[c].dtypes == 'object']
num_nan = [c for c in cols_with_nan if df[c].dtypes != 'object']
for group in (cat_nan, num_nan):
    print(group)
    print(len(group))

['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'Fence']
13
['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
3


#### Basement Null Value Handling

In [10]:
bs_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
# Record a missing basement attribute as its own 'No_Base' category.
# The filled columns are assigned back to df: calling
# df[col].fillna(..., inplace=True) acts on a temporary object under pandas
# Copy-on-Write and would leave df unchanged.
df[bs_cols] = df[bs_cols].fillna('No_Base')
nv = df.isnull().sum()
nv[nv>0]

LotFrontage      259
MasVnrType         8
MasVnrArea         8
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
Fence           1179
dtype: int64

In [11]:
gar_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
# Record a missing garage attribute as its own 'No_Garage' category.
# The filled columns are assigned back to df: calling
# df[col].fillna(..., inplace=True) acts on a temporary object under pandas
# Copy-on-Write and would leave df unchanged.
df[gar_cols] = df[gar_cols].fillna('No_Garage')
nv = df.isnull().sum()
nv[nv>0]

LotFrontage     259
MasVnrType        8
MasVnrArea        8
Electrical        1
FireplaceQu     690
GarageYrBlt      81
Fence          1179
dtype: int64

In [12]:
# Mean-impute the two numeric columns. fillna with the Series of column means
# fills each column with its own mean; the result is assigned back because a
# chained df[col].fillna(..., inplace=True) does not modify df under pandas
# Copy-on-Write.
# NOTE(review): the means are computed on the full data set, before the
# train/test split done later - consider fitting imputation on training rows only.
mean_cols = ['LotFrontage', 'MasVnrArea']
df[mean_cols] = df[mean_cols].fillna(df[mean_cols].mean())
nv = df.isnull().sum()
nv[nv>0]

MasVnrType        8
Electrical        1
FireplaceQu     690
GarageYrBlt      81
Fence          1179
dtype: int64

In [13]:
# Recompute the per-column null counts (same check that ends the previous
# cell); nv is reused by the next two cells.
nv = df.isnull().sum()
nv[nv>0]

MasVnrType        8
Electrical        1
FireplaceQu     690
GarageYrBlt      81
Fence          1179
dtype: int64

In [14]:
# Print the value counts of every column that still contains nulls.
for i in nv[nv>0].index:
    print(df[i].value_counts())

None       864
BrkFace    445
Stone      128
BrkCmn      15
Name: MasVnrType, dtype: int64
SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64
Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: FireplaceQu, dtype: int64
2005.0    65
2006.0    59
2004.0    53
2003.0    50
2007.0    49
          ..
1906.0     1
1908.0     1
1933.0     1
1900.0     1
1927.0     1
Name: GarageYrBlt, Length: 97, dtype: int64
MnPrv    157
GdPrv     59
GdWo      54
MnWw      11
Name: Fence, dtype: int64


In [15]:
# Names of the columns that still contain nulls.
nv[nv>0].index

Index(['MasVnrType', 'Electrical', 'FireplaceQu', 'GarageYrBlt', 'Fence'], dtype='object')

In [16]:
# Fill the remaining categorical gaps with each column's most frequent value
# (the largest label is taken if several values tie for the mode).
# The filled column is assigned back because a chained
# df[col].fillna(..., inplace=True) does not modify df under pandas Copy-on-Write.
# NOTE(review): FireplaceQu (690 nulls) and Fence (1179 nulls) may be missing
# because the feature is absent, as was assumed for the basement and garage
# columns - confirm against the data dictionary before keeping mode imputation.
for col in ['MasVnrType', 'Electrical', 'FireplaceQu', 'Fence']:
    most_common = df[col].mode().max()
    df[col] = df[col].fillna(most_common)

In [17]:
# Re-check which columns still contain nulls.
nv = df.isnull().sum()
nv[nv>0]

GarageYrBlt    81
dtype: int64

In [18]:
# Impute missing garage build years with the column's most frequent year,
# derived from the data instead of the hard-coded 2005.0 (which is the mode in
# the value counts printed earlier). Assigned back because a chained
# fillna(..., inplace=True) does not modify df under pandas Copy-on-Write.
garage_year_mode = df['GarageYrBlt'].mode().iloc[0]
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(garage_year_mode)
nv = df.isnull().sum()
nv[nv>0]

Series([], dtype: int64)

In [19]:
# Shape after dropping Id and the three mostly-null columns (81 - 4 = 77 columns).
df.shape

(1460, 77)

### Outlier Handling

In [20]:
# Summary statistics, one row per numeric column, including the 97th/98th/99th
# percentiles so the upper tail can be compared with the maximum.
df.describe(percentiles=[0.97,0.98,0.99]).T

Unnamed: 0,count,mean,std,min,50%,97%,98%,99%,max
MSSubClass,1460.0,56.89726,42.300571,20.0,50.0,160.0,188.2,190.0,190.0
LotFrontage,1460.0,70.049958,22.024023,21.0,70.049958,114.0,120.82,137.41,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,9478.5,21571.8,25251.62,37567.64,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,6.0,9.0,9.0,10.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,8.0,8.0,9.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1973.0,2007.0,2008.0,2009.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1994.0,2008.0,2008.0,2009.0,2010.0
MasVnrArea,1460.0,103.685262,180.569112,0.0,0.0,573.69,650.82,791.28,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,383.5,1375.99,1442.64,1572.41,5644.0
BsmtFinSF2,1460.0,46.549315,161.319273,0.0,0.0,546.23,658.12,830.38,1474.0


In [21]:
# Labels of the columns summarised by describe(), i.e. the numeric columns.
# The percentile choice does not affect which columns are listed.
df.describe().columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [22]:
# Numeric columns whose upper tail will be capped at the 98th percentile.
# NOTE(review): the list includes the target SalePrice, and the caps are
# computed on the full data set before the train/test split - confirm that
# clipping the target is intended.
out_cols_98 = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2',
               'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
              'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath',
              'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
               'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
              'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
               'SalePrice']
print(out_cols_98)

['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice']


In [23]:
def oh1(x, q=0.98):
    """Cap the upper tail of a numeric Series at one of its own quantiles.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to cap.
    q : float, default 0.98
        Quantile of ``x`` used as the upper bound.

    Returns
    -------
    pandas.Series
        A new Series in which every value above ``x.quantile(q)`` is replaced
        by that quantile; ``x`` itself is not modified.
    """
    return x.clip(upper=x.quantile(q))

In [24]:
# Cap each listed column independently: DataFrame.apply passes one column at a
# time to oh1.
df[out_cols_98] = df[out_cols_98].apply(oh1)

In [25]:
# Summary statistics again, this time with the 1st/2nd/3rd/5th percentiles so
# the lower tail can be compared with the minimum.
df.describe(percentiles=[0.01,0.02,0.03,0.05]).T

Unnamed: 0,count,mean,std,min,1%,2%,3%,5%,50%,max
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,20.0,20.0,20.0,50.0,190.0
LotFrontage,1460.0,69.405164,18.845067,21.0,21.0,24.0,30.0,35.95,70.049958,120.82
LotArea,1460.0,9884.318219,4269.103743,1300.0,1680.0,2124.74,2522.0,3311.7,9478.5,25251.62
OverallQual,1460.0,6.099315,1.382997,1.0,3.0,4.0,4.0,4.0,6.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,3.0,3.0,4.0,4.0,5.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1899.18,1908.36,1910.0,1916.0,1973.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1950.0,1950.0,1950.0,1950.0,1994.0,2010.0
MasVnrArea,1460.0,99.054851,159.552455,0.0,0.0,0.0,0.0,0.0,0.0,650.82
BsmtFinSF1,1460.0,436.678219,424.424497,0.0,0.0,0.0,0.0,0.0,383.5,1442.64
BsmtFinSF2,1460.0,41.938082,136.258146,0.0,0.0,0.0,0.0,0.0,0.0,658.12


In [26]:
# Columns whose lower tail will be floored at the 1st percentile.
# NOTE(review): the list includes the target SalePrice - confirm that
# clipping the target is intended.
out_cols_01 = ['LotArea', 'OverallQual', 'OverallCond','1stFlrSF','TotRmsAbvGrd',
                'GrLivArea','SalePrice']
print(out_cols_01)

['LotArea', 'OverallQual', 'OverallCond', '1stFlrSF', 'TotRmsAbvGrd', 'GrLivArea', 'SalePrice']


In [27]:
def oh2(x, q=0.01):
    """Floor the lower tail of a numeric Series at one of its own quantiles.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to floor.
    q : float, default 0.01
        Quantile of ``x`` used as the lower bound.

    Returns
    -------
    pandas.Series
        A new Series in which every value below ``x.quantile(q)`` is replaced
        by that quantile; ``x`` itself is not modified.
    """
    return x.clip(lower=x.quantile(q))

In [28]:
# Floor each listed column independently: DataFrame.apply passes one column at
# a time to oh2.
df[out_cols_01] = df[out_cols_01].apply(oh2)

In [29]:
# Clipping only replaces values, so the shape should be unchanged.
df.shape

(1460, 77)

In [30]:
# Encode on a copy so df keeps the preprocessed, un-encoded data; df is used
# again for the second encoding approach.
df1 = df.copy()
df1.shape

(1460, 77)

In [31]:
# Persist the preprocessed data. index=False keeps the default RangeIndex out
# of the file; otherwise it is written as an extra unnamed first column that
# reappears as "Unnamed: 0" when the CSV is read back.
df.to_csv('hprice_after_preprocess.csv', index=False)

In [32]:
# Names of the object-dtype (categorical) columns, in column order.
cat_cols = [name for name, dtype in df.dtypes.items() if dtype == 'object']
print(len(cat_cols))

40


### Approach 1 - Label Encoder

In [33]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
# Replace every categorical column of df1 with integer codes. The same encoder
# object is re-fitted for each column, so its learned classes are overwritten
# on every iteration.
# NOTE(review): integer codes impose an ordering on the categories that the
# linear models below treat as numeric - confirm this is acceptable.
for i in cat_cols:
    df1[i] = lb.fit_transform(df1[i])
# Dtype tally after encoding.
df1.dtypes.value_counts()

int32      40
int64      19
float64    18
dtype: int64

In [34]:
# Separate the target (SalePrice) from the feature matrix.
y = df1['SalePrice']
x = df1.drop(columns='SalePrice')
for part in (x, y):
    print(part.shape)

(1460, 76)
(1460,)


In [35]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; random_state fixes the shuffle.
split = train_test_split(x, y, test_size=0.25, random_state=42)
x_train, x_test, y_train, y_test = split
for part in split:
    print(part.shape)

(1095, 76)
(365, 76)
(1095,)
(365,)


In [36]:
from sklearn.linear_model import LinearRegression

In [37]:
# Baseline: ordinary least squares on the unscaled, label-encoded features.
# score() returns R^2 on the given data.
m1 = LinearRegression()
m1.fit(x_train,y_train)
print('Training_score',m1.score(x_train,y_train))
print('Testing_score',m1.score(x_test,y_test))

Training_score 0.8911145142398861
Testing_score 0.8924731252870297


In [38]:
from sklearn.preprocessing import StandardScaler

In [39]:
# Fit the scaler on the training features only, then apply the same fitted
# transform to the test features.
sc = StandardScaler()
x_train_sc = sc.fit_transform(x_train) 
x_test_sc = sc.transform(x_test) 

In [40]:
# Same linear model, fitted on the standardised features.
m2 = LinearRegression()
m2.fit(x_train_sc,y_train)
print('Training_score',m2.score(x_train_sc,y_train))
print('Testing_score',m2.score(x_test_sc,y_test))

Training_score 0.8911140277094899
Testing_score 0.8923961051508513


### PCA

In [41]:
from sklearn.decomposition import PCA

In [42]:
# PCA without n_components: the printed shapes show all 76 components are kept.
# Fitted on the scaled training data; the test data is only projected.
pca = PCA()
pc_x_train = pca.fit_transform(x_train_sc)
pc_x_test = pca.transform(x_test_sc)
print(pc_x_train.shape)
print(pc_x_test.shape)

(1095, 76)
(365, 76)


In [43]:
# One row per principal component with the variance it explains.
eig_val = pd.DataFrame({'Var':pca.explained_variance_})
eig_val.head()

Unnamed: 0,Var
0,10.397402
1,4.193391
2,3.853379
3,3.012935
4,2.438278


In [44]:
# Contri: each component's share of the total variance, in percent.
# CumSum: running total of Contri.
eig_val['Contri'] = eig_val['Var']*100/eig_val['Var'].sum()
eig_val['CumSum'] = eig_val['Contri'].cumsum()
eig_val.head()

Unnamed: 0,Var,Contri,CumSum
0,10.397402,14.230009,14.230009
1,4.193391,5.739126,19.969135
2,3.853379,5.27378,25.242916
3,3.012935,4.123539,29.366455
4,2.438278,3.337056,32.703511


In [45]:
# Last seven components whose cumulative contribution is still below 80%.
eig_val[eig_val['CumSum']<80].tail(7)

Unnamed: 0,Var,Contri,CumSum
27,0.872549,1.194181,73.148644
28,0.850739,1.164332,74.312976
29,0.839748,1.149289,75.462266
30,0.816632,1.117653,76.579919
31,0.793977,1.086646,77.666565
32,0.777824,1.064539,78.731104
33,0.771025,1.055234,79.786338


In [46]:
# Keep the first 30 principal components of the training projection.
# NOTE(review): 30 is hard-coded; in the table above components 0-29 reach
# about 75.5% cumulative variance, while the <80% filter extends to index 33 -
# confirm the intended cut-off.
pc_train = pd.DataFrame(pc_x_train)
pc_train = pc_train.iloc[:,:30]
print(pc_train.shape)
pc_train.head()

(1095, 30)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,1.775512,-1.120963,-3.899236,-0.942014,0.480689,-1.150501,1.178268,-2.797053,0.308599,1.360914,...,-0.698774,-0.112402,-0.841262,0.613597,0.3212,-0.904277,-0.117512,0.581248,-0.560278,-0.670137
1,-0.277394,-2.438229,2.847089,1.369318,-0.042831,0.013838,1.215967,1.086225,-2.129894,0.699209,...,-0.112301,0.30947,-0.198502,1.303387,0.977496,0.10745,1.286202,1.62057,-1.011773,0.157493
2,-4.436284,0.22688,-0.540933,-1.072174,-1.467458,0.506372,-0.216647,-1.070789,-0.123326,-1.263389,...,-0.031833,0.164746,-0.128344,0.113619,-0.431887,0.921287,-0.96161,-0.083733,0.971113,0.399954
3,-1.940067,0.54674,1.71469,-1.30959,-2.35333,-0.328687,0.293791,-1.738255,0.847541,0.133178,...,2.606353,0.864105,-1.624376,0.905932,-0.5095,1.383277,1.260139,-0.514489,-0.226077,-0.236176
4,-1.734926,0.615985,1.756388,-0.655103,-1.487885,-2.519024,-0.710291,0.816704,-0.496037,0.657277,...,2.841511,-0.524341,-1.740184,2.006901,0.52989,-3.20009,-1.753232,-1.323015,0.400313,-1.629678


In [47]:
# Keep the same first 30 components for the test projection.
pc_test = pd.DataFrame(pc_x_test)
pc_test = pc_test.iloc[:,:30]
print(pc_test.shape)
pc_test.head()

(365, 30)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-2.253183,-1.755739,1.398885,0.214591,-1.095537,-1.050384,-0.439589,0.200878,-1.814528,1.145812,...,0.636903,-1.231758,1.157171,-1.761325,0.575905,0.10115,0.275057,1.100106,-1.119435,1.386336
1,5.433615,0.819641,1.784858,2.359093,1.044885,-0.019221,-1.651985,0.553032,0.321985,0.187745,...,-0.609591,-0.577769,0.575049,-0.459922,0.075502,0.013124,-0.187036,-0.253041,0.605781,-0.042986
2,-4.293985,0.377564,0.140636,-2.743291,-1.672841,-0.702767,0.16314,-1.40215,-0.776163,-1.182737,...,-1.144395,-0.739518,1.177116,-0.550151,0.751055,-0.156152,-0.09431,1.082695,0.884629,-0.796851
3,-2.459562,0.715935,1.163336,0.580733,-0.482541,-1.014272,-1.096407,-1.371946,-1.599506,0.032905,...,-1.109246,0.010694,-0.199266,1.059216,-0.750213,-1.615768,-0.396103,0.933636,-1.214415,-0.00967
4,5.527602,-2.233682,0.172885,-3.353039,2.260897,1.343908,-0.249103,0.592315,-0.697571,-1.423487,...,-0.397284,0.18256,0.210254,-0.120837,-0.787927,1.698902,-0.177744,0.443724,0.671456,-0.409809


In [48]:
# Check that the feature and target row counts match before fitting.
print(pc_train.shape)
print(pc_test.shape)
print(y_train.shape)
print(y_test.shape)

(1095, 30)
(365, 30)
(1095,)
(365,)


In [49]:
# Linear regression on the 30 retained principal components.
m3 = LinearRegression()
m3.fit(pc_train,y_train)
print('Training_score',m3.score(pc_train,y_train))
print('Testing_score',m3.score(pc_test,y_test))

Training_score 0.8660192236561031
Testing_score 0.8985840149911961


In [50]:
# Test-set predictions of the PCA-based model.
ypred_m3 = m3.predict(pc_test)

In [51]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [52]:
def gen_matrix(ytest,ypred):
    """Print MAE, MSE, RMSE and R2 for a set of regression predictions.

    Parameters
    ----------
    ytest : array-like
        True target values.
    ypred : array-like
        Predicted target values, aligned with ``ytest``.

    Returns
    -------
    None
        The four metrics are printed, not returned.
    """
    mae = mean_absolute_error(ytest,ypred)
    mse = mean_squared_error(ytest,ypred)
    # RMSE is the square root of the MSE already computed above.
    rmse = np.sqrt(mse)
    r2s = r2_score(ytest,ypred)
    print('MAE',mae)
    print('MSE',mse)
    print('RMSE',rmse)
    print('R2_score',r2s)

In [53]:
# Error metrics of the PCA-based model on the test set.
gen_matrix(y_test,ypred_m3)

MAE 17094.59910322012
MSE 536435890.6688689
RMSE 23161.085697109902
R2_score 0.8985840149911961


### Approach 2 - Get_Dummies

In [56]:
# Names of the object-dtype (categorical) columns of df, in column order.
cat_cols = [name for name, dtype in df.dtypes.items() if dtype == 'object']
print(len(cat_cols))
print(cat_cols)

40
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence', 'SaleType', 'SaleCondition']


In [57]:
# One-hot encode the categorical columns of df; drop_first=True omits the
# first level of each column. This rebinds df1, replacing the label-encoded
# frame from the first approach.
df1 = pd.get_dummies(data=df,columns=cat_cols,drop_first=True)
df1.shape

(1460, 249)

In [60]:
# Separate the target (SalePrice) from the one-hot encoded feature matrix.
y1 = df1['SalePrice']
x1 = df1.drop(columns='SalePrice')
for part in (x1, y1):
    print(part.shape)

(1460, 248)
(1460,)


In [61]:
# train_test_split is already imported by the first approach's split cell;
# importing it again is harmless.
from sklearn.model_selection import train_test_split
# NOTE(review): random_state=10 here, but 42 in the label-encoding approach,
# so the two approaches are scored on different test rows - confirm whether
# the scores are meant to be compared.
x_train1,x_test1,y_train1,y_test1 = train_test_split(x1,y1,test_size=0.25,random_state=10)
print(x_train1.shape)
print(x_test1.shape)
print(y_train1.shape)
print(y_test1.shape)

(1095, 248)
(365, 248)
(1095,)
(365,)


In [63]:
# Linear regression on all 248 one-hot encoded features.
m4 = LinearRegression()
m4.fit(x_train1,y_train1)
print('Training_score',m4.score(x_train1,y_train1))
print('Testing_score',m4.score(x_test1,y_test1))
# Training R^2 is much higher than test R^2 (0.945 vs 0.785): the model overfits.

Training_score 0.9450224851961514
Testing_score 0.7849927974463277


### PCA

In [64]:
# Fit the scaler on the training features only, then apply the same fitted
# transform to the test features.
sc1 = StandardScaler()
x_train_sc1 = sc1.fit_transform(x_train1) 
x_test_sc1 = sc1.transform(x_test1) 

In [66]:
# PCA without n_components: the printed shapes show all 248 components are
# kept. Fitted on the scaled training data; the test data is only projected.
pca1 = PCA()
x_train_pc1 = pca1.fit_transform(x_train_sc1)
x_test_pc1 = pca1.transform(x_test_sc1)
print(x_train_pc1.shape)
print(x_test_pc1.shape)

(1095, 248)
(365, 248)


In [67]:
# One row per principal component with the variance it explains.
eig_val1 = pd.DataFrame({'Var':pca1.explained_variance_})
eig_val1.head()

Unnamed: 0,Var
0,16.922308
1,8.051164
2,7.072466
3,6.034808
4,5.677673


In [68]:
# Contri: each component's share of the total variance, in percent.
eig_val1['Contri'] = eig_val1['Var']*100/eig_val1['Var'].sum()
eig_val1.head()

Unnamed: 0,Var,Contri
0,16.922308,7.10372
1,8.051164,3.379753
2,7.072466,2.968911
3,6.034808,2.533318
4,5.677673,2.383398


In [69]:
# CumSum: running total of Contri. The tail shows the last components, where
# the total has reached 100%.
eig_val1['CumSum'] = eig_val1['Contri'].cumsum()
eig_val1.tail()

Unnamed: 0,Var,Contri,CumSum
243,6.705258e-32,2.814762e-32,100.0
244,6.705258e-32,2.814762e-32,100.0
245,6.705258e-32,2.814762e-32,100.0
246,6.705258e-32,2.814762e-32,100.0
247,6.705258e-32,2.814762e-32,100.0


In [70]:
# Last five components whose cumulative contribution is still below 76%.
eig_val1[eig_val1['CumSum']<76].tail()

Unnamed: 0,Var,Contri,CumSum
76,1.067135,0.447966,73.989097
77,1.044976,0.438664,74.427761
78,1.031283,0.432916,74.860678
79,1.011247,0.424506,75.285184
80,1.007534,0.422947,75.70813


In [71]:
# Training projection as a DataFrame; columns are the component indices 0-247.
train_res_df = pd.DataFrame(x_train_pc1)
train_res_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,238,239,240,241,242,243,244,245,246,247
0,-4.62296,1.161616,-0.252797,-1.672862,0.419907,0.980594,-2.24249,0.237645,0.39626,-1.035301,...,-1.22672e-16,-3.482708e-16,4.205962e-16,-8.691376e-18,-2.613988e-16,-1.722563e-16,8.364086e-16,-2.846229e-16,-1.70348e-16,-2.095634e-16
1,3.460182,3.193334,-1.538067,4.954354,-2.296506,0.36251,1.13192,-1.122479,-0.440176,0.062252,...,1.402864e-16,-1.676307e-16,2.707868e-16,1.442709e-16,-1.72423e-16,-2.201656e-16,-2.627042e-16,2.857227e-16,7.390968000000001e-17,1.600279e-16
2,-3.727533,0.948955,-0.117145,0.794336,-1.013626,-2.305109,0.0721,2.238317,0.208896,-1.608321,...,-3.20335e-16,4.618181e-16,8.253815e-16,4.619823e-16,8.164827000000001e-17,-6.797216e-18,1.905207e-16,3.029634e-16,3.960326e-16,2.560199e-16
3,-2.256559,0.127604,0.189685,-0.391265,-0.363494,-3.290996,3.4784,2.377569,1.168006,-1.572199,...,-5.2147060000000006e-17,-1.133144e-16,-1.800913e-16,-5.105007e-16,1.353171e-16,2.863394e-16,-5.075837e-16,-2.409411e-16,-5.198902e-16,-2.851201e-16
4,0.161543,-5.091692,1.469638,-1.366211,0.164836,-1.560062,-1.082986,-2.200951,1.201501,-1.852551,...,4.806226e-16,1.523713e-16,-6.399283e-17,1.438454e-16,-1.274533e-16,-9.886382000000001e-17,-3.605448e-16,3.223429e-16,-2.686701e-16,-1.981464e-16


In [72]:
# Keep the first 80 principal components (indices 0-79) for training.
pc_train1 = train_res_df.iloc[:,:80]
print(pc_train1.shape)
pc_train1.head()

(1095, 80)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
0,-4.62296,1.161616,-0.252797,-1.672862,0.419907,0.980594,-2.24249,0.237645,0.39626,-1.035301,...,0.138666,-0.3089,0.38996,-0.118849,0.438211,-0.278563,0.107132,-0.458338,0.285253,0.356943
1,3.460182,3.193334,-1.538067,4.954354,-2.296506,0.36251,1.13192,-1.122479,-0.440176,0.062252,...,-0.238987,-0.341045,-0.502188,0.115955,0.87162,-0.597209,-1.49128,0.247173,0.23391,-1.131874
2,-3.727533,0.948955,-0.117145,0.794336,-1.013626,-2.305109,0.0721,2.238317,0.208896,-1.608321,...,-0.56284,0.340594,-0.892724,0.170697,0.020227,0.311635,-0.447642,0.027376,-0.414009,-0.0297
3,-2.256559,0.127604,0.189685,-0.391265,-0.363494,-3.290996,3.4784,2.377569,1.168006,-1.572199,...,-1.605024,2.15466,1.541211,0.554874,1.554105,-1.334064,-1.285356,0.041283,0.915574,-0.394873
4,0.161543,-5.091692,1.469638,-1.366211,0.164836,-1.560062,-1.082986,-2.200951,1.201501,-1.852551,...,-0.649731,-1.267377,1.383861,0.444362,-0.004712,0.975507,0.795394,0.796413,-0.254571,-0.820256


In [73]:
# Test projection, cut to the same first 80 components.
test_res_df = pd.DataFrame(x_test_pc1)
pc_test1 = test_res_df.iloc[:,:80]
print(pc_test1.shape)
pc_test1.head()

(365, 80)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
0,1.163742,-4.973318,2.064422,0.576793,1.935019,1.702125,-0.09584,-0.93195,-0.768576,1.833031,...,0.718719,2.282772,-0.378319,-0.02568,-0.74873,0.271218,-0.240627,-0.555689,0.770248,0.638118
1,-4.986402,4.10171,-1.155137,-2.66045,-0.436282,3.628721,-0.644896,0.081258,-1.088594,3.242532,...,-0.652539,-0.130129,-0.011384,0.754219,-0.528381,0.066935,0.408798,-0.011951,-0.331926,-0.860567
2,3.048258,-2.9857,0.088437,-1.661596,-0.042465,1.481756,-2.355095,0.738797,-0.245938,-0.650122,...,0.18457,-0.230052,-0.381346,0.870068,0.566271,0.277971,0.641269,-0.564385,-0.753599,-0.128855
3,0.28773,-3.827212,1.177659,-0.556239,0.426184,-0.521347,-1.034792,-0.265599,0.345889,0.619083,...,1.985976,-1.019392,1.15105,-1.45541,0.260457,-1.06486,0.428149,0.892203,-1.8215,1.949897
4,-5.780792,3.047428,0.179937,3.474932,-0.604579,-0.339707,-1.593825,-1.066398,-0.650316,-0.013347,...,1.382331,0.037547,-0.712485,-0.294459,0.442253,0.079272,-0.007594,0.224012,-0.549256,0.827346


In [74]:
# Check that the feature and target row counts match before fitting.
print(pc_train1.shape)
print(pc_test1.shape)
print(y_train1.shape)
print(y_test1.shape)

(1095, 80)
(365, 80)
(1095,)
(365,)


In [75]:
from sklearn.linear_model import LinearRegression

In [76]:
# Linear regression on the 80 retained principal components.
m5 = LinearRegression()
m5.fit(pc_train1,y_train1)
print('Training_score',m5.score(pc_train1,y_train1))
print('Testing_score',m5.score(pc_test1,y_test1))

Training_score 0.8828115298232833
Testing_score 0.882979874437865


In [81]:
# Test-set predictions of the PCA-based model for the one-hot approach.
ypred_m5 = m5.predict(pc_test1)

In [82]:
# Error metrics of m5 on the test set.
gen_matrix(y_test1,ypred_m5)

MAE 18906.614838694808
MSE 632663026.5847511
RMSE 25152.793613925893
R2_score 0.882979874437865


In [88]:
# Confirm pc_train1 is a DataFrame with 80 columns before relabelling them.
print(pc_train1.shape)
print(type(pc_train1))

(1095, 80)
<class 'pandas.core.frame.DataFrame'>


In [87]:
# Build one label per retained component: 'PC0' ... 'PC79' for the 80 columns
# of pc_train1 (range() stops before shape[1], so the last index is 79).
s = 'PC'                             # PC = Principal component
t = [s + str(i) for i in range(pc_train1.shape[1])]
print(t)

['PC0', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28', 'PC29', 'PC30', 'PC31', 'PC32', 'PC33', 'PC34', 'PC35', 'PC36', 'PC37', 'PC38', 'PC39', 'PC40', 'PC41', 'PC42', 'PC43', 'PC44', 'PC45', 'PC46', 'PC47', 'PC48', 'PC49', 'PC50', 'PC51', 'PC52', 'PC53', 'PC54', 'PC55', 'PC56', 'PC57', 'PC58', 'PC59', 'PC60', 'PC61', 'PC62', 'PC63', 'PC64', 'PC65', 'PC66', 'PC67', 'PC68', 'PC69', 'PC70', 'PC71', 'PC72', 'PC73', 'PC74', 'PC75', 'PC76', 'PC77', 'PC78', 'PC79']


In [90]:
# Replace the integer column labels with the PC names.
# NOTE(review): m5 was fitted before this rename, on the integer labels.
pc_train1.columns = t
pc_train1.head()

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC70,PC71,PC72,PC73,PC74,PC75,PC76,PC77,PC78,PC79
0,-4.62296,1.161616,-0.252797,-1.672862,0.419907,0.980594,-2.24249,0.237645,0.39626,-1.035301,...,0.138666,-0.3089,0.38996,-0.118849,0.438211,-0.278563,0.107132,-0.458338,0.285253,0.356943
1,3.460182,3.193334,-1.538067,4.954354,-2.296506,0.36251,1.13192,-1.122479,-0.440176,0.062252,...,-0.238987,-0.341045,-0.502188,0.115955,0.87162,-0.597209,-1.49128,0.247173,0.23391,-1.131874
2,-3.727533,0.948955,-0.117145,0.794336,-1.013626,-2.305109,0.0721,2.238317,0.208896,-1.608321,...,-0.56284,0.340594,-0.892724,0.170697,0.020227,0.311635,-0.447642,0.027376,-0.414009,-0.0297
3,-2.256559,0.127604,0.189685,-0.391265,-0.363494,-3.290996,3.4784,2.377569,1.168006,-1.572199,...,-1.605024,2.15466,1.541211,0.554874,1.554105,-1.334064,-1.285356,0.041283,0.915574,-0.394873
4,0.161543,-5.091692,1.469638,-1.366211,0.164836,-1.560062,-1.082986,-2.200951,1.201501,-1.852551,...,-0.649731,-1.267377,1.383861,0.444362,-0.004712,0.975507,0.795394,0.796413,-0.254571,-0.820256


In [91]:
# Confirm pc_test1 is a DataFrame with 80 columns before relabelling them.
print(pc_test1.shape)
print(type(pc_test1))

(365, 80)
<class 'pandas.core.frame.DataFrame'>


In [92]:
# Build one label per retained component: 'PC0' ... 'PC79' for the 80 columns
# of pc_test1 (range() stops before shape[1], so the last index is 79).
s = 'PC'                         # PC = Principal component
t1 = [s + str(i) for i in range(pc_test1.shape[1])]
print(t1)

['PC0', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28', 'PC29', 'PC30', 'PC31', 'PC32', 'PC33', 'PC34', 'PC35', 'PC36', 'PC37', 'PC38', 'PC39', 'PC40', 'PC41', 'PC42', 'PC43', 'PC44', 'PC45', 'PC46', 'PC47', 'PC48', 'PC49', 'PC50', 'PC51', 'PC52', 'PC53', 'PC54', 'PC55', 'PC56', 'PC57', 'PC58', 'PC59', 'PC60', 'PC61', 'PC62', 'PC63', 'PC64', 'PC65', 'PC66', 'PC67', 'PC68', 'PC69', 'PC70', 'PC71', 'PC72', 'PC73', 'PC74', 'PC75', 'PC76', 'PC77', 'PC78', 'PC79']


In [94]:
# Replace the integer column labels of the test components with the PC names.
pc_test1.columns = t1
pc_test1.head()

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC70,PC71,PC72,PC73,PC74,PC75,PC76,PC77,PC78,PC79
0,1.163742,-4.973318,2.064422,0.576793,1.935019,1.702125,-0.09584,-0.93195,-0.768576,1.833031,...,0.718719,2.282772,-0.378319,-0.02568,-0.74873,0.271218,-0.240627,-0.555689,0.770248,0.638118
1,-4.986402,4.10171,-1.155137,-2.66045,-0.436282,3.628721,-0.644896,0.081258,-1.088594,3.242532,...,-0.652539,-0.130129,-0.011384,0.754219,-0.528381,0.066935,0.408798,-0.011951,-0.331926,-0.860567
2,3.048258,-2.9857,0.088437,-1.661596,-0.042465,1.481756,-2.355095,0.738797,-0.245938,-0.650122,...,0.18457,-0.230052,-0.381346,0.870068,0.566271,0.277971,0.641269,-0.564385,-0.753599,-0.128855
3,0.28773,-3.827212,1.177659,-0.556239,0.426184,-0.521347,-1.034792,-0.265599,0.345889,0.619083,...,1.985976,-1.019392,1.15105,-1.45541,0.260457,-1.06486,0.428149,0.892203,-1.8215,1.949897
4,-5.780792,3.047428,0.179937,3.474932,-0.604579,-0.339707,-1.593825,-1.066398,-0.650316,-0.013347,...,1.382331,0.037547,-0.712485,-0.294459,0.442253,0.079272,-0.007594,0.224012,-0.549256,0.827346
