## House-Prices-Advanced-Regression-Techniques

### Made By: Gaurav Baweja

In [1]:
import cudf as pd
import cupy as np
import cuml as ml
from cuml.preprocessing.LabelEncoder import LabelEncoder
from cuml.model_selection import train_test_split
from cuml.metrics.regression import mean_squared_error,mean_absolute_error,r2_score

In [2]:
TRAIN_PATH = "../input/house-prices-advanced-regression-techniques/train.csv"
TEST_PATH = "../input/house-prices-advanced-regression-techniques/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/house-prices-advanced-regression-techniques/sample_submission.csv"
SUBMISSION_PATH = "submission.csv"
ID = "Id"
TARGET = "SalePrice"
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [4]:
df = pd.concat([train.iloc[:,:-1],test]).reset_index(drop=True)
Y=train.iloc[:,-1].copy()
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
2915,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
2916,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
2917,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [5]:
def Preprocessing1(df,cols=1):
    if cols>0:
        df_cp = df.copy()
        pca = ml.PCA(n_components=cols,output_type='cudf')
        pca.fit(df_cp)
        df_cp = pca.transform(df_cp)
        return df_cp
    return df

In [6]:
def checkNull_fillData(df):
    na_cols=[]
    for col in df.columns:
        if df[col].isna().sum()>0.2*len(df):
            na_cols.append(col)
    df = df.drop(na_cols,axis=1)
    na_cols=[]
    for col in df.select_dtypes(exclude='object').columns:
        if(df[col]==0).sum()>0.2*len(df):
            na_cols.append(col)
    df=df.drop(na_cols,axis=1)
    df=df.drop('Id',axis=1)
    for col in df.select_dtypes(include='object').columns:
        df[col]=df[col].fillna(df[col].mode()[0],inplace=False)
    for col in df.select_dtypes(exclude='object').columns:
        df[col]=df[col].fillna(df[col].mean(),inplace=False)
    return df
df = checkNull_fillData(df)

In [7]:
#Encodeing
def encodeData(df):
    for col in df.select_dtypes(include='object').columns:
        encode = LabelEncoder()
        df[col]=pd.Series(encode.fit_transform(df[col]))
    return df
df = encodeData(df)
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,MoSold,YrSold,SaleType,SaleCondition
0,60,3,65.0,8450,1,3,3,0,4,0,...,1,2.0,548.0,4,4,2,2,2008,8,4
1,20,3,80.0,9600,1,3,3,0,2,0,...,1,2.0,460.0,4,4,2,5,2007,8,4
2,60,3,68.0,11250,1,0,3,0,4,0,...,1,2.0,608.0,4,4,2,9,2008,8,4
3,70,3,60.0,9550,1,0,3,0,0,0,...,2,3.0,642.0,4,4,2,2,2006,8,0
4,60,3,84.0,14260,1,0,3,0,2,0,...,1,3.0,836.0,4,4,2,12,2008,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,160,4,21.0,1936,1,3,3,0,4,0,...,2,0.0,0.0,4,4,2,6,2006,8,4
2915,160,4,21.0,1894,1,3,3,0,4,0,...,2,1.0,286.0,4,4,2,4,2006,8,0
2916,20,3,160.0,20000,1,3,3,0,4,0,...,2,2.0,576.0,4,4,2,9,2006,8,0
2917,85,3,62.0,10441,1,3,3,0,4,0,...,2,0.0,0.0,4,4,2,7,2006,8,4


In [8]:
#Scaling the Data ( YEO-JOHNSON TRANFORMATION(0),Z-Score)
for col in df.columns:
    if 0 in df[col].unique():
        a = df[col].mean()
        b = df[col].std()
        df[col]=df[col].applymap(lambda x:(x-a)/b)
    else:
        encode = PowerTransform()
        encode.fit(df[col].values.get().reshape(-1,-1))
        a=encode.transform(df[col].values.get().reshape(-1,-1))
        df[col]=pd.Series(a.flatten())
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,MoSold,YrSold,SaleType,SaleCondition
0,0.067320,-0.042149,-0.202033,197.397238,-3.080024,-0.042149,-0.042149,-4.598962,1.476788,-4.598962,...,-3.080024,-3.158066,22.460888,1.476788,1.476788,-1.561087,-1.296820,45.883611,7.552538,1.476788
1,-0.873466,-0.042149,0.501785,224.444843,-3.080024,-0.042149,-0.042149,-4.598962,-1.561087,-4.598962,...,-3.080024,-3.158066,18.331826,1.476788,1.476788,-1.561087,-1.226261,45.860091,7.552538,1.476788
2,0.067320,-0.042149,-0.061269,263.252276,-3.080024,-4.598962,-0.042149,-4.598962,1.476788,-4.598962,...,-3.080024,-3.158066,25.276158,1.476788,1.476788,-1.561087,-1.132183,45.883611,7.552538,1.476788
3,0.302516,-0.042149,-0.436639,223.268860,-3.080024,-4.598962,-0.042149,-4.598962,-4.598962,-4.598962,...,-1.561087,-3.111145,26.871477,1.476788,1.476788,-1.561087,-1.296820,45.836571,7.552538,-4.598962
4,0.067320,-0.042149,0.689469,334.046442,-3.080024,-4.598962,-0.042149,-4.598962,-1.561087,-4.598962,...,-3.080024,-3.111145,35.974182,1.476788,1.476788,-1.561087,-1.061624,45.883611,7.552538,1.476788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2.419286,1.476788,-2.266564,44.190195,-3.080024,-0.042149,-0.042149,-4.598962,1.476788,-4.598962,...,-1.561087,-3.251908,-3.251908,1.476788,1.476788,-1.561087,-1.202742,45.836571,7.552538,1.476788
2915,2.419286,1.476788,-2.266564,43.202370,-3.080024,-0.042149,-0.042149,-4.598962,1.476788,-4.598962,...,-1.561087,-3.204987,10.167544,1.476788,1.476788,-1.561087,-1.249781,45.836571,7.552538,-4.598962
2916,-0.873466,-0.042149,4.255477,469.049270,-3.080024,-0.042149,-0.042149,-4.598962,1.476788,-4.598962,...,-1.561087,-3.158066,23.774681,1.476788,1.476788,-1.561087,-1.132183,45.836571,7.552538,-4.598962
2917,0.655311,-0.042149,-0.342796,244.224874,-3.080024,-0.042149,-0.042149,-4.598962,1.476788,-4.598962,...,-1.561087,-3.251908,-3.251908,1.476788,1.476788,-1.561087,-1.179222,45.836571,7.552538,1.476788


In [9]:
#Split Data
df_train = df.iloc[:1460,:]
df_train[TARGET] = Y
df_test = df.iloc[1460:,:]
df_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0.067320,-0.042149,-0.202033,197.397238,-3.080024,-0.042149,-0.042149,-4.598962,1.476788,-4.598962,...,-3.158066,22.460888,1.476788,1.476788,-1.561087,-1.296820,45.883611,7.552538,1.476788,208500
1,-0.873466,-0.042149,0.501785,224.444843,-3.080024,-0.042149,-0.042149,-4.598962,-1.561087,-4.598962,...,-3.158066,18.331826,1.476788,1.476788,-1.561087,-1.226261,45.860091,7.552538,1.476788,181500
2,0.067320,-0.042149,-0.061269,263.252276,-3.080024,-4.598962,-0.042149,-4.598962,1.476788,-4.598962,...,-3.158066,25.276158,1.476788,1.476788,-1.561087,-1.132183,45.883611,7.552538,1.476788,223500
3,0.302516,-0.042149,-0.436639,223.268860,-3.080024,-4.598962,-0.042149,-4.598962,-4.598962,-4.598962,...,-3.111145,26.871477,1.476788,1.476788,-1.561087,-1.296820,45.836571,7.552538,-4.598962,140000
4,0.067320,-0.042149,0.689469,334.046442,-3.080024,-4.598962,-0.042149,-4.598962,-1.561087,-4.598962,...,-3.111145,35.974182,1.476788,1.476788,-1.561087,-1.061624,45.883611,7.552538,1.476788,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.067320,-0.042149,-0.342796,184.861261,-3.080024,-0.042149,-0.042149,-4.598962,1.476788,-4.598962,...,-3.158066,18.331826,1.476788,1.476788,-1.561087,-1.155702,45.860091,7.552538,1.476788,175000
1456,-0.873466,-0.042149,0.736390,308.527614,-3.080024,-0.042149,-0.042149,-4.598962,1.476788,-4.598962,...,-3.158066,20.208672,1.476788,1.476788,-1.561087,-1.296820,45.930650,7.552538,1.476788,210000
1457,0.302516,-0.042149,-0.155112,211.320874,-3.080024,-0.042149,-0.042149,-4.598962,1.476788,-4.598962,...,-3.204987,8.572224,1.476788,1.476788,-1.561087,-1.226261,45.930650,7.552538,1.476788,266500
1458,-0.873466,-0.042149,-0.061269,227.196642,-3.080024,-0.042149,-0.042149,-4.598962,1.476788,-4.598962,...,-3.204987,8.009170,1.476788,1.476788,-1.561087,-1.249781,45.930650,7.552538,1.476788,142125


In [10]:
X = df_train.iloc[:,:-1].values
y = np.log(df_train.iloc[:,-1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)
y_train, y_test = y_train.astype('float64'), y_test.astype('float64')

In [11]:
algorithm = ['svd', 'eig', 'qr', 'svd-qr', 'svd-jacobi']
MSE=[]
R2_Score=[]
MAE =[]
for alg in algorithm:
    lr = ml.LinearRegression( algorithm = alg)
    lr.fit(X_train,y_train)
    predict = lr.predict(X_test)
    MSE.append(mean_squared_error(y_test,predict))
    R2_Score.append(r2_score(y_test,predict))
    MAE.append(mean_absolute_error(y_test,predict))

In [12]:
res = pd.DataFrame(index = algorithm)
res['R2_Score']=R2_Score
res['MSE']=MSE
res['MAE']=MAE
res

Unnamed: 0,R2_Score,MSE,MAE
svd,0.87853,0.019515,0.099939
eig,0.87853,0.019515,0.099939
qr,0.87853,0.019515,0.099939
svd-qr,-0.061305,0.170507,0.324293
svd-jacobi,-0.061305,0.170507,0.324293


In [13]:
#Best Model is Linear Regression-SVD
reg = ml.LinearRegression( algorithm = 'svd')
reg.fit(X_train,y_train)
y_pred = reg.predict(X_test)
print('R2_Score: ',r2_score(y_test,y_pred),'\tMAE: ',mean_absolute_error(y_test,y_pred),'\tMSE: ',mean_squared_error(y_test,y_pred))

R2_Score:  -0.061305124684885914 	MAE:  0.32429276404785695 	MSE:  0.17050735817739998


In [14]:
#Prediciton on Test Data
ids=pd.Series(df_test.index+1)
sub = pd.DataFrame()
sub[ID] = ids
sub[TARGET] = reg.predict(df_test)
sub[TARGET]=np.exp(sub[TARGET],dtype='float64')
sub.to_csv(SUBMISSION_PATH,index=False)
sub

Unnamed: 0,Id,SalePrice
0,1461,168060.192639
1,1462,162184.183244
2,1463,162313.159045
3,1464,159705.584143
4,1465,164857.729745
...,...,...
1454,2915,182532.749092
1455,2916,178561.316117
1456,2917,168602.542119
1457,2918,201919.679635
