## House-Prices-Advanced-Regression-Techniques

### Made By: Gaurav Baweja

In [1]:
import cudf as pd
import cupy as np
import cuml as ml

In [2]:
TRAIN_PATH = "../input/house-prices-advanced-regression-techniques/train.csv"
TEST_PATH = "../input/house-prices-advanced-regression-techniques/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/house-prices-advanced-regression-techniques/sample_submission.csv"
SUBMISSION_PATH = "submission.csv"
ID = "Id"
TARGET = "SalePrice"
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
def checkNull_fillData(df):
    for col in df.columns:
        if len(df.loc[df[col].isnull() == True]) != 0:
            if df[col].dtype == "float64" or df[col].dtype == "int64":
                df.loc[df[col].isnull() == True,col] = df[col].median()
            else:
                df.loc[df[col].isnull() == True,col] = "Missing"
                
checkNull_fillData(train)
checkNull_fillData(test)

import pandas as pdf

def encode_data(df):
    df_typ = pdf.DataFrame(df.dtypes, columns=["types"])
    df_typ_obj = df_typ[df_typ["types"] == "object"]
    for fid_obj in df_typ_obj.index:
        df[fid_obj] = df[fid_obj].astype('category').cat.codes

encode_data(train)
encode_data(test)
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,1,3,3,0,...,0,3,2,1,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,1,3,3,0,...,0,3,2,1,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,1,0,3,0,...,0,3,2,1,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,1,0,3,0,...,0,3,2,1,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,1,0,3,0,...,0,3,2,1,0,12,2008,8,4,250000


In [4]:

train[TARGET] = train[TARGET].astype('float64')
X = train.drop([ID,TARGET],axis=1)
y = train[TARGET]
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)
test1 = test.drop([ID],axis=1)

In [5]:
from cuml.preprocessing import StandardScaler
scaler=StandardScaler()
x_scaled = scaler.fit_transform(X)

In [6]:
from cuml.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x_scaled, y, test_size = 0.3, random_state = 41)

In [7]:
from cuml.linear_model import LinearRegression
lr = LinearRegression(fit_intercept = True, normalize = False, algorithm = 'eig')
reg = lr.fit(X_train,Y_train)


In [8]:
pred = reg.predict(X_test)
print("MSE: ")
print(ml.metrics.regression.mean_squared_error(Y_test,pred))
print("R2 Score: ")
print(ml.metrics.regression.r2_score(Y_test,pred))
print("MAE: ")
print(ml.metrics.regression.mean_absolute_error(Y_test,pred))

MSE: 
1231139349.1998127
R2 Score: 
0.7859036844316702
MAE: 
20570.35733393687


In [9]:
algorithm = ['svd', 'eig', 'qr', 'svd-qr', 'svd-jacobi']
MSE=[]
R2_Score=[]
MAE =[]
for alg in algorithm:
    lr = LinearRegression(fit_intercept = True, normalize = False, algorithm = alg)
    reg = lr.fit(X_train,Y_train)
    predict = lr.predict(X_test)
    MSE.append(ml.metrics.regression.mean_squared_error(Y_test,predict))
    R2_Score.append(ml.metrics.regression.r2_score(Y_test,predict))
    MAE.append(ml.metrics.regression.mean_absolute_error(Y_test,predict))

In [10]:
res = pd.DataFrame(index = algorithm)
res['MSE']=MSE
res['R2_Score']=R2_Score
res['MAE']=MAE
res

Unnamed: 0,MSE,R2_Score,MAE
svd,1231139000.0,0.7859037,20570.36
eig,1231139000.0,0.7859037,20570.36
qr,1230588000.0,0.7859996,20878.84
svd-qr,2.876019e+34,-5.001424e+24,1.176681e+17
svd-jacobi,2.876019e+34,-5.001424e+24,1.176681e+17


In [11]:
#Prediciton on Test Data
sub[TARGET] = reg.predict(test1)
sub.to_csv(SUBMISSION_PATH,index=False)
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,-1.14008e+21
1,1462,-1.360699e+21
2,1463,-1.239479e+21
3,1464,-8.820103e+20
4,1465,-4.498361e+20
