In [58]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [59]:
#GPU libraries
import cudf as pd
import cupy as cp
import cuml as np
from cuml import LinearRegression
from cuml.linear_model import LinearRegression
from cuml import Ridge
from cuml.linear_model import Ridge
from cuml.model_selection import train_test_split
from cuml.linear_model import Lasso
from cuml.ensemble import RandomForestRegressor

In [60]:
#Evaluation Metrics
import seaborn as sns
import sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, Normalizer, StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
import sklearn_pandas
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score,train_test_split
from scipy import stats
# from sklearn.linear_model import LinearRegression
from scipy.special import boxcox1p
import csv



In [61]:
#Environment Specs
import sys
import scipy

# Report the interpreter version, then the name and version of each library.
print('Environment specification:\n')
major, minor, micro = sys.version_info[:3]
print('python', '{}.{}.{}'.format(major, minor, micro))

for library in (np, scipy, sns, sklearn, pd):
    print(library.__name__, library.__version__)

### Reading CSV file as Data frame

In [62]:
# kaggle train data
data_df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
data_df

In [63]:
#kaggle test data 
test_df = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
test_df

In [64]:
data_df.info()

In [65]:
test_df.info()

In [66]:
data_df.describe()

There are 80 columns in the train data and 79 columns in the test data. We need to predict SalePrice using linear regression, write the predicted values into sample_submission.csv, and upload it to Kaggle.

### Exploratory Data Analysis 

In [67]:
#summary
data_df['SalePrice'].describe()

In [68]:
# Box plot of SalePrice. The cuDF frame is converted with to_pandas() first,
# because plotting it directly gives an error.
temp = data_df.to_pandas()
sns.boxplot(temp['SalePrice'])

Two outliers with price more than 700000

In [69]:
#density plot 
sns.distplot(temp['SalePrice'])

I noticed that it is a right-skewed distribution with the peak around 160k and quite a long tail, with a maximum of about 800k.

I now check the fraction of NaN values in each column.

In [70]:
import matplotlib.pyplot as plt

# Fraction of missing values per column, computed on the pandas copy `temp`.
n_rows = temp.shape[0]
col_nan = temp.isna().sum() / n_rows

# Horizontal bar chart restricted to columns with more than 1 % missing values.
plt.figure(figsize=(8, 5))
sns.set(font_scale=1.2)
high_nan = col_nan[col_nan > 0.01]
high_nan.plot(kind="barh")
plt.title("Features with the highest percentage of Nan values")


In [71]:
## Top 35 Fields of test data having null values
test_df.isnull().sum().sort_values(ascending=False).iloc[:35]

In [72]:
data_df.isnull().sum().sort_values(ascending=False).iloc[:20]

Since the dataset is large, I could probably delete the rows that have null values. As a side note, though: if a class to be predicted occurs only rarely, it is better to replace the null values with the mean instead of deleting those rows.

In [73]:
## Top 5 fields of the test data with the most null values
print(test_df.isnull().sum().sort_values(ascending=False).iloc[:5])
## Top 5 fields of the train data with the most null values
print(data_df.isnull().sum().sort_values(ascending=False).iloc[:5])

In [74]:
#Here we are getting a lot of features with a lot of missing values 
data_df.isnull().sum()

In [75]:
num_train = data_df.shape[0]
print("No. of rows of train_df= " ,num_train)

In [76]:
num_test = test_df.shape[0]
print("No. of rows test_df = ",num_test)

In [77]:
print("Columns of train_df =",data_df.columns)

In [78]:
Y = data_df['SalePrice']
#separating the prediction column 


In [79]:
# and dropping it from data_df
data_df.drop('SalePrice', axis = 1 , inplace = True)

In [80]:
data_df.columns

In [81]:
# Numerical features: every column whose dtype is not 'object'.
column_dtypes = data_df.dtypes
numerical_features = column_dtypes[column_dtypes != 'object'].index
print(numerical_features)
print("Number of numerical_features = " , len(numerical_features))

In [82]:
# Categorical features: the columns stored with the 'object' dtype.
is_object = data_df.dtypes == 'object'
categorical_features = data_df.dtypes[is_object].index
print(categorical_features)
print("Number of categorical_features = " , len(categorical_features))

### Handling NULL values

filling NA values with mean of the data in the column

In [83]:
# data_df['LotFrontage'].value_counts()

In [84]:
data_df['LotFrontage']= data_df['LotFrontage'].fillna(data_df['LotFrontage'].mean())
data_df['LotFrontage'].value_counts()

In [85]:
test_df['LotFrontage']= test_df['LotFrontage'].fillna(test_df['LotFrontage'].mean())
test_df['LotFrontage'].value_counts()

In [86]:
# Impute missing values in the training data.
# Columns in train_mode_cols are filled with their most frequent value (mode);
# columns in train_mean_cols are filled with their column mean.
# Note that the mode list contains numeric columns too (e.g. MasVnrArea).
train_mode_cols = [
    'BsmtCond', 'BsmtQual', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'MasVnrType', 'MasVnrArea', 'BsmtExposure',
    'BsmtFinType2', 'Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtFinType1',
    'SaleType', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional',
]
train_mean_cols = [
    'GarageCars', 'GarageArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF',
]

for col in train_mode_cols:
    data_df[col] = data_df[col].fillna(data_df[col].mode()[0])

for col in train_mean_cols:
    data_df[col] = data_df[col].fillna(data_df[col].mean())

In [87]:
# Impute missing values in the test data, using statistics of the test data itself.
# Columns in test_mode_cols are filled with their most frequent value (mode);
# columns in test_mean_cols are filled with their column mean.
# Note that the mode list contains numeric columns too (e.g. MasVnrArea).
test_mode_cols = [
    'BsmtCond', 'BsmtQual', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'MasVnrType', 'MasVnrArea', 'BsmtExposure',
    'BsmtFinType2', 'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd',
    'BsmtFinType1', 'SaleType', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual',
    'Functional',
]
test_mean_cols = [
    'GarageCars', 'GarageArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF',
]

for col in test_mode_cols:
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

for col in test_mean_cols:
    test_df[col] = test_df[col].fillna(test_df[col].mean())

In [88]:
data_df

In [89]:
test_df

In [90]:
data_df.isnull().sum()

In [91]:
test_df.isnull().sum()

Dropping the columns that are mostly missing (less than 50 % of their values present), along with `GarageYrBlt` and `Id`.

In [92]:
data_df.drop(['Alley' , 'PoolQC' , 'Fence' , 'MiscFeature' , 'GarageYrBlt' , 'Id'] , axis = 1 , inplace = True)

In [93]:
test_df.drop(['Alley' , 'PoolQC' , 'Fence' , 'MiscFeature' , 'GarageYrBlt' , 'Id'] , axis = 1 , inplace = True)

In [94]:
# Visualise the remaining nulls as a heat map, drawn from a pandas copy of the frame.
train1 = data_df.to_pandas()
sns.heatmap(train1.isnull())

In [95]:
# True if any null value remains anywhere in the training frame.
# The trailing any() must be called; without the parentheses the cell only
# displays the bound method and never performs the check.
data_df.isnull().any().any()

In [96]:
# True if any null value remains anywhere in the test frame.
# The trailing any() must be called; without the parentheses the cell only
# displays the bound method and never performs the check.
test_df.isnull().any().any()

In [97]:
# def remove_outliers(dataset, threshold, columns=None, removed = False):
#     """ 
#     Z-score method.
#     Function returns a dataframe without rows labeled as 'outliers' according to the given threshold.  
#     ---------------
#     If columns = None, transform all numerical columns.
#     If removed = True, return also dataframe with removed rows.
#     """
#     if columns==None:
#         numerics = ['int64','float64']
#         columns = dataset.select_dtypes(include=numerics).columns
    
#     tmp = dataset.copy()
#     z = np.abs(stats.zscore(tmp[columns]))
#     outliers = [row.any() for row in (z > threshold)]  
#     outliers_idxs = tmp.index[outliers].tolist()
#     print("Number of removed rows = {}".format(len(outliers_idxs)))
#     if removed: return dataset.drop(outliers_idxs), tmp.loc[outliers]
#     else: return dataset.drop(outliers_idxs)
#removing outliers

In [98]:
final_df = pd.concat([data_df , test_df] , axis = 0)

In [99]:
final_df

In [100]:
final_df.shape

In [101]:
# pandas.get_dummies() is used for data manipulation. It converts categorical data into dummy or indicator variables.
final_df = pd.get_dummies(final_df)

In [102]:
final_df.shape

In [103]:
# Split the combined, one-hot-encoded frame back into its train and test parts.
# final_df was built by concatenating the train rows first and the test rows
# after them, so the test rows start at position num_train. The previous start
# of num_test+1 was only correct when num_train happened to equal num_test+1.
train_df = final_df.iloc[:num_train, :]
test_df = final_df.iloc[num_train:, :]

In [104]:
X= train_df
X.shape

In [105]:
Y.shape

### Model Creation - 
#### Compare the MSE, MAE, and R2 score of all five cuML LinearRegression algorithms for house price prediction.

In [106]:
# Splitting the data to train, test 
X_train , X_test , y_train , y_test = train_test_split(X , Y , test_size=0.29 , random_state = 42)

In [107]:
# True if any null value remains anywhere in the feature matrix.
# The trailing any() must be called; without the parentheses the cell only
# displays the bound method and never performs the check.
X.isnull().any().any()

LinearRegression is a simple machine learning model where the response y is modelled by a linear combination of the predictors in X.

cuML’s LinearRegression expects either a cuDF DataFrame or a NumPy matrix and provides 2 algorithms SVD and Eig to fit a linear model. SVD is more stable, but Eig (default) is much faster.

In [108]:
# Creating the model and fitting it 
linreg = LinearRegression(fit_intercept = True , normalize = False , algorithm = 'svd-jacobi')

In [109]:
split = linreg.fit(X,Y)

In [110]:
print("R-Squared Value for Training Set: {:.3f}".format(linreg.score(X_train.astype('float32'), y_train.astype('float32'))))


In [111]:
test_df.shape

In [112]:
#Predictions
y_pred_split=linreg.predict(test_df)
print(y_pred_split)
print(y_pred_split.shape)

`algorithm`: {‘svd’, ‘eig’, ‘qr’, ‘svd-qr’, ‘svd-jacobi’} (default = ‘eig’)
Choose an algorithm:

‘svd’ - alias for svd-jacobi;

‘eig’ - use an eigendecomposition of the covariance matrix;

‘qr’ - use QR decomposition algorithm and solve Rx = Q^T y

‘svd-qr’ - compute SVD decomposition using QR algorithm

‘svd-jacobi’ - compute SVD decomposition using Jacobi iterations.

Among these algorithms, only ‘svd-jacobi’ supports the case when the number of features is larger than the sample size; this algorithm is force-selected automatically in such a case.

For a broad range of inputs, ‘eig’ and ‘qr’ are usually the fastest, followed by ‘svd-jacobi’ and then ‘svd-qr’. In theory, SVD-based algorithms are more stable.

In [113]:
# making a list of the algorithms 
algorithm = ['svd', 'eig', 'svd-qr', 'svd-jacobi']

In [114]:
import cuml

# Evaluation metrics for every algorithm listed in `algorithm`:
# fit a LinearRegression on the training split, predict on the held-out split,
# and print MSE, R2 and MAE.
# The loop variable has a real name instead of `_`: `_` signals an unused value
# and, in IPython, assigning to it overrides the last-output variable.
for algo_name in algorithm:
    print("Algorithm used by me : " , algo_name)
    lr = LinearRegression(fit_intercept = True, normalize = False, algorithm = algo_name)
    lr.fit(X_train, y_train)
    preds = lr.predict(X_test)
    # Targets and predictions are cast to the same dtype before each metric call.
    print("Mean Squared Error is :")
    print(cuml.metrics.regression.mean_squared_error(y_test.astype('int64'),preds.astype('int64')))
    print("R2 Score is :")
    print(cuml.metrics.regression.r2_score(y_test.astype('float32'),preds.astype('float32')))
    print("Mean Absolute Error is :")
    print(cuml.metrics.regression.mean_absolute_error(y_test.astype('int64'),preds.astype('int64')))
    print("\n")

While using the ‘qr’ (QR decomposition) algorithm, I received an error saying "Error: cuDF Series has missing/null values, which are not supported by cuML."

But as you can see above, I checked multiple times for missing values in the training and test data and found none.

In [115]:
sample_sub = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
sample_sub.head()

In [116]:
#the column where we making predictions and result is checked 
sample_sub['SalePrice']

In [117]:
#updating 
sample_sub['SalePrice'] = y_pred_split

In [118]:
#checking 
sample_sub['SalePrice']

In [119]:
sample_sub.to_csv('submission1.csv' , index = False)

### For Competition
Applying other models

Ridge Regression

In [120]:
import cupy as cp

# Ridge regression with a very small regularisation strength, using the "eig" solver.
alpha = cp.array([1e-5])
ridge1 = Ridge(
    alpha=alpha,
    fit_intercept=True,
    normalize=False,
    solver="eig",
)
# Fit on the full training data, then predict for the Kaggle test frame.
model1 = ridge1.fit(X, Y)
y_pred_ridge1 = model1.predict(test_df)

In [121]:
y_pred_ridge1

In [122]:
# Ridge regression with the same small regularisation strength, using the "svd" solver.
alpha = cp.array([1e-5])
ridge2 = Ridge(
    alpha=alpha,
    fit_intercept=True,
    normalize=False,
    solver="svd",
)
# Fit on the full training data, then predict for the Kaggle test frame.
model2 = ridge2.fit(X, Y)
y_pred_ridge2 = model2.predict(test_df)

In [123]:
y_pred_ridge2

In [124]:
#submission
sample_sub['SalePrice'] = y_pred_ridge1

sample_sub.to_csv('submission2.csv', index=False)

In [125]:
#Submission
sample_sub['SalePrice'] = y_pred_ridge2

sample_sub.to_csv('submission3.csv', index=False)

Lasso Regression

In [126]:
ls = Lasso(alpha = 0.1)
result_lasso = ls.fit(X, Y)
y_pred_lasso=result_lasso.predict(test_df)

In [127]:
y_pred_lasso

In [128]:
    #Submission
sample_sub['SalePrice'] = y_pred_lasso
sample_sub.to_csv('submission4.csv', index=False)

Random Forest

In [129]:
forest_regressor = RandomForestRegressor(n_estimators = 250)
forest_regressor.fit(X.values.astype('float32'), Y.astype('float32'))

In [130]:
y_pred_random= forest_regressor.predict(test_df)

In [131]:
y_pred_random

In [132]:
print("R-Squared Value for Training Set: {:.3f}".format(forest_regressor.score(X.astype('float32'), Y.astype('float32'))))

In [133]:
sample_sub['SalePrice'] = y_pred_random
sample_sub.to_csv('submission5.csv', index=False)