In [9]:
import cupy as np 
import cudf as pd
from math import sqrt
from scipy.stats import skew
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
import cuml
from cuml import LinearRegression
from sklearn import linear_model
from sklearn import preprocessing

In [10]:
# global matplotlib look: fivethirtyeight theme and a 10x6 default figure
plt.style.use('fivethirtyeight')
plt.rcParams.update({'figure.figsize': (10, 6)})

**Exploratory Data Analysis**

In this section, an initial investigation of the data is performed to develop an understanding of it, discover patterns, and spot anomalies.

In [11]:
# read both competition CSV files into dataframes
train_path = '../input/house-prices-advanced-regression-techniques/train.csv'
test_path = '../input/house-prices-advanced-regression-techniques/test.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [12]:
# preview the first five rows of the training set
train.head(5)

In [13]:
# report row and column counts for both datasets
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape
print('No. of records in train dataset: ', train_rows)
print('No. of columns in train dataset: ', train_cols)
print('No. of records in test dataset: ', test_rows)
print('No. of columns in test dataset: ', test_cols)

In [14]:
# count the missing cells in each dataset (per-column counts, then summed)
train_missing = train.to_pandas().isna().sum().sum()
test_missing = test.to_pandas().isna().sum().sum()
print ('Total missing values in train set', train_missing)
print ('Total missing values in test set', test_missing)

In [15]:

# summary statistics of the target column
sale_price_column = train['SalePrice']
sale_price_column.describe()

The output above shows that the average sale price of a house is close to 180,000, with most of the values falling within the 130,000 to 215,000 range. The next step is to examine the correlations between the features and the target.

In [16]:
# Correlation of every numeric column with SalePrice, strongest positive first.
numeric_cols = train.select_dtypes(include = [np.number])
corr = numeric_cols.to_pandas().corr()
saleprice_corr = corr['SalePrice'].sort_values(ascending = False)

# One statement per print: the former `print(...), print(...)` form built a
# tuple, so the cell also echoed a stray `(None, None)` as its output.
print ('The Most Correlated Features with SalePrice:')
print (saleprice_corr[:10], '\n')

# "Uncorrelated" means closest to zero, so rank by absolute correlation; the
# tail of the descending list holds the most *negative* correlations instead.
weakest = saleprice_corr.abs().sort_values().index[:5]
print ('The Most Uncorrelated Features with SalePrice:')
print (saleprice_corr[weakest])

The features most correlated with sale price are the overall quality score (79%), above-ground living area (71%), garage area (64%), and garage capacity in cars (62%). The next step is to plot individual variables against SalePrice in scatter plots to check for outliers, because outliers can affect the regression model by pulling the estimated regression line away from the true population regression line.

In [17]:
# living area vs. sale price, to eyeball outliers
living_area = train['GrLivArea'].to_array()
sale_price = train['SalePrice'].to_array()
plt.scatter(x = living_area, y = sale_price)
plt.ylabel('SalePrice')
plt.xlabel('GrLivArea (Above grade "ground" living area square feet)')

At first glance, increases in living area correspond to increases in price, with a few outliers.

In [18]:

# garage area vs. sale price, to eyeball outliers
garage_area = train['GarageArea'].to_array()
sale_price = train['SalePrice'].to_array()
plt.scatter(x = garage_area, y = sale_price)
plt.ylabel('SalePrice')
plt.xlabel('GarageArea')

So there are many homes with 0 for GarageArea and there are a few outliers as well!

**Data Preprocessing**

In this section the data is prepared (transformed, encoded, etc.) to make it suitable for building and training a machine learning model. I chose to manually remove certain extreme outliers from the dataset to produce a better fit.

In [19]:
# keep only the rows whose GrLivArea is below 4500
grlivarea_ok = train['GrLivArea'] < 4500
train = train[grlivarea_ok]

In [20]:
# keep only the rows whose GarageArea is below 1200
garagearea_ok = train['GarageArea'] < 1200
train = train[garagearea_ok]

In [21]:
# remove every column in which more than 80% of the rows are missing
train_percentage = train.isnull().sum() / train.shape[0]
mostly_missing_train = train_percentage[train_percentage > 0.80]
print (mostly_missing_train)
index = mostly_missing_train.index
train = train.drop(index.to_array(), axis = 1)

In [22]:
# do the same with test data
# The missing-value rates of the test set are still printed for inspection.
test_percentage = test.isnull().sum() / test.shape[0]
print (test_percentage[test_percentage > 0.80])
# Drop exactly the columns that were removed from the training set (`index`)
# instead of re-deriving the list from the test set's own missing rates: the
# fitted model can only be applied to test data with the same feature columns.
test = test.drop(index.to_array(), axis = 1)

In [23]:
# integer-encode every object-typed (string) column of the training set
le = preprocessing.LabelEncoder()
train = train.to_pandas()
for name in train.columns:
    if train[name].dtypes != 'O':
        continue
    # cast to str first so missing entries become an ordinary category label
    train[name] = le.fit_transform(train[name].astype(str))

In [24]:
# integer-encode every object-typed (string) column of the test set
# NOTE(review): the encoder is refit on the test values of each column, so the
# integer assigned to a category here is not guaranteed to match the one the
# same category received in the training set - confirm this is acceptable.
test = test.to_pandas()
for name in test.columns:
    if test[name].dtypes != 'O':
        continue
    test[name] = le.fit_transform(test[name].astype(str))

There are many ways to handle NaN values, such as filling them with the mean or the median; however, strings cannot be averaged and have no median. Another way to fill missing values is to impute them according to each value's probability of occurring in the dataset, which avoids single-valued imputation that can harm the quality of inference and prediction.

In [25]:
# Fill missing values based on probability of occurrence, column by column.
#
# Each gap is filled with a value drawn from the observed (non-missing) values
# of the SAME column; sampling with replacement picks each value with a
# probability equal to its relative frequency in that column. The previous
# version built the value/frequency table from every cell of the whole frame,
# so a gap could be filled with a value belonging to an unrelated column.
# Only `train` is imputed here.
for column in train.columns:
    missing_mask = train[column].isna()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        continue  # nothing to fill in this column
    observed = train[column].dropna()
    if observed.empty:
        continue  # no observed values to sample from
    # fixed seed so that re-running the notebook reproduces the same fill
    fill_values = observed.sample(n = n_missing, replace = True, random_state = 42)
    train.loc[missing_mask, column] = fill_values.to_numpy()

In [26]:
# log(feature + 1) on every training column whose skewness is above .75
column_skew_train = train.apply(lambda col: skew(col.dropna()))
skewed_train = column_skew_train[column_skew_train > .75]
sti = skewed_train.index
tsti = train[sti]
train[sti] = np.log1p(np.asarray(tsti.to_numpy()))

In [27]:
# deal with the skewness in the test data
# Apply log1p to the same columns that were transformed in the training set
# (`skewed_train`), not to a list re-derived from the test set's own skewness;
# otherwise a feature could be on the log scale in one set and raw in the other.
# The target is excluded (it is not a test feature), as is any column that is
# no longer present in `test`.
skewed_test = skewed_train.drop(labels = ['SalePrice'], errors = 'ignore')
skewed_test = skewed_test[skewed_test.index.isin(test.columns)]
var_ti = test[skewed_test.index]
ans = np.log1p(np.asarray(var_ti.to_numpy()))
test[skewed_test.index] = ans

**Modelling**

I will perform a simple linear regression on the dataset to predict house prices. In order to train the regression model, we first need to split the data into X, which contains the features to train on, and y, which contains the target variable, in this case the SalePrice column.

In [28]:
# features: everything except the target and the row identifier
X = train.drop(columns = ['SalePrice', 'Id'])
# target as an (n_rows, 1) array
y = train[['SalePrice']].to_numpy().reshape(-1, 1)

In [29]:
# standardize the feature matrix; the fitted scaler is kept in `scaler`
# NOTE(review): the scaler is fitted before the train/test split below
feature_matrix = X.to_numpy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(feature_matrix)

In [30]:
# display the standardized feature matrix
X_scaled

In [31]:
# hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size = 0.2,
    random_state = 42,
)

In [32]:
# shapes of the four split arrays, in the order X_train, X_test, y_train, y_test
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)

In [33]:
# linear regression using the "eig" solver
lr = LinearRegression(algorithm = "eig", fit_intercept = True, normalize = False)

In [34]:
# train the "eig" model on the training split
reg = lr.fit(X = X_train, y = y_train)

In [35]:
# predict the held-out rows with the "eig" model
preds = lr.predict(X = X_test)

In [36]:
# MSE, R^2 and MAE of the "eig" model on the held-out rows
mse_e, rs_e, mae_e = (
    metric(y_test, preds)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)

In [37]:
# predicted vs. actual for the "eig" model; partial transparency reveals overlap
eig_title = 'Linear Regression Model eig'
plt.scatter(preds, y_test, color = 'b', alpha = 0.7)
plt.xlabel('Predicted Price')
plt.ylabel('Actual Price')
plt.title(eig_title)

In [38]:
# linear regression using the "svd" solver
lrs = LinearRegression(algorithm = "svd", fit_intercept = True, normalize = False)

In [39]:
# train the "svd" model on the training split
regs = lrs.fit(X = X_train, y = y_train)

In [40]:
# predict the held-out rows with the "svd" model
predss = lrs.predict(X = X_test)

In [41]:
# MSE, R^2 and MAE of the "svd" model on the held-out rows
mse_s, rs_s, mae_s = (
    metric(y_test, predss)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)

In [42]:
# Predicted vs. actual prices for the "svd" model.
# Plot `predss` (this model's predictions); the cell previously plotted `preds`,
# which holds the "eig" model's output, under the "svd" title.
# alpha helps to show overlapping data
plt.scatter(predss, y_test, alpha = 0.7, color = 'b')
plt.xlabel('Predicted Price')
plt.ylabel('Actual Price')
plt.title('Linear Regression Model svd')

In [43]:
# linear regression using the "qr" solver
lrsq = LinearRegression(algorithm = "qr", fit_intercept = True, normalize = False)

In [44]:
# train the "qr" model on the training split
regsq = lrsq.fit(X = X_train, y = y_train)

In [45]:
# predict the held-out rows with the "qr" model
predsq = lrsq.predict(X = X_test)

In [46]:
# MSE, R^2 and MAE of the "qr" model on the held-out rows
mse_q, rs_q, mae_q = (
    metric(y_test, predsq)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)

In [47]:
# Predicted vs. actual prices for the "qr" model.
# Plot `predsq` (this model's predictions); the cell previously plotted `preds`,
# which holds the "eig" model's output, under the "qr" title.
# alpha helps to show overlapping data
plt.scatter(predsq, y_test, alpha = 0.7, color = 'b')
plt.xlabel('Predicted Price')
plt.ylabel('Actual Price')
plt.title('Linear Regression Model qr')

In [48]:
# linear regression using the "svd-qr" solver
lrssq = LinearRegression(algorithm = "svd-qr", fit_intercept = True, normalize = False)

In [49]:
# train the "svd-qr" model on the training split
regssq = lrssq.fit(X = X_train, y = y_train)

In [50]:
# predict the held-out rows with the "svd-qr" model
predssq = lrssq.predict(X = X_test)

In [51]:
# MSE, R^2 and MAE of the "svd-qr" model on the held-out rows
mse_sq, rs_sq, mae_sq = (
    metric(y_test, predssq)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)

In [52]:
# Predicted vs. actual prices for the "svd-qr" model.
# Plot `predssq` (this model's predictions); the cell previously plotted `preds`,
# which holds the "eig" model's output, under the "svd-qr" title.
# alpha helps to show overlapping data
plt.scatter(predssq, y_test, alpha = 0.7, color = 'b')
plt.xlabel('Predicted Price')
plt.ylabel('Actual Price')
plt.title('Linear Regression Model svd-qr')

In [53]:
# linear regression using the "svd-jacobi" solver
lrssj = LinearRegression(algorithm = "svd-jacobi", fit_intercept = True, normalize = False)

In [54]:
# train the "svd-jacobi" model on the training split
regssj = lrssj.fit(X = X_train, y = y_train)

In [55]:
# predict the held-out rows with the "svd-jacobi" model
predssj = lrssj.predict(X = X_test)

In [56]:
# MSE, R^2 and MAE of the "svd-jacobi" model on the held-out rows
mse_sj, rs_sj, mae_sj = (
    metric(y_test, predssj)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)

In [57]:
# Predicted vs. actual prices for the "svd-jacobi" model.
# Plot `predssj` (this model's predictions); the cell previously plotted `preds`,
# which holds the "eig" model's output, under the "svd-jacobi" title.
# alpha helps to show overlapping data
plt.scatter(predssj, y_test, alpha = 0.7, color = 'b')
plt.xlabel('Predicted Price')
plt.ylabel('Actual Price')
plt.title('Linear Regression Model svd-jacobi')

In [58]:
# one row per solver with its three evaluation metrics
df = pd.DataFrame({
    'key': ['eig','svd','qr','svd-qr','svd-jacobi'],
    'mse': [mse_e,mse_s,mse_q,mse_sq,mse_sj],
    'r2s': [rs_e,rs_s,rs_q,rs_sq,rs_sj],
    'mae': [mae_e,mae_s,mae_q,mae_sq,mae_sj],
})

In [59]:
# display the per-solver metrics table
df

**Submission**

In [60]:
# start the submission frame with the test-set Id column cast to int
# (`pd` is cudf in this notebook, so `submission` is a cudf DataFrame)
submission = pd.DataFrame()
submission['Id'] = test['Id'].astype(int)

In [61]:
# test features without the Id column, with gaps filled by interpolation
temp = test.drop(columns = ['Id']).interpolate()

In [62]:
# The model was fitted on standardized features (`scaler` output), so the test
# features must go through the same fitted scaler before prediction; feeding the
# raw values, as before, puts them on a different scale than the model expects.
# Selecting `X.columns` also enforces the training column set and order.
predictions = reg.predict(scaler.transform(temp[X.columns].to_numpy()))

In [63]:
# Map the predictions back to the price scale. Skewed columns were transformed
# with log1p, whose exact inverse is expm1 (plain exp is off by one).
# NOTE(review): assumes SalePrice was among the log1p-transformed columns.
# ravel() guarantees a 1-D array for the column assignment.
predictions = np.expm1(np.asarray(predictions)).ravel()
submission['SalePrice'] = predictions

In [64]:
# write the submission frame to submission.csv without the index column
submission.to_csv('submission.csv', index = False)