In [1]:
#installing rapids
import sys
!cp ../input/rapids-library/rapids.21.06/opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [3]:

import cudf as pd
import cupy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import cuml

from cuml.model_selection import train_test_split
from cuml.preprocessing import StandardScaler
from cuml.metrics import r2_score, mean_absolute_error, mean_squared_error

from cuml.linear_model import LinearRegression

The test and the training datasets are loaded

In [41]:
# Read the competition's train/test CSV files into (cudf) DataFrames.
INPUT_DIR = '../input/house-prices-advanced-regression-techniques'
train_df = pd.read_csv(f'{INPUT_DIR}/train.csv')
test_df = pd.read_csv(f'{INPUT_DIR}/test.csv')

In [42]:
# Report (rows, columns) for both frames.
for label, frame in (('train', train_df), ('test', test_df)):
    print(f'{label} df shape is {frame.shape}')

In [43]:
# Column names, dtypes and non-null counts of the raw training frame.
train_df.info()

**Handling missing data in the dataset**

In [44]:
# Fraction of missing values per column; columns above 40% are flagged.
missing_df = train_df.isnull().sum() / len(train_df)
train_drop = missing_df[missing_df > 0.40]

# Same statistic for the test frame (computed for inspection; not used below).
missing_df_2 = test_df.isnull().sum() / len(test_df)
test_drop = missing_df_2[missing_df_2 > 0.40]

# The columns flagged on the TRAINING frame are removed from both frames so
# that train and test keep the same set of features.
sparse_columns = train_drop.keys().to_array()
for frame in (train_df, test_df):
    frame.drop(columns=sparse_columns, inplace=True)

In [45]:
# Remaining per-column null counts in the training frame after the drop.
train_df.isnull().sum()

In [46]:
# Give the training frame a fresh 0..n-1 index and remove the identifier
# column from the test frame (the training frame keeps its 'Id' column here).
train_df = train_df.reset_index(drop=True)
test_df = test_df.drop(columns=['Id'])

**Counting number of numerical features**

In [47]:
# Every training column whose dtype is not object ('O') counts as numerical.
numerical_features = [column for column, dtype in train_df.dtypes.items() if dtype != 'O']
print(f'Number of Numerical Features are {len(numerical_features)}')

**Counting number of categorical features**

In [48]:
# Every training column with object ('O') dtype counts as categorical.
categorical_features = [column for column, dtype in train_df.dtypes.items() if dtype == 'O']
print(f'Number of Categorical Features are {len(categorical_features)}')

In [49]:
# Numerical columns whose name contains 'Year' or 'Yr'.
year_features = [
    column for column in numerical_features
    if any(tag in column for tag in ('Year', 'Yr'))
]
year_features

In [50]:
# Non-year numerical columns with fewer than 25 distinct values are treated
# as discrete.
discrete_features = [
    column for column in numerical_features
    if column not in year_features and len(train_df[column].unique()) < 25
]
print("Discrete Variables Count: ", len(discrete_features))

In [51]:
# Whatever is numerical but neither discrete nor a year column is continuous.
non_continuous = set(discrete_features) | set(year_features)
continuous_features = [column for column in numerical_features if column not in non_continuous]
print("Continuous Variables Count: ", len(continuous_features))


In [15]:
# Preview the first rows of the discrete numerical columns.
train_df[discrete_features].head()

In [16]:
# Re-inspect dtypes and non-null counts of the training frame at this stage.
train_df.info()

In [17]:
# Preview the first rows of the categorical (object dtype) columns.
train_df[categorical_features].head()

In [18]:
# Print how many distinct values each categorical column has.
# The frame is only read here, so no per-iteration copy of train_df is needed.
for feature in categorical_features:
    print(f'The feature is {feature} and no of categories are {len(train_df[feature].unique())}')
    

In [19]:

# Categorical (object dtype) training columns that contain at least one null.
categorical_features_nan = [feature for feature in train_df.columns if train_df[feature].isnull().sum() > 0 and train_df[feature].dtype == 'O']

# Report the share of missing values per column.  `isnull().mean()` is a
# fraction in [0, 1]; scale it by 100 so the number matches the '%' label.
for feature in categorical_features_nan:
    missing_pct = 100 * float(train_df[feature].isnull().mean())
    print(f"{feature}: {round(missing_pct, 2)}% missing values")

In [20]:
def replace_missing_nan_cat(dataset,features):
    """Return a copy of ``dataset`` with nulls in ``features`` set to 'Missing'.

    Parameters
    ----------
    dataset : DataFrame
        Frame to clean; it is not modified.
    features : list of str
        Columns whose missing values are replaced by the string 'Missing'.

    Returns
    -------
    DataFrame
        A copy of ``dataset`` in which only the ``features`` columns changed.
    """
    filled = dataset.copy()
    filled[features] = dataset[features].fillna('Missing')
    return filled

In [21]:
# Fill nulls in the categorical columns of both frames with 'Missing'.
train_df, test_df = (
    replace_missing_nan_cat(frame, categorical_features)
    for frame in (train_df, test_df)
)

In [22]:
# Show the first 100 rows of the categorical columns of the training frame.
train_df[categorical_features].head(100)

In [23]:
# Re-check the previously incomplete categorical columns.  The null share is
# a fraction in [0, 1]; scale it by 100 so the number matches the '%' label.
for feature in categorical_features_nan:
    missing_pct = 100 * float(train_df[feature].isnull().mean())
    print(f"{feature}: {round(missing_pct, 2)}% missing values")

In [24]:

# Numerical (non-object) training columns that contain at least one null.
numerical_features_nan = [feature for feature in train_df.columns if train_df[feature].isnull().sum() > 0 and train_df[feature].dtype != 'O']

# Impute the training columns with their own median.
for feature in numerical_features_nan:
    train_df[feature] = train_df[feature].fillna(train_df[feature].median())

# The test frame was previously left with its numeric nulls.  Impute every
# incomplete numeric test column with the TRAINING median so no statistic is
# learned from the test data.  The cast to float64 lets a fractional median be
# stored even when the column was read as an integer type.
for feature in test_df.columns:
    if feature not in train_df.columns:
        continue
    if test_df[feature].dtype == 'O' or train_df[feature].dtype == 'O':
        continue
    if test_df[feature].isnull().sum() > 0:
        test_df[feature] = test_df[feature].astype('float64').fillna(train_df[feature].median())

In [25]:
# Per-column null counts for the numeric columns listed in
# numerical_features_nan (the columns that were just median-filled).
print(train_df[numerical_features_nan].isnull().sum())

In [26]:
# Replace each calendar-year column by "years before the sale"
# (YrSold minus the column), in both frames.
age_columns = ['YearBuilt','YearRemodAdd','GarageYrBlt']
for frame in (train_df, test_df):
    for column in age_columns:
        frame[column] = frame['YrSold'] - frame[column]

In [27]:
# Preview the three columns that now hold YrSold minus the original year.
train_df[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

In [28]:
# Apply a natural-log transform to the listed training columns, including the
# target 'SalePrice' (so later predictions/metrics are on the log scale,
# before any further scaling).
# `np` is cupy in this notebook (see the import cell), applied to a column of
# train_df.
# NOTE(review): test_df is not transformed in this cell — confirm the test
# features receive the same transform before the model is applied to them.
num_continuous_features_log=['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']
for feature in num_continuous_features_log:
    train_df[feature] = np.log(train_df[feature])


In [29]:
# Preview the first 10 rows of the categorical columns.
train_df[categorical_features].head(10)

In [30]:
# Distinct values present in the 'MSZoning' column of the training frame.
train_df['MSZoning'].unique()

In [31]:
# Min-max scale the numerical columns of the training frame in place.
# NOTE(review): numerical_features was derived from train_df's non-object
# columns, so it presumably still contains 'Id' and the target 'SalePrice' —
# confirm that scaling the target is intended.
# NOTE(review): the fitted scaler is not applied to test_df in this cell.
from cuml.preprocessing import MinMaxScaler
scale=MinMaxScaler()
train_df[numerical_features]=scale.fit_transform(train_df[numerical_features])

In [32]:
from cuml.preprocessing import LabelEncoder

# Integer-encode every categorical column.
# One encoder per column is fitted on the union of the train and test values
# and then used to transform both frames, so a given category maps to the same
# integer in train and test.  (Fitting a second time on test_df alone, as was
# done before, produces an unrelated mapping whenever the two frames do not
# contain exactly the same set of categories.)
for feature in categorical_features:
    enc = LabelEncoder()
    enc.fit(pd.concat([train_df[feature], test_df[feature]], ignore_index=True))
    train_df[feature] = enc.transform(train_df[feature])
    test_df[feature] = enc.transform(test_df[feature])
train_df[categorical_features].head()

In [33]:

# Features fed to the model; the same list is used for both frames, and the
# training frame additionally keeps the target column.
model_features = [
    "OverallQual", "YearBuilt", "YearRemodAdd", "ExterQual", "TotalBsmtSF",
    "1stFlrSF", "GrLivArea", "FullBath", "TotRmsAbvGrd", "GarageCars",
    "GarageArea", "MSZoning", "Utilities", "BldgType", "Heating",
    "KitchenQual", "SaleCondition", "LandSlope",
]
train_df = train_df[model_features + ["SalePrice"]]
test_df_X = test_df[model_features]
len(train_df.columns)

In [34]:
# Hold out 30% of the labelled rows for evaluation.
# This uses the `train_test_split` already imported from cuml.model_selection
# in the notebook's import cell; re-importing the scikit-learn function here
# shadowed it and pushed GPU (cudf) frames through a CPU-oriented helper.
X = train_df.drop(columns=['SalePrice'])
Y = train_df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)


In [35]:
# Preview the first rows of the training feature matrix.
X_train.head()

In [36]:
# Shapes of the four pieces produced by the split, in order:
# X_train, X_test, y_train, y_test.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [37]:
import cuml
from cuml import LinearRegression

In [38]:
# Fit one cuML linear regression with the 'eig' solver and report its
# parameters, its hold-out predictions and three error metrics.
lr = LinearRegression(fit_intercept=True, normalize=False, algorithm='eig')
reg = lr.fit(X_train, y_train)
preds = lr.predict(X_test)

for heading, value in (
    ("Coefficients:", reg.coef_),
    ("Intercept:", reg.intercept_),
    ("Predictions:", preds),
):
    print(heading)
    print(value)

regression_metrics = cuml.metrics.regression
for heading, metric in (
    ("MSE:", regression_metrics.mean_squared_error),
    ("R2 Score:", regression_metrics.r2_score),
    ("MAE:", regression_metrics.mean_absolute_error),
):
    print(heading)
    print(metric(y_test, preds))

In [39]:
import pandas

# Fit the same linear regression with every solver and compare the hold-out
# metrics.  Results are collected as plain records and turned into a frame
# once, instead of growing a DataFrame row by row with `append`.
algorithm = ['svd', 'eig', 'qr', 'svd-qr', 'svd-jacobi']
records = []
for i in algorithm:
    lr = LinearRegression(fit_intercept = True, normalize = False, algorithm = i)
    reg = lr.fit(X_train,y_train)
    preds = lr.predict(X_test)
    MSE = cuml.metrics.regression.mean_squared_error(y_test,preds)
    R2_Score = cuml.metrics.regression.r2_score(y_test,preds)
    MAE = cuml.metrics.regression.mean_absolute_error(y_test,preds)
    # float() brings device scalars back to the host so pandas can hold them.
    records.append({"Algorithm": i, "MAE": float(MAE), "MSE": float(MSE), "R2 Score": float(R2_Score)})

# Host-side (pandas) table used for plotting, indexed by solver name.
model = pandas.DataFrame(records, columns=["Algorithm","MAE","MSE","R2 Score"]).set_index("Algorithm")
# Keep `models` available as the GPU (cudf) version of the same table.
models = pd.DataFrame.from_pandas(model)

ax = model.plot.bar()
ax.set_title("Linear regression metrics by solver")
ax.set_xlabel("Algorithm")
ax.set_ylabel("Metric value")
plt.show()
