In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
#
#from sklearn.compose import ColumnTransformer
#from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
#from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Read the training set (indexed by house Id) and separate the target column
# from the feature matrix.
X = pd.read_csv('/content/drive/My Drive/Kaggle/train.csv', index_col='Id')
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])
# Read the test set the same way; it is loaded from a separate file and is not
# split for a target here.
X_test = pd.read_csv('/content/drive/My Drive/Kaggle/test.csv', index_col='Id')

# Report the dimensions of both feature frames.
print("The shape pf training data is ", X.shape)
print("The shape of test is ", X_test.shape)

# Partition the feature names by dtype: 'object' columns are treated as
# categorical, int64/float64 columns as numeric.
object_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']
num_cols = [name for name, dtype in X.dtypes.items()
            if dtype in ['int64', 'float64']]
# Column counts: everything that is not 'object' counts as numeric here.
object_count = len(object_cols)
num_count = X.shape[1] - object_count

# Work on independent copies so the original frame X stays untouched.
X_numeric = X.loc[:, num_cols].copy()
X_categoric = X.loc[:, object_cols].copy()

################## PRE-PROCESSING - NUMERIC ####################################
# NOTE: `np.nan` is used instead of the `np.NaN` alias, which was removed in
# NumPy 2.0 and raises AttributeError there.
# LotFrontage: replace missing values with the training-set mean.
my_imputer1 = SimpleImputer(missing_values=np.nan, strategy='mean')
# MasVnrArea: replace missing values with 0.
my_imputer2 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
X_numeric['LotFrontage'] = my_imputer1.fit_transform(X_numeric[['LotFrontage']])
X_numeric['MasVnrArea'] = my_imputer2.fit_transform(X_numeric[['MasVnrArea']])
# Drop GarageYrBlt from the numeric feature set.
num_cols = [i for i in num_cols if i != 'GarageYrBlt']
X_numeric = X_numeric[num_cols]

# Apply the imputers fitted on the training data to the test data
# (transform only, so the test set does not influence the fill values).
X_test['LotFrontage'] = my_imputer1.transform(X_test[['LotFrontage']])
X_test['MasVnrArea'] = my_imputer2.transform(X_test[['MasVnrArea']])

# Keep the test columns in line with the training columns.
X_test.drop(['GarageYrBlt'], axis=1, inplace=True)

################## PRE-PROCESSING - CATEGORICAL ################################
# Train and test must go through the *same* fitted transformations, otherwise
# a category can be filled or encoded differently in the two frames and the
# model's learned splits become meaningless on the test data.
my_imputer3 = SimpleImputer(strategy='constant', fill_value='None')
my_imputer4 = SimpleImputer(strategy='most_frequent')
my_label_encoder5 = LabelEncoder()

# MasVnrType: missing -> the literal category 'None'.
X_categoric['MasVnrType'] = my_imputer3.fit_transform(X_categoric[['MasVnrType']])
X_test['MasVnrType'] = my_imputer3.transform(X_test[['MasVnrType']])
# Electrical: missing -> the most frequent category of the training data.
X_categoric['Electrical'] = my_imputer4.fit_transform(X_categoric[['Electrical']])
X_test['Electrical'] = my_imputer4.transform(X_test[['Electrical']])

# Every other categorical column: missing -> 'None', in both train and test.
# (my_imputer3 is re-fitted here on the multi-column frame; the single-column
# MasVnrType transform above must therefore happen before this point.)
object_cols_le = [i for i in object_cols if i not in ('MasVnrType', 'Electrical')]
X_categoric[object_cols_le] = my_imputer3.fit_transform(X_categoric[object_cols_le])
X_test[object_cols_le] = my_imputer3.transform(X_test[object_cols_le])

# Label-encode each categorical column with ONE mapping shared by train and
# test. The encoder is fitted on the union of both columns' values so that
# (a) the same string always maps to the same integer, and
# (b) categories that occur only in the test data (e.g. an imputed 'None')
#     do not raise "unseen label" errors at transform time.
for col in object_cols:
    my_label_encoder5.fit(pd.concat([X_categoric[col], X_test[col]]))
    X_categoric[col] = my_label_encoder5.transform(X_categoric[col])
    X_test[col] = my_label_encoder5.transform(X_test[col])

# Combining categorical and numerical columns
X_encoded = pd.concat([X_categoric, X_numeric], axis=1)

# Align the test columns to the training columns. DataFrame.align returns a
# (left, right) tuple; only the aligned right-hand frame (the test data) is
# kept, so X_test stays a DataFrame with X_encoded's column order.
X_test = X_encoded.align(X_test, join='left', axis=1)[1]
#print("The training sample is ",X_encoded)
#print("The test sample is ", X_test)
# Hold out 20% of the encoded training rows for validation; the fixed
# random_state makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_encoded, y, train_size=0.8, test_size=0.2, random_state=0)
# Stratified Sampling (disabled experiment, kept for reference)
#sample = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=0)
#train_index, test_index = sample.split(X_encoded,y)
#X_train, X_valid = X_encoded[train_index], X_encoded[test_index]
#y_train, y_valid = y[train_index], y[test_index]

####################### MODEL XGB REGRESSOR ##############################################
# Gradient-boosted trees: 1200 trees of depth 3 with learning rate 0.07.
my_model = XGBRegressor(n_estimators=1200, learning_rate=0.07, base_score=0.1, max_depth=3)
my_model.fit(X_train, y_train)
# Predict on the held-out validation rows.
predictions = my_model.predict(X_valid)
# Mean absolute error, with arguments in the documented (y_true, y_pred) order.
mae_3 = mean_absolute_error(y_valid, predictions)

print("Mean Absolute Error:" , mae_3)

# NOTE: `predictions` above holds validation-set predictions. A submission
# file needs predictions for the test rows (my_model.predict(X_test)) so that
# their count matches X_test.index.
#output = pd.DataFrame({'Id': X_test.index,
#                       'SalePrice': predictions})
#output.to_csv('submission.csv', index=False)

The shape pf training data is  (1460, 79)
The shape of test is  (1459, 79)
Mean Absolute Error: 15594.612906678081
