In [1]:
# import important packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from matplotlib import rcParams
import warnings

warnings.filterwarnings("ignore")

# figure size in inches
rcParams["figure.figsize"] = 10, 6
np.random.seed(42)

# Aim:
The aim of this attempt is to use the RandomForest Regression model to compare the scores of the two types of datasets, which are train and test, if the Sales Price (numeric data type)'s outcome is interrelated with the 79 columnn/covariants shown below: 

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('train.csv')
print(train_df.isnull().sum())
print("total number of Rows and Columns in the dataframe: " , train_df.shape)
unique_values = pd.Series({col: train_df[col].unique() for col in train_df})
unique_values.tolist()

print("------------"*10)
print(test_df.isnull().sum())
print("total number of Rows and Columns in the dataframe: " , test_df.shape)
unique_values = pd.Series({col: test_df[col].unique() for col in test_df})
unique_values.tolist()
test_df.info()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64
total number of Rows and Columns in the dataframe:  (1460, 81)
------------------------------------------------------------------------------------------------------------------------
Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64
total number of Rows and Columns in the dataframe:  (1460, 81)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
train_df.corr()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
Id,1.0,0.011156,-0.010601,-0.033226,-0.028365,0.012609,-0.012713,-0.021998,-0.050298,-0.005024,...,-0.029643,-0.000477,0.002889,-0.046635,0.00133,0.057044,-0.006242,0.021172,0.000712,-0.021917
MSSubClass,0.011156,1.0,-0.386347,-0.139781,0.032628,-0.059316,0.02785,0.040581,0.022936,-0.069836,...,-0.012579,-0.0061,-0.012037,-0.043825,-0.02603,0.008283,-0.007683,-0.013585,-0.021407,-0.084284
LotFrontage,-0.010601,-0.386347,1.0,0.426095,0.251646,-0.059213,0.123349,0.088866,0.193458,0.233633,...,0.088521,0.151972,0.0107,0.070029,0.041383,0.206167,0.003368,0.0112,0.00745,0.351799
LotArea,-0.033226,-0.139781,0.426095,1.0,0.105806,-0.005636,0.014228,0.013788,0.10416,0.214103,...,0.171698,0.084774,-0.01834,0.020423,0.04316,0.077672,0.038068,0.001205,-0.014261,0.263843
OverallQual,-0.028365,0.032628,0.251646,0.105806,1.0,-0.091932,0.572323,0.550684,0.411876,0.239666,...,0.238923,0.308819,-0.113937,0.030371,0.064886,0.065166,-0.031406,0.070815,-0.027347,0.790982
OverallCond,0.012609,-0.059316,-0.059213,-0.005636,-0.091932,1.0,-0.375983,0.073741,-0.128101,-0.046231,...,-0.003334,-0.032589,0.070356,0.025504,0.054811,-0.001985,0.068777,-0.003511,0.04395,-0.077856
YearBuilt,-0.012713,0.02785,0.123349,0.014228,0.572323,-0.375983,1.0,0.592855,0.315707,0.249503,...,0.22488,0.188686,-0.387268,0.031355,-0.050364,0.00495,-0.034383,0.012398,-0.013618,0.522897
YearRemodAdd,-0.021998,0.040581,0.088866,0.013788,0.550684,0.073741,0.592855,1.0,0.179618,0.128451,...,0.205726,0.226298,-0.193919,0.045286,-0.03874,0.005829,-0.010286,0.02149,0.035743,0.507101
MasVnrArea,-0.050298,0.022936,0.193458,0.10416,0.411876,-0.128101,0.315707,0.179618,1.0,0.264736,...,0.159718,0.125703,-0.110204,0.018796,0.061466,0.011723,-0.029815,-0.005965,-0.008201,0.477493
BsmtFinSF1,-0.005024,-0.069836,0.233633,0.214103,0.239666,-0.046231,0.249503,0.128451,0.264736,1.0,...,0.204306,0.111761,-0.102303,0.026451,0.062021,0.140491,0.003571,-0.015727,0.014359,0.38642


In [5]:
# ref: # ref: https://towardsdatascience.com/iterative-imputation-with-scikit-learn-8f3eb22b1a38

categorical_columns = ['MSZoning', 'Street', 'Alley', 'LotShape', 'BsmtFinType2', 'Heating','HeatingQC','CentralAir','Electrical','KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond','PavedDrive', 'PoolQC','Fence','MiscFeature','SaleType','SaleCondition','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle', 'RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1' ]



numeric_columns = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageYrBlt','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch' , '3SsnPorch','ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']   

    
X = train_df.drop(columns=["SalePrice"])
y = train_df["SalePrice"]

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=42 )

#Create imputers and scalers: Create imputers to handle missing values and scalers to standardize numeric features.

imputer = SimpleImputer(strategy='median')
numeric_transformer = Pipeline(steps=[('imputer', imputer), ('scaler', StandardScaler())])

#Create a pipeline for categorical features: Create a pipeline to one-hot encode categorical features and select the top 50% of features based on chi-squared test.
cat_pipeline = Pipeline(steps=[('impute', SimpleImputer(strategy='most_frequent')), ('one_hot', OneHotEncoder(handle_unknown='ignore', sparse=False))])

# combined the pipelines
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_columns), ('cat', cat_pipeline, categorical_columns)])

# Fit and transform the data: Fit the preprocessor to your training data and transform both the training and testing datasets.

preprocessor.fit(X_train)
data_train_transformed1 = preprocessor.transform(X_train)
data_test_transformed1 = preprocessor.transform(X_test)

# Create and train the random forest regressor: Create a RandomForestRegressor and train it on the transformed training data.
rf_model = RandomForestRegressor()
rf_model.fit(data_train_transformed1, y_train)

#Evaluate and predict: Evaluate the model's performance on the transformed training data and use it to make predictions on the transformed testing data.
rf_model_score_trest3 = rf_model.score(data_train_transformed1, y_train)
predictions = rf_model.predict(data_test_transformed1)
rf_model_score_test4 = rf_model.score(data_test_transformed1, y_test)
print("Training score for Train_Dataset: ", rf_model_score_trest3)
print("---"*32)
print("Testing score for Train_Dataset: ", rf_model_score_test4)

Training score for Train_Dataset:  0.9768189059802519
------------------------------------------------------------------------------------------------
Testing score for Train_Dataset:  0.8911511700639638


In [6]:

categorical_columns = ['MSZoning', 'Street', 'Alley', 'LotShape', 'BsmtFinType2', 'Heating','HeatingQC','CentralAir','Electrical','KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond','PavedDrive', 'PoolQC','Fence','MiscFeature','SaleType','SaleCondition','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle', 'RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1' ]



numeric_columns = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageYrBlt','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch' , '3SsnPorch','ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']   

    
X = test_df.drop(columns=["SalePrice"])
y = test_df["SalePrice"]

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=42 )

#Create imputers and scalers: Create imputers to handle missing values and scalers to standardize numeric features.

imputer = SimpleImputer(strategy='median')
numeric_transformer = Pipeline(steps=[('imputer', imputer), ('scaler', StandardScaler())])

#Create a pipeline for categorical features: Create a pipeline to one-hot encode categorical features and select the top 50% of features based on chi-squared test.
cat_pipeline = Pipeline(steps=[('impute', SimpleImputer(strategy='most_frequent')), ('one_hot', OneHotEncoder(handle_unknown='ignore', sparse=False))])

# combined the pipelines
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_columns), ('cat', cat_pipeline, categorical_columns)])

# Fit and transform the data: Fit the preprocessor to your training data and transform both the training and testing datasets.

preprocessor.fit(X_train)
data_train_transformed = preprocessor.transform(X_train)
data_test_transformed = preprocessor.transform(X_test)

# Create and train the random forest regressor: Create a RandomForestRegressor and train it on the transformed training data.
rf_model = RandomForestRegressor()
rf_model.fit(data_train_transformed, y_train)

#Evaluate and predict: Evaluate the model's performance on the transformed training data and use it to make predictions on the transformed testing data.
rf_model_score_trest1 = rf_model.score(data_train_transformed, y_train)
predictions = rf_model.predict(data_test_transformed)
rf_model_score_test1 = rf_model.score(data_test_transformed, y_test)
print("Training score for Test_Dataset: ", rf_model_score_trest1)
print("---"*32)
print("Testing score for Test_Dataset: ", rf_model_score_test1)

Training score for Test_Dataset:  0.9789884829771611
------------------------------------------------------------------------------------------------
Testing score for Test_Dataset:  0.8949312459429893


# Conclusion: It is clearly seen that the test_dataset has better training and testing score as that of train_dataset's

In [7]:
Col0 = ["Training score", "Testing Score"]
Train_Dataset = [rf_model_score_trest3, rf_model_score_test4]
Test_Dataset = [rf_model_score_trest1, rf_model_score_test1]
data = {' ': Col0, 'Train_Dataset': Train_Dataset, 'Test_Dataset': Test_Dataset}
# Create DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,Unnamed: 1,Train_Dataset,Test_Dataset
0,Training score,0.976819,0.978988
1,Testing Score,0.891151,0.894931
