# `Training Data`

# Importing Necessary Modules

In [26]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, r2_score, root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import numpy as np

# Loading Dataset

In [2]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


# Exploration

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
df.columns.to_list

<bound method IndexOpsMixin.tolist of Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQ

In [5]:
def DroppingColumns(df):
       dropped_col_df = df.copy()
       dropped_col_df.drop(columns= ['Id', 'Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC',
              'Fence', 'MiscFeature', 'LotArea'], inplace= True)
       
       return dropped_col_df

In [6]:
df.nunique()

Id               1460
MSSubClass         15
MSZoning            5
LotFrontage       110
LotArea          1073
                 ... 
MoSold             12
YrSold              5
SaleType            9
SaleCondition       6
SalePrice         663
Length: 81, dtype: int64

# preprocessing

In [7]:
def Imputer(df):
    imputed_df = df.copy()
    for i in imputed_df.columns:
        if imputed_df[i].dtype == 'object':
            imputed_df[i] = imputed_df[i].fillna(imputed_df[i].mode()[0])
        else:
            imputed_df[i] = imputed_df[i].fillna(imputed_df[i].mean())

    return imputed_df

    

def Encoder(df):
    encoded_df = df.copy()
    LE = LabelEncoder()
    for i in encoded_df.columns:
        if df[i].dtype == 'object':
            encoded_df[i] = LE.fit_transform(encoded_df[i])

    return encoded_df

def Normalizer(df):
    scalar = MinMaxScaler()
    scaled_df = pd.DataFrame(scalar.fit_transform(df), columns= df.columns)

    return scaled_df


df_imputed = Imputer(DroppingColumns(df))
X = df_imputed.drop(columns= ['SalePrice'])
Y = df_imputed['SalePrice']
normalized_df = Normalizer(Encoder(X))
normalized_df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.235294,0.75,0.150685,1.0,1.0,1.0,0.0,1.0,0.0,0.208333,...,0.111517,0.000000,0.0,0.0,0.0,0.00000,0.090909,0.50,1.0,0.8
1,0.000000,0.75,0.202055,1.0,1.0,1.0,0.0,0.5,0.0,1.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.00000,0.363636,0.25,1.0,0.8
2,0.235294,0.75,0.160959,1.0,0.0,1.0,0.0,1.0,0.0,0.208333,...,0.076782,0.000000,0.0,0.0,0.0,0.00000,0.727273,0.50,1.0,0.8
3,0.294118,0.75,0.133562,1.0,0.0,1.0,0.0,0.0,0.0,0.250000,...,0.063985,0.492754,0.0,0.0,0.0,0.00000,0.090909,0.00,1.0,0.0
4,0.235294,0.75,0.215753,1.0,0.0,1.0,0.0,0.5,0.0,0.625000,...,0.153565,0.000000,0.0,0.0,0.0,0.00000,1.000000,0.50,1.0,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.235294,0.75,0.140411,1.0,1.0,1.0,0.0,1.0,0.0,0.333333,...,0.073126,0.000000,0.0,0.0,0.0,0.00000,0.636364,0.25,1.0,0.8
1456,0.000000,0.75,0.219178,1.0,1.0,1.0,0.0,1.0,0.0,0.583333,...,0.000000,0.000000,0.0,0.0,0.0,0.00000,0.090909,1.00,1.0,0.8
1457,0.294118,0.75,0.154110,1.0,1.0,1.0,0.0,1.0,0.0,0.250000,...,0.109689,0.000000,0.0,0.0,0.0,0.16129,0.363636,1.00,1.0,0.8
1458,0.000000,0.75,0.160959,1.0,1.0,1.0,0.0,1.0,0.0,0.500000,...,0.000000,0.202899,0.0,0.0,0.0,0.00000,0.272727,1.00,1.0,0.8


In [8]:
normalized_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 72 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   float64
 1   MSZoning       1460 non-null   float64
 2   LotFrontage    1460 non-null   float64
 3   Street         1460 non-null   float64
 4   LotShape       1460 non-null   float64
 5   LandContour    1460 non-null   float64
 6   Utilities      1460 non-null   float64
 7   LotConfig      1460 non-null   float64
 8   LandSlope      1460 non-null   float64
 9   Neighborhood   1460 non-null   float64
 10  Condition1     1460 non-null   float64
 11  Condition2     1460 non-null   float64
 12  BldgType       1460 non-null   float64
 13  HouseStyle     1460 non-null   float64
 14  OverallQual    1460 non-null   float64
 15  OverallCond    1460 non-null   float64
 16  YearBuilt      1460 non-null   float64
 17  YearRemodAdd   1460 non-null   float64
 18  RoofStyl

In [9]:
Y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1460 entries, 0 to 1459
Series name: SalePrice
Non-Null Count  Dtype
--------------  -----
1460 non-null   int64
dtypes: int64(1)
memory usage: 11.5 KB


# Grid Search

In [10]:
# param_grid = {
#     'n_estimators': [100, 300, 500, 700],
#     'max_depth': [10, 20, 30, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4, 6],
#     'max_features': ['sqrt', 'log2']
# }

# grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, scoring='r2')
# grid_search.fit(normalized_df, Y)
# print("Best Parameters:", grid_search.best_params_)

# model training

In [11]:
RFR = RandomForestRegressor(random_state= 42, n_estimators= 100, max_depth= 20, min_samples_split= 2, min_samples_leaf= 1, max_features= 'sqrt')
RFR.fit(normalized_df, Y)

# Training data

In [12]:
training_prediction = RFR.predict(normalized_df)
training_accuracy = root_mean_squared_error(Y, training_prediction)

In [13]:
mean = Y.mean()

In [14]:
print("the accuracy is:",int(100 - ((training_accuracy*100)/mean)))

the accuracy is: 93


# `Testing Data`

In [15]:
test_df = pd.read_csv('test.csv')
test_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [16]:
test_target = pd.read_csv('sample_submission.csv')
test_target

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.683570
3,1464,179317.477511
4,1465,150730.079977
...,...,...
1454,2915,167081.220949
1455,2916,164788.778231
1456,2917,219222.423400
1457,2918,184924.279659


In [17]:
con_test_df = pd.merge(test_df,test_target, on= 'Id')
con_test_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,169277.052498
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,187758.393989
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,183583.683570
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,179317.477511
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,,,,0,1,2010,WD,Normal,150730.079977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,167081.220949
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,164788.778231
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,219222.423400
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,184924.279659


In [18]:
test_df_imputed = Imputer(DroppingColumns(con_test_df))
test_X = test_df_imputed.drop(columns= ['SalePrice'])
test_Y = test_df_imputed['SalePrice']
normalized_test_df = Normalizer(Encoder(test_X))
normalized_test_df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.000000,0.50,0.329609,1.0,1.0,1.000000,0.0,1.0,0.0,0.500000,...,0.000000,0.0,0.0,0.208333,0.0,0.000000,0.454545,1.0,1.0,0.8
1,0.000000,0.75,0.335196,1.0,0.0,1.000000,0.0,0.0,0.0,0.500000,...,0.048518,0.0,0.0,0.000000,0.0,0.735294,0.454545,1.0,1.0,0.8
2,0.235294,0.75,0.296089,1.0,0.0,1.000000,0.0,1.0,0.0,0.333333,...,0.045822,0.0,0.0,0.000000,0.0,0.000000,0.181818,1.0,1.0,0.8
3,0.235294,0.75,0.318436,1.0,0.0,1.000000,0.0,1.0,0.0,0.333333,...,0.048518,0.0,0.0,0.000000,0.0,0.000000,0.454545,1.0,1.0,0.8
4,0.588235,0.75,0.122905,1.0,0.0,0.333333,0.0,1.0,0.0,0.916667,...,0.110512,0.0,0.0,0.250000,0.0,0.000000,0.000000,1.0,1.0,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.823529,1.00,0.000000,1.0,1.0,1.000000,0.0,1.0,0.0,0.416667,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.454545,0.0,1.0,0.8
1455,0.823529,1.00,0.000000,1.0,1.0,1.000000,0.0,1.0,0.0,0.416667,...,0.032345,0.0,0.0,0.000000,0.0,0.000000,0.272727,0.0,1.0,0.0
1456,0.000000,0.75,0.776536,1.0,1.0,1.000000,0.0,1.0,0.0,0.458333,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.727273,0.0,1.0,0.0
1457,0.382353,0.75,0.229050,1.0,1.0,1.000000,0.0,1.0,0.0,0.458333,...,0.043127,0.0,0.0,0.000000,0.0,0.041176,0.545455,0.0,1.0,0.8


In [19]:
test_prediction = RFR.predict(normalized_test_df)

In [20]:
test_accuracy = root_mean_squared_error(test_Y, test_prediction)
print("root mean square error is:",test_accuracy)

root mean square error is: 61317.1698993949


In [21]:
test_mean = test_Y.mean()
test_mean

179183.91824266256

In [22]:
print("the accuracy is:",(100 - int((test_accuracy*100)/ test_mean)))

the accuracy is: 66


In [25]:
test_prediction

array([128718.98      , 158151.73      , 181126.65      , ...,
       162053.02222222, 126234.96428571, 228844.77      ])

In [33]:
test_submission = pd.DataFrame({'Id': np.array(con_test_df['Id']),'SalePrice': test_prediction})
with open('test_submission.csv', 'w', newline= '') as file:
    test_submission.to_csv(file, index= False)