## This project is an implementation of the EDA done in my EDA-portfolio "house_price_analysis"

## Key Changes Made to the train data:

### Explicit Target Variable: Changed "SalePrice" to "log-transformed sale price (SalePrice_log)". This highlights the critical transformation performed.

### EDA Corrections: addressing initial issues of target variable skewness and heteroscedasticity. This fixed data problems.

### Methods used to find significant features: correlation analysis, t-tests, and ANOVA.

### Features: Included a few specific examples of "high-impact features" found.

### Dataset : cleaned and appropriately transformed dataset.


In [39]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split,KFold
import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.metrics import mean_squared_error, r2_score

In [40]:
df = pd.read_csv("processed_train.csv")
test = pd.read_csv('test.csv')
pd.set_option('display.max_rows',100)
pd.set_option('display.max_columns',100)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,GarageAge,SalePrice_log
0,1,60,RL,65.0,8450,Pave,no feature,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,no feature,Attchd,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,no feature,no feature,no feature,0,2,2008,WD,Normal,208500,5.0,12.247694
1,2,20,RL,80.0,9600,Pave,no feature,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,no feature,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,no feature,no feature,no feature,0,5,2007,WD,Normal,181500,31.0,12.109011
2,3,60,RL,68.0,11250,Pave,no feature,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,no feature,no feature,no feature,0,9,2008,WD,Normal,223500,7.0,12.317167
3,4,70,RL,60.0,9550,Pave,no feature,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,no feature,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,no feature,no feature,no feature,0,2,2006,WD,Abnorml,140000,8.0,11.849398
4,5,60,RL,84.0,14260,Pave,no feature,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,no feature,no feature,no feature,0,12,2008,WD,Normal,250000,8.0,12.429216


In [41]:
# re-do the same changes I did with train.csv in my EDA

miss_col = test.columns[test.isnull().sum() > 0]
for i in miss_col:
    # if the feature is numerical we raplace it by 0
    if pd.api.types.is_numeric_dtype(test[i]) == True :
        if i == "LotFrontage":
            test[i] = test[i].fillna(np.median(test[i]))
        elif i == "GarageYrBlt":
            # we will create a new features called garageAge so that the -1 value will mean no garage instead of Year 0
            test[i] = test[i].fillna(-1)
            # iterate trough each row the if/else condition and assign it to the new feature
            test["GarageAge"] = test.apply(lambda row: row["YrSold"] - row[i] if row[i] > 0 else -1,axis=1)
            test = test.drop("GarageYrBlt",axis='columns')
        else:
            test[i] = test[i].fillna(0)
    # the object features we add "no feature"
    else :
        test[i] = test[i].fillna("no feature")

In [42]:
for i in df.columns:
    if df[i].dtype == "object" :
        print(f"{i}: {df[i].unique()}")

MSZoning: ['RL' 'RM' 'C (all)' 'FV' 'RH']
Street: ['Pave' 'Grvl']
Alley: ['no feature' 'Grvl' 'Pave']
LotShape: ['Reg' 'IR1' 'IR2' 'IR3']
LandContour: ['Lvl' 'Bnk' 'Low' 'HLS']
Utilities: ['AllPub' 'NoSeWa']
LotConfig: ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
LandSlope: ['Gtl' 'Mod' 'Sev']
Neighborhood: ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']
Condition1: ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
Condition2: ['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']
BldgType: ['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']
HouseStyle: ['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']
RoofStyle: ['Gable' 'Hip' 'Gambrel' 'Other' 'Flat' 'Shed']
RoofMatl: ['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' 'Tar&Grv' 'Roll'
 'ClyTile']
Exterior1s

## one hot encoding

In [43]:
for i in df.columns:
    if df[i].dtype == "object" and set(df[i]).issubset(['Ex','Gd','TA','Fa','Po','no feature']):
        df[i] = df[i].replace({'Ex' : 5,'Gd' : 4,'TA' : 3,'Fa' : 2,'Po' : 1,'no feature' : 0})

  df[i] = df[i].replace({'Ex' : 5,'Gd' : 4,'TA' : 3,'Fa' : 2,'Po' : 1,'no feature' : 0})


In [44]:
for i in test.columns:
    if test[i].dtype == "object" and set(test[i]).issubset(['Ex','Gd','TA','Fa','Po','no feature']):
        test[i] = test[i].replace({'Ex' : 5,'Gd' : 4,'TA' : 3,'Fa' : 2,'Po' : 1,'no feature' : 0})

  test[i] = test[i].replace({'Ex' : 5,'Gd' : 4,'TA' : 3,'Fa' : 2,'Po' : 1,'no feature' : 0})


In [45]:
categorical_columns = df.select_dtypes(include='object').columns
# we drop the first column to reduce the size of the dataframe, if we have n categories we only need n-1 columns to represent it
df = pd.get_dummies(df,columns=categorical_columns,dtype=int,drop_first=True)

In [46]:
categorical_columns = test.select_dtypes(include='object').columns
# we drop the first column to reduce the size of the dataframe, if we have n categories we only need n-1 columns to represent it
test = pd.get_dummies(test,columns=categorical_columns,dtype=int,drop_first=True)

In [47]:
df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageCars,GarageArea,GarageQual,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,MiscVal,MoSold,YrSold,SalePrice,GarageAge,SalePrice_log,MSZoning_FV,MSZoning_RH,...,Heating_Wall,CentralAir_Y,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_Other,Electrical_SBrkr,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_no feature,GarageFinish_RFn,GarageFinish_Unf,GarageFinish_no feature,GarageCond_Fa,GarageCond_Other,GarageCond_Po,GarageCond_TA,GarageCond_no feature,PavedDrive_P,PavedDrive_Y,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_no feature,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,MiscFeature_no feature,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_Other,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,4,3,4,3,706,0,150,856,5,856,854,0,1710,1,0,2,1,3,1,4,8,0,0,2,548,3,0,61,0,0,0,0,0,0,2,2008,208500,5.0,12.247694,0,0,...,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,3,3,4,3,978,0,284,1262,5,1262,0,0,1262,0,1,2,0,3,1,3,6,1,3,2,460,3,298,0,0,0,0,0,0,0,5,2007,181500,31.0,12.109011,0,0,...,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,4,3,4,3,486,0,434,920,5,920,866,0,1786,1,0,2,1,3,1,4,6,1,3,2,608,3,0,42,0,0,0,0,0,0,9,2008,223500,7.0,12.317167,0,0,...,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,3,3,3,4,216,0,540,756,4,961,756,0,1717,1,0,1,0,3,1,4,7,1,4,3,642,3,0,35,272,0,0,0,0,0,2,2006,140000,8.0,11.849398,0,0,...,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,4,3,4,3,655,0,490,1145,5,1145,1053,0,2198,1,0,2,1,4,1,4,9,1,3,3,836,3,192,84,0,0,0,0,0,0,12,2008,250000,8.0,12.429216,0,0,...,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0


In [48]:
# check if there is any Non-numeric columns
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        print(f"Non-numeric column: {col}")


In [49]:
x = df.drop(['SalePrice','SalePrice_log'],axis=1)
y = df['SalePrice_log']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [50]:
model = RandomForestRegressor()
param_dist = {'n_estimators': randint(50,1000),
              'max_depth': randint(1,40),
              'min_samples_split': randint(2, 10),
              'min_samples_leaf': randint(1, 5)}
random_search=RandomizedSearchCV(model,param_distributions=param_dist,scoring='neg_mean_squared_error',n_iter=50,cv=5,random_state=42,n_jobs=-1)
random_search.fit(x_train,y_train)
best_model=random_search.best_estimator_
#It will score them based on negative MSE (since lower MSE is better, but sklearn's scorer needs to maximize something)
#Added min_samples_split and min_samples_leaf — both control overfitting and improve tree generalization.

In [51]:
prediction = best_model.predict(x_test)

In [52]:
# Evaluate the model
mse = mean_squared_error(y_test, prediction)
r2 = r2_score(y_test, prediction)

print(f'Mean Squared Error (MSE): {mse}')
print(f'R^2 Score: {r2}')
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")

Mean Squared Error (MSE): 0.023415500670743678
R^2 Score: 0.8796776847125465
RMSE: 0.15302124254737862


Mean Squared Error (MSE): 672533587.4205245
R^2 Score: 0.9099445343017578
RMSE: 25933.252542257873

In [53]:
test = test.reindex(columns=x_train.columns, fill_value=0)

1797406887.930039

In [54]:
price_prediction_log = best_model.predict(test)
# the model predict on the log scale
price_prediction = np.exp(price_prediction_log)
test['SalePrice'] = price_prediction 

In [55]:
submission = test[['Id', 'SalePrice']]
submission.to_csv("final_submission_house_pre_random_forest.csv", index=False)
