In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# Load the raw listings table; the path is relative to the notebook's working directory.
csv_path = '../Data/HouseImformation.csv'
df = pd.read_csv(csv_path)

In [3]:
# --- Clean the table, drop outliers, and add the per-address mean price feature ---

# 'Area' is parsed from text by stripping ',' characters. The dtype guard lets this
# line also run when the column is already numeric (e.g. the cell is executed again),
# where the `.str` accessor would otherwise raise.
area = df['Area']
if not pd.api.types.is_numeric_dtype(area):
    area = area.str.replace(',', '', regex=False)
df['Area'] = area.astype(float)

# Rows with any missing value are discarded; .copy() gives an independent frame so the
# column assignments below never write into a view of the loaded data.
df = df.dropna().copy()

# Flag roughly 1 % of rows as (Area, Price) outliers. The fixed random_state makes the
# set of flagged rows, and therefore every downstream score, repeatable between runs.
isoOuter = IsolationForest(contamination=0.01, random_state=42)
df['OuterData'] = isoOuter.fit_predict(df[["Area", "Price"]])

# fit_predict labels outliers with -1; keep everything else.
df = df[df['OuterData'] != -1].copy()

# Mean price of the remaining rows for each address, attached back to every row.
# NOTE(review): this average is computed from 'Price' over ALL rows, before any
# train/test split or cross-validation, so held-out rows contribute to their own
# feature value. Scores built on 'MeanPriceA' are therefore likely optimistic;
# computing it from training rows only would remove that leakage.
MeanPriceAddressOuter = df.groupby("Address")["Price"].mean()
df["MeanPriceA"] = df["Address"].map(MeanPriceAddressOuter)

In [28]:
from sklearn.preprocessing import StandardScaler

# Build the full-data design matrix (standardised predictors) and target vector
# that the cross-validation cells consume.
feature_frame = df[['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'MeanPriceA']]

# The scaler's statistics are learned from every row of the frame.
scaler = StandardScaler()
scaler.fit(feature_frame)
x_Data = scaler.transform(feature_frame)

y_Data = df['Price'].to_numpy()


In [38]:
# --- Hold-out split: each row goes to train with probability 0.8, otherwise to test ---

# A seeded generator keeps the row assignment identical across kernel restarts, so the
# hold-out scores reported further down can be reproduced.
split_rng = np.random.default_rng(42)
mask = split_rng.random(len(df)) < 0.8

train = df[mask]
test = df[~mask]

# Predictor columns, listed once so the train and test matrices cannot drift apart.
feature_cols = ['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'MeanPriceA']

# These matrices hold the raw (unscaled) column values taken straight from df.
x_train = train[feature_cols].to_numpy()
y_train = train['Price'].to_numpy()

x_test = test[feature_cols].to_numpy()
y_test = test['Price'].to_numpy()

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold, cross_val_score

# Baseline: a depth-limited random forest scored with 10-fold cross-validation
# repeated 3 times (30 scores in total) on the full-data matrices.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

crossVal = cross_val_score(estimator=model, X=x_Data, y=y_Data, cv=cv)

# Summarise the per-fold scores (the regressor's default scorer is R^2).
print("Mean Is: ", crossVal.mean())
print("Standard deviation: ", crossVal.std())


Mean Is:  0.869367465078965
Standard deviation:  0.024110795426355896


In [39]:
from sklearn.metrics import r2_score

# Fit the baseline forest on the training rows, then score it on the held-out rows.
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

test_r2 = r2_score(y_test, y_predict)
print(f"Test R2 Score: {test_r2:.2f}")

Test R2 Score: 0.87


In [40]:
# --- Alternative model: standardise -> degree-2 polynomial expansion -> random forest ---
# The forest gets a fixed random_state (matching the baseline `model`) so that these
# cross-validation numbers and the later hold-out score are repeatable between runs.
# Step names are part of the pipeline's interface and are kept as they were.
pip = Pipeline([
    ('scale', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('RandomForestRegressor', RandomForestRegressor(random_state=42)),
])

# Same resampling scheme as the baseline so the two sets of scores are comparable.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# NOTE(review): x_Data was already standardised when it was built, so the 'scale'
# step standardises it a second time inside each fold.
crossVal = cross_val_score(pip, x_Data, y_Data, cv=cv)

print("Mean Is: ", crossVal.mean())
print("Standard deviation: ", np.std(crossVal))


Mean Is:  0.8496215762491919
Standard deviation:  0.03365605190966061


In [41]:
from sklearn.metrics import r2_score

# Fit the polynomial pipeline on the training rows, then score it on the held-out rows.
pip.fit(x_train, y_train)
y_predictR = pip.predict(x_test)

pipeline_test_r2 = r2_score(y_test, y_predictR)
print(f"Test R2 Score: {pipeline_test_r2:.2f}")

Test R2 Score: 0.85
