In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

In [2]:
df=pd.read_csv("../Data/HouseImformation.csv")

In [3]:
df['Area'] = df['Area'].str.replace(',', '').astype(float)

df=df.dropna()

isoOuter=IsolationForest(contamination=0.01)
df['OuterData']=isoOuter.fit_predict(df[["Area", "Price"]])

df=df.drop(index=df[df['OuterData'] == -1].index)

MeanPriceAddressOuter=df.groupby("Address")["Price"].mean()
df["MeanPriceA"] = df["Address"].map(MeanPriceAddressOuter)

In [None]:
mask= np.random.rand(len(df)) < 0.8

train=df[mask]
test=df[~mask]

x_train=np.asanyarray(train[['Area','Room','Parking','Warehouse','Elevator','MeanPriceA']])
y_train=np.asanyarray(train[['Price']])

x_test=np.asanyarray(test[['Area','Room','Parking','Warehouse','Elevator','MeanPriceA']])
y_test=np.asanyarray(test[['Price']])

In [7]:
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('scale' , StandardScaler()),
    ('ridge', Ridge())
])

param_grid = {
    'ridge__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
}
#param_grid = {
#    'ridge__alpha': [0.1]
#}
grid = GridSearchCV(pipeline, param_grid, cv=5)
grid.fit(x_train, y_train)

print("Best alpha:", grid.best_params_)

y_predict=grid.predict(x_test)

print("R2-score: %.2f" % r2_score(y_test , y_predict))

Best alpha: {'ridge__alpha': 10.0}
R2-score: 0.80


In [8]:
joblib.dump(grid, '../Models/Ridge&PolyRegressionModel.pkl')

['../Models/Ridge&PolyRegressionModel.pkl']