In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# Load the house-price table from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/chicago-house-price/realest.csv")

In [None]:
# Preview the first ten rows.
df.head(n=10)

In [None]:
# Number of missing values in each column, before imputation.
df.isna().sum(axis=0)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

# Replacing NaN values using SimpleImputer

Missing values in the first 8 columns are replaced with the column mean; missing values in the last column (`Condition`) are replaced with its most frequent value.

In [None]:
# Mean-impute every column except the last one.
imp = SimpleImputer(strategy='mean')
df.iloc[:, :-1] = imp.fit_transform(df.iloc[:, :-1])

# Fill the remaining gaps in 'Condition' with its most frequent value.
# The filled column is assigned back instead of calling
# `df['Condition'].fillna(..., inplace=True)`: that form mutates a temporary
# column selection, is deprecated in pandas 2.x, and leaves `df` unchanged
# when Copy-on-Write is enabled.
most_frequent_condition = df['Condition'].value_counts().index[0]
df['Condition'] = df['Condition'].fillna(most_frequent_condition)

In [None]:
# Re-count missing values per column after imputation.
df.isna().sum(axis=0)

In [None]:
# Share of rows per 'Condition' value, with each wedge labelled by its percentage.
condition_counts = df['Condition'].value_counts()
plt.pie(condition_counts,
        labels=condition_counts.index,
        autopct='%0.2f%%')

In [None]:
def plots(df, x, y='Condition'):
    """Draw three side-by-side views of column ``x`` and show the figure.

    Panels, left to right:
      1. histogram of ``x`` with a KDE curve,
      2. the same histogram coloured by the values of ``y``,
      3. bar chart of the mean of ``x`` within each ``y`` group, with every
         bar labelled by its height.

    Parameters
    ----------
    df : DataFrame holding both columns.
    x : name of the column to plot.
    y : name of the grouping column (default ``'Condition'``).
    """
    grouped = df.groupby(y)
    fig, (ax_all, ax_by_group, ax_means) = plt.subplots(nrows=1, ncols=3, figsize=(20, 10))

    sns.histplot(df, x=x, kde=True, ax=ax_all)
    sns.histplot(df, x=x, hue=y, kde=True, ax=ax_by_group)

    # Compute the per-group means once and reuse them for both bar-plot axes.
    group_means = grouped[x].mean()
    sns.barplot(x=group_means.index, y=group_means, ax=ax_means)

    for bars in ax_means.containers:
        ax_means.bar_label(bars, size=15, color='black')

    plt.suptitle("{} grouped by {}".format(x, y), size=20)
    plt.tight_layout()
    plt.show()

# Histograms and barplots grouped by Condition for each numerical attribute

General histplots to view the data distribution

In [None]:
# One three-panel figure per column, skipping the last column (the grouping variable).
for column in df.columns[:-1]:
    plots(df, column)

# Checking for outliers using a boxplot

In [None]:
# Box plot of 'Price' to spot outliers.
sns.boxplot(data=df, x='Price')

In [None]:
# Pairwise scatter matrix of every column except the last, coloured by 'Condition'.
sns.pairplot(data=df, hue='Condition', vars=df.columns[:-1])

In [None]:
# Target: the house price. Features: every other column.
# The previous code took column index 1 as the target, but that column is
# also the first column of `x = df.iloc[:, 1:]`, so the target was one of the
# features and 'Price' was never predicted. Selecting by name keeps the
# target out of the feature matrix.
x = df.drop(columns='Price').values
y = df['Price'].values

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Training and evaluation pipeline

In [None]:
def training(model, name):
    """Fit ``model`` on the training split, print its test-set scores, return it.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    name : label printed above the scores.

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    # Order matches the placeholders in the report template: MAE, MSE, r^2.
    scores = (
        mean_absolute_error(y_test, predictions),
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )
    print("{}\nMAE:{}\nMSE:{}\nr^2:{}".format(name, *scores))
    print()
    return model

# Defining the models and setting their hyperparameters

In [None]:
# Candidate regressors. Every estimator that accepts `random_state` gets the
# same seed as the train/test split (42) so the reported scores are
# reproducible across runs. LinearRegression, SVR and KNeighborsRegressor
# take no seed.
rfr = RandomForestRegressor(criterion='friedman_mse', max_depth=30, random_state=42)
dtr = DecisionTreeRegressor(criterion='squared_error', max_depth=30, random_state=42)
lnr = LinearRegression()
abr = AdaBoostRegressor(learning_rate=0.1, random_state=42)
svr = SVR(C=0.1)
knr = KNeighborsRegressor(n_neighbors=5)
xgb = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7,
                   colsample_bytree=0.8, random_state=42)

# `models` and `names` are parallel lists: names[k] is the label for models[k].
models = [rfr, dtr, lnr, abr, svr, knr, xgb]
names = ['Random Forest', 'Decision Tree', 'Linear Regression', 'Ada Boost', 
        'SVM', 'KNeighbors', 'XGB']

# Results

In [None]:
# Fit and score every model in turn, keeping the fitted estimators in order.
trained = [training(estimator, label) for estimator, label in zip(models, names)]