In [None]:
import pandas as pd
import pandas_profiling as pp

import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np

import itertools

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge

import lightgbm as lgb

# Data exploration

In [None]:
# Load the Ames housing data and keep only the ten predictors used in this
# notebook plus the SalePrice column.
selected_columns = [
    "LotArea", "OverallQual", "OverallCond", "YearBuilt", "TotalBsmtSF",
    "1stFlrSF", "2ndFlrSF", "Fireplaces", "GarageCars", "OpenPorchSF",
    "SalePrice",
]
df = pd.read_csv("data/ames.csv")[selected_columns]

In [None]:
# Peek at the first rows of the loaded frame.
df.head()

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics for every column of the frame.
df.describe()

In [None]:
# Histogram of the SalePrice column. The figure is labelled so it can be read
# on its own, and the trailing semicolon keeps the return value of `set` out
# of the cell output.
ax = df["SalePrice"].hist()
ax.set(title="Distribution of SalePrice", xlabel="SalePrice", ylabel="Count");

In [None]:
# Clustered heatmap of the pairwise correlations between all columns.
correlations = df.corr()
sns.clustermap(correlations);

In [None]:
# Bar plot of SalePrice against GarageCars, drawn as one panel per Fireplaces
# value with two panels per row.
sns.catplot(
    x="GarageCars",
    y="SalePrice",
    col="Fireplaces",
    col_wrap=2,
    kind="bar",
    data=df,
);

In [None]:
# Build the profiling report for the frame and write it to report.html.
profile = pp.ProfileReport(df)
profile.to_file(output_file="report.html")

# Regression task

In [None]:
# SalePrice is the regression target; every other column is a feature.
Y = df["SalePrice"]
X = df.drop(columns="SalePrice")

## Train/test split

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.25,
    random_state=12,
)

In [None]:
# Row counts: full data set, training part, test part.
tuple(len(part) for part in (X, X_train, Y_test))

## Approach 0: constant model

In [None]:
# Constant baseline: predict the training-set mean of the target for every
# test row, then show the first three predictions.
train_mean_price = Y_train.mean()
Y_pred_const = np.full(Y_test.shape, train_mean_price)
Y_pred_const[:3]

In [None]:
# Root mean squared error of the constant baseline on the test set.
mse_const = mean_squared_error(Y_test, Y_pred_const)
np.sqrt(mse_const)

In [None]:
# R^2 of the constant baseline on the test set.
r2_score(y_true=Y_test, y_pred=Y_pred_const)

In [None]:
def plot_predictions(Y_true, Y_predicted, val_min=0, val_max=1000000):
    """Draw a joint plot of true vs. predicted values with a y = x reference line.

    Parameters
    ----------
    Y_true : array-like
        Observed target values.
    Y_predicted : array-like
        Predicted values, in the same order as ``Y_true``.
    val_min, val_max : number, optional
        Lower and upper limit shared by both axes; they are also the end
        points of the dotted diagonal. The defaults reproduce the fixed
        0 .. 1,000,000 window this function has always used.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    # Flatten both inputs to plain arrays so values are paired by position.
    # Building the frame straight from a Series and an array (or two Series)
    # would align on the Series index instead.
    df_pred = pd.DataFrame({
        "true": np.ravel(Y_true),
        "predicted": np.ravel(Y_predicted),
    })

    # x / y / data are passed by keyword: recent seaborn releases no longer
    # accept them positionally. `height` gives the 8x8 inch square figure
    # without going through the deprecated `.fig` attribute of the grid.
    grid = sns.jointplot(x="true", y="predicted", data=df_pred,
                         xlim=(val_min, val_max),
                         ylim=(val_min, val_max),
                         height=8)

    # Dotted black diagonal: points on this line are perfect predictions.
    grid.ax_joint.plot([val_min, val_max], [val_min, val_max], ':k')

In [None]:
# True vs. predicted prices for the constant baseline.
plot_predictions(Y_true=Y_test, Y_predicted=Y_pred_const)

## Approach 1: linear regression model

### Preprocessing

In [None]:
# Work on copies so the feature engineering for the linear model does not
# touch the original train/test frames.
X_train_lr, X_test_lr = X_train.copy(), X_test.copy()

In [None]:
# Put the two area features on a base-10 log scale, applying exactly the same
# transform to the training and the test frame.
for frame in (X_train_lr, X_test_lr):
    frame["LotArea"] = np.log10(frame["LotArea"])

    # Non-positive OpenPorchSF values must map to 0 rather than -inf/NaN.
    # Substituting 1 before taking the log does that, because log10(1) == 0,
    # and keeps the whole operation vectorized.
    porch = frame["OpenPorchSF"]
    frame["OpenPorchSF"] = np.log10(porch.where(porch > 0, 1))

In [None]:
# Second-degree feature expansion. `columns` is captured before any column is
# added, so only the base features are combined.
columns = X_train_lr.columns

for frame in (X_train_lr, X_test_lr):
    # Pairwise products of distinct base features.
    for left, right in itertools.combinations(columns, 2):
        frame['_x_'.join((left, right))] = frame[left] * frame[right]

    # Squares of the base features.
    for name in columns:
        frame[name + '^2'] = frame[name] ** 2

In [None]:
# Standardize every feature using statistics from the training frame only.
# The statistics are computed once, up front, so the result no longer depends
# on scaling the test frame before the training frame is overwritten.
train_mean = X_train_lr.mean()
# A zero-variance column would become NaN/inf after the division; dividing by
# 1 instead leaves such a column centered but unscaled.
train_std = X_train_lr.std().replace(0, 1)

X_test_lr = (X_test_lr - train_mean) / train_std
X_train_lr = (X_train_lr - train_mean) / train_std

In [None]:
# Inspect the first rows of the engineered training features.
X_train_lr.head()

### Training

In [None]:
# Ridge regressor with the library's default settings; only the random seed
# is fixed.
reg = Ridge(random_state=12)

In [None]:
# Fit the ridge model on the engineered training features.
reg.fit(X=X_train_lr, y=Y_train)

In [None]:
# Predict on the test features that went through the same preprocessing.
Y_pred_lr = reg.predict(X=X_test_lr)

In [None]:
# Root mean squared error of the ridge model on the test set.
mse_lr = mean_squared_error(Y_test, Y_pred_lr)
np.sqrt(mse_lr)

In [None]:
# R^2 of the ridge model on the test set.
r2_score(y_true=Y_test, y_pred=Y_pred_lr)

In [None]:
# True vs. predicted prices for the ridge model.
plot_predictions(Y_true=Y_test, Y_predicted=Y_pred_lr)

## Approach 2: LightGBM

### No preprocessing, just training

In [None]:
# LightGBM regressor with the library's default settings; only the random
# seed is fixed.
lgb_model = lgb.LGBMRegressor(random_state=12)

In [None]:
# Fit on the raw (unprocessed) training features.
lgb_model.fit(X=X_train, y=Y_train)

In [None]:
# Predict on the raw test features.
Y_pred_lgb = lgb_model.predict(X=X_test)

In [None]:
# Root mean squared error of the LightGBM model on the test set.
mse_lgb = mean_squared_error(Y_test, Y_pred_lgb)
np.sqrt(mse_lgb)

In [None]:
# R^2 of the LightGBM model on the test set.
r2_score(y_true=Y_test, y_pred=Y_pred_lgb)

In [None]:
# True vs. predicted prices for the LightGBM model.
plot_predictions(Y_true=Y_test, Y_predicted=Y_pred_lgb)