# Setup

In [3]:
import itertools
from statistics import mean

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Load the CSV that the rest of the notebook analyses and fits on.
train_csv_path = 'train.csv'
home_data = pd.read_csv(train_csv_path)

# Inspection and Cleaning

In [None]:
def cat_inspect(col: str):
    """Summarise one column of ``home_data`` against ``SalePrice``.

    Prints the number of rows per distinct value of ``col``, then the mean
    ``SalePrice`` per distinct value, and finally shows a boxplot of
    ``SalePrice`` grouped by ``col``.

    Args:
        col: Name of the ``home_data`` column to inspect.
    """
    value_frequencies = home_data[col].value_counts()
    print(value_frequencies)

    mean_price_per_value = home_data.groupby(col)['SalePrice'].mean()
    print(mean_price_per_value)

    sns.boxplot(data=home_data, x=col, y='SalePrice')
    plt.show()
    # Clear the current figure after it has been shown.
    plt.clf()

# Overall mean sale price; the feature-engineering cell compares each
# neighbourhood's mean price against it.
mean_SalePrice = home_data['SalePrice'].mean()

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection. The in-place form is chained assignment: it emits a
# FutureWarning on pandas 2.x and does not modify home_data at all once
# Copy-on-Write is active. Assignment is also safe to re-run.
home_data['BsmtQual'] = home_data['BsmtQual'].fillna('NA')
home_data['LotFrontage'] = home_data['LotFrontage'].fillna(0)

In [None]:
# Value counts, mean SalePrice per value, and boxplot for MSZoning.
cat_inspect(col='MSZoning')

In [None]:
# Compare OverallQual and OverallCond against SalePrice and against each other.
rating_cols = ['OverallQual', 'OverallCond']

# Rows per rating value, for each rating column.
for rating_col in rating_cols:
    print(home_data[rating_col].value_counts())

# Mean SalePrice per rating value, for each rating column.
for rating_col in rating_cols:
    print(home_data.groupby(rating_col)['SalePrice'].mean())

sns.scatterplot(data=home_data, x='OverallQual', y='SalePrice')
plt.show(); plt.clf()

# Pearson r of each rating column with SalePrice ...
for rating_col in rating_cols:
    print(pearsonr(home_data[rating_col], home_data['SalePrice'])[0])

# ... and of the two rating columns with each other.
print(pearsonr(home_data['OverallQual'], home_data['OverallCond'])[0])

sns.boxplot(data=home_data, x='OverallCond', y='SalePrice')
# Show and clear this figure like the scatterplot above; without it the cell
# ends on the Axes object (its repr is echoed) and the figure stays current.
plt.show(); plt.clf()


In [None]:
# How YearRemodAdd relates to YearBuilt and to SalePrice.
print(home_data['YearRemodAdd'].describe())

remod_vs_built = pearsonr(home_data['YearRemodAdd'], home_data['YearBuilt'])
print(remod_vs_built[0])
sns.scatterplot(data=home_data, x='YearBuilt', y='YearRemodAdd')
plt.show()
plt.clf()

# Rows whose YearRemodAdd differs from YearBuilt.
remodeled_homes = home_data.loc[home_data['YearBuilt'] != home_data['YearRemodAdd']]
print(remodeled_homes['Id'].count())

sns.scatterplot(data=remodeled_homes, x='YearRemodAdd', y='SalePrice')
plt.show()
plt.clf()

# Pearson r of each year column with SalePrice.
for year_col in ('YearRemodAdd', 'YearBuilt'):
    print(pearsonr(home_data[year_col], home_data['SalePrice'])[0])

# Mean SalePrice for rows with and without a differing remodel year.
remodel = pd.Series(np.where(home_data['YearBuilt'] != home_data['YearRemodAdd'], True, False))
price_with_flag = pd.concat([home_data['SalePrice'], remodel], axis=1)
print(price_with_flag.groupby(remodel)['SalePrice'].mean())

In [None]:
# Pairwise correlations between SalePrice and the listed columns. The matrix
# is computed once and reused for both the printout and the heatmap.
corr_cols = ['SalePrice', 'LotArea', '1stFlrSF', '2ndFlrSF', 'GrLivArea']
corr_matrix = home_data[corr_cols].corr()
print(corr_matrix)

sns.heatmap(corr_matrix, annot=True)
# Show and clear the figure so the cell does not end on the Axes object and
# the heatmap does not remain the current figure for the next plotting cell.
plt.show(); plt.clf()

In [None]:
# Value counts, mean SalePrice per value, and boxplot for YrSold.
cat_inspect(col='YrSold')

In [None]:
# Value counts, mean SalePrice per value, and boxplot for FullBath.
cat_inspect(col='FullBath')

In [None]:
# Value counts, mean SalePrice per value, and boxplot for BedroomAbvGr.
cat_inspect(col='BedroomAbvGr')

In [None]:
# Value counts, mean SalePrice per value, and boxplot for CentralAir.
cat_inspect(col='CentralAir')

In [None]:
# Value counts, mean SalePrice per value, and boxplot for GarageCars.
cat_inspect(col='GarageCars')

In [None]:
# Value counts, mean SalePrice per value, and boxplot for BsmtQual.
cat_inspect(col='BsmtQual')

# Cross-tabulate BsmtQual (rows) against OverallCond (columns).
bsmt_vs_cond = pd.crosstab(index=home_data['BsmtQual'], columns=home_data['OverallCond'])
print(bsmt_vs_cond)

In [None]:
# Value counts, mean SalePrice per value, and boxplot for Neighborhood.
cat_inspect(col='Neighborhood')

# Feature Engineering

In [None]:
# Reduced MSSubClass: the values 20, 60 and 50 are kept, every other value
# becomes 'Other'.
kept_subclasses = [20, 60, 50]
home_data['MSSubClass_Red'] = home_data['MSSubClass'].apply(
    lambda val: val if val in kept_subclasses else 'Other'
)

# Score each neighbourhood as
# (number of rows) * |neighbourhood mean SalePrice - overall mean SalePrice|,
# sorted from highest to lowest score.
neighb_mean_price = home_data.groupby('Neighborhood')['SalePrice'].mean()
abs_neighb_price_diff = (neighb_mean_price - mean_SalePrice).abs()
neighb_counts = home_data['Neighborhood'].value_counts()
weighted_neighb_price_diff = (neighb_counts * abs_neighb_price_diff).sort_values(ascending=False)

# Keep every neighbourhood except the four lowest-scoring ones, which are
# merged into 'Other'.
neighb_list = weighted_neighb_price_diff.index[:-4]
home_data['Neighborhood_Red'] = home_data['Neighborhood'].apply(
    lambda val: val if val in neighb_list else 'Other'
)

# Data Selection

In [None]:
# Prediction target.
y = home_data['SalePrice']

numerical_features = [
    'LotArea', 'YearBuilt', '1stFlrSF', 'FullBath', 'BedroomAbvGr',
    'OverallQual', 'OverallCond', 'GrLivArea', 'GarageCars',
]
categorical_features = [
    'MSZoning', 'MSSubClass_Red', 'CentralAir', 'BsmtQual', 'Neighborhood_Red',
]
features_list = [*numerical_features, *categorical_features]

# One-hot encode only the categorical columns; the numerical ones are left as-is.
X_w_categorical = home_data[features_list]
X = pd.get_dummies(data=X_w_categorical, columns=categorical_features)

# Hold out a validation split; random_state is fixed so the split is the same
# on every run.
split_parts = train_test_split(X, y, random_state=1)
train_X, val_X, train_y, val_y = split_parts

# Model Training and Tuning

In [None]:
# Candidate values per hyperparameter; every combination of them is evaluated.
n_estim_vals = [200]
max_feat_vals = [0.5]
max_dep_vals = [20]
min_samp_split_vals = [2]
max_leaf_vals = [500]
crit_vals = ['squared_error']

# Number of differently seeded forests whose validation MAE is averaged per
# combination.
tests = 20
results_list = []

# itertools.product walks the grid in the same order as nested for-loops
# written in this argument order (the right-most list varies fastest).
param_grid = itertools.product(
    n_estim_vals, max_feat_vals, max_dep_vals,
    min_samp_split_vals, max_leaf_vals, crit_vals,
)

for n_estim_val, max_feat_val, max_dep_val, min_samp_split_val, max_leaf_val, crit_val in param_grid:
    # Fresh list per combination so no score can carry over from an earlier one.
    test_mae_list = []
    for test in range(tests):
        model = RandomForestRegressor(
            random_state=test,
            n_estimators=n_estim_val,
            max_features=max_feat_val,
            max_depth=max_dep_val,
            min_samples_split=min_samp_split_val,
            max_leaf_nodes=max_leaf_val,
            criterion=crit_val,
        )
        model.fit(train_X, train_y)
        pred_y = model.predict(val_X)
        test_mae_list.append(mean_absolute_error(val_y, pred_y))

    mae = round(mean(test_mae_list), 2)
    results_list.append([mae, n_estim_val, max_feat_val, max_dep_val, min_samp_split_val, max_leaf_val, crit_val])

results = pd.DataFrame(
    results_list,
    columns=['MAE', 'n_estimators', 'max_features', 'max_depth', 'min_samples_split', 'max_leaf_nodes', 'criterion'],
)
print("Complete")

In [None]:
# Show the 20 parameter combinations with the lowest MAE.
ranked_results = results.sort_values(by='MAE')
print(ranked_results.head(20))

In [None]:
# Fit the same configuration under `repeats` different seeds and average the
# validation MAE over all of them.
repeats = 100
agg_mae_list = []

for repeat in range(repeats):
    model = RandomForestRegressor(
        n_estimators=200,
        max_features=0.5,
        max_depth=20,
        max_leaf_nodes=500,
        random_state=repeat,
    )
    model.fit(train_X, train_y)
    pred_y = model.predict(val_X)
    agg_mae_list.append(mean_absolute_error(val_y, pred_y))

agg_mae = mean(agg_mae_list)
print(f"Mean absolute error: {agg_mae:.2f}")

# Final Model Predictions 

In [None]:
# Fit the final forest on all rows of X. random_state is fixed so that
# re-running the notebook reproduces the same submission.
full_model = RandomForestRegressor(random_state=0, n_estimators=200, max_features=0.5, max_depth=20, max_leaf_nodes=500)
full_model.fit(X, y)

test_data = pd.read_csv('test.csv')

# test_data Prep
# Assign filled columns back rather than using fillna(inplace=True) on a
# column selection: the in-place form is chained assignment and does not
# update the frame under pandas Copy-on-Write.
test_data['BsmtQual'] = test_data['BsmtQual'].fillna('NA')
test_data['LotFrontage'] = test_data['LotFrontage'].fillna(0)
test_data['MSSubClass_Red'] = test_data['MSSubClass'].apply(lambda val: 'Other' if val not in [20, 60, 50] else val)
test_data['Neighborhood_Red'] = test_data['Neighborhood'].apply(lambda val: 'Other' if val not in neighb_list else val)

test_data['GarageCars'] = test_data['GarageCars'].fillna(0)

test_X_w_categorical = test_data[features_list]
test_X = pd.get_dummies(test_X_w_categorical, columns=categorical_features)

# get_dummies derives its columns from the categories present in *this* frame,
# so their set and order can differ from the training matrix. Align to X:
# dummies missing from the test data are added as all-zero columns, categories
# the model never saw are dropped, and column order matches what it was fit on.
test_X = test_X.reindex(columns=X.columns, fill_value=0)

test_preds = full_model.predict(test_X)

# Submission

In [None]:
# Write one predicted SalePrice per test Id to the submission file.
submission_columns = {'Id': test_data['Id'], 'SalePrice': test_preds}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)