In [193]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

stuff I want to try differently this time:

-implementing a pipeline

-removing correlated features

-use logistic scaling on target value

-use boxcox scaling on skewed features

In [194]:
df = pd.read_csv('train.csv')

df.drop("Id", inplace=True, axis=1)
df.head()

num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [195]:
#removing correlated features 
corr = df[num_cols].corr()
# plt.figure(figsize=(16, 12))
# sns.heatmap(corr, annot=False, cmap='coolwarm', center=0)
# plt.title('Feature Correlation Heatmap')
# plt.show()

corrs = corr.abs().unstack().sort_values(ascending=False)
corrs = corrs[corrs != 1]

print("FEATURES THAT ARE HIGHLY CORRELATED TO EACH OTHER SUMMARY:\n", "="*60)
i = 0
for x in corrs:
    print(f"{corrs.index[i]} have a correlation of: {x}")
    i += 1
    if i == 16:  
        break

# Removing highly correlated features
drop_cols = ['GarageCars','GarageYrBlt',]
df.drop(drop_cols, axis=1, inplace=True)

FEATURES THAT ARE HIGHLY CORRELATED TO EACH OTHER SUMMARY:
('GarageCars', 'GarageArea') have a correlation of: 0.882475414281462
('GarageArea', 'GarageCars') have a correlation of: 0.882475414281462
('YearBuilt', 'GarageYrBlt') have a correlation of: 0.8256674841743408
('GarageYrBlt', 'YearBuilt') have a correlation of: 0.8256674841743408
('GrLivArea', 'TotRmsAbvGrd') have a correlation of: 0.8254893743088425
('TotRmsAbvGrd', 'GrLivArea') have a correlation of: 0.8254893743088425
('TotalBsmtSF', '1stFlrSF') have a correlation of: 0.8195299750050339
('1stFlrSF', 'TotalBsmtSF') have a correlation of: 0.8195299750050339
('SalePrice', 'OverallQual') have a correlation of: 0.7909816005838053
('OverallQual', 'SalePrice') have a correlation of: 0.7909816005838053
('GrLivArea', 'SalePrice') have a correlation of: 0.7086244776126515
('SalePrice', 'GrLivArea') have a correlation of: 0.7086244776126515
('2ndFlrSF', 'GrLivArea') have a correlation of: 0.6875010641666033
('GrLivArea', '2ndFlrSF') h

In [196]:
X = df.drop("SalePrice", axis=1)
y = np.log1p(df["SalePrice"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [197]:
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('yeojohnson', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

model = LinearRegression()

bundle = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

bundle.fit(X_train, y_train)

predictions = bundle.predict(X_test)

print('MAE:', mean_absolute_error(y_test, predictions))


MAE: 0.08425502219844394


First MAE: 19488.45336180804

Second MAE: MAE: 0.08455341302003157 (after log scaling the sales price WTF...)

third MAE: MAE: 0.11195828082943877 (after dropping 5 correlated features)

fourth MAE: MAE: 0.08428716889073772 (after dropping 3 correlated features)

***fith MAE: MAE: 0.08425502219844394 (after dropping 2 correlated features)

sixth MAE: MAE: 0.08474545986700253 (after dropping 1 correlated features)

seventh MAE: MAE: 0.08931800056467047 (removing yeo johnston from pipeline, 2 corr features)

eigth MAE: MAE: 0.08430445415454502 (after dropping standard scalar, 2 corr features)

ninth MAE: MAE: 0.09735830740464979 (after dropping both, 2 corr features)

tenth MAE: MAE: 0.11378258438385334 (after changing imputing method to median)

eleventh MAE: MAE: 0.08436959715402965 (after chaning imputing method to mean)













































In [198]:
print("TRAINING ATTEMPT 2 SUMMARY:\n", "="*75)
original_mae = 27760.797752343573
first_attempt_mae = 19488.45336180804
best_mae = 0.08425502219844394

improvement_first = ((original_mae - first_attempt_mae) / original_mae) * 100
improvement_best = ((original_mae - best_mae) / original_mae) * 100

factor_first = original_mae / first_attempt_mae
factor_best = original_mae / best_mae

percent_first = (original_mae / first_attempt_mae) * 100
percent_best = (original_mae / best_mae) * 100
perecent = (first_attempt_mae / best_mae)
print(f"Original MAE is {percent_first:.2f}% of first attempt MAE.")
print(f"Original MAE is {percent_best:.2f}% of best attempt MAE.")
print(f"From 1st attempt to best attempt, the MAE improved by {perecent:.2f}%")
print("\n\nyay!\n", "="*75)

TRAINING ATTEMPT 2 SUMMARY:
Original MAE is 142.45% of first attempt MAE.
Original MAE is 32948537.70% of best attempt MAE.
From 1st attempt to best attempt, the MAE improved by 231303.17%


yay!


In [199]:
#TODO: note some observations from above..
# main Q-> why was logistic scaling so impactful and understand it and understand code esp pipeline
# next step trying different models and incorperate hyperparameter tuning,
# features selection and cross-validation
# look into finding a way to automate this process for the future