third attempt

In [5]:
#TODO:
# - implement cross validation 
# - tune hyperparameters
# - try more complex models
# - feature selection
# - more organized pipeline

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge



In [7]:
df = pd.read_csv('train.csv')

df.drop("Id", inplace=True, axis=1)
df.head()

num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

#removing correlated features 
corr = df[num_cols].corr()
# plt.figure(figsize=(16, 12))
# sns.heatmap(corr, annot=False, cmap='coolwarm', center=0)
# plt.title('Feature Correlation Heatmap')
# plt.show()

corrs = corr.abs().unstack().sort_values(ascending=False)
corrs = corrs[corrs != 1]

print("FEATURES THAT ARE HIGHLY CORRELATED TO EACH OTHER SUMMARY:\n", "="*60)
i = 0
for x in corrs:
    print(f"{corrs.index[i]} have a correlation of: {x}")
    i += 1
    if i == 16:  
        break

# Removing highly correlated features
drop_cols = ['GarageCars','GarageYrBlt',]
df.drop(drop_cols, axis=1, inplace=True)

FEATURES THAT ARE HIGHLY CORRELATED TO EACH OTHER SUMMARY:
('GarageCars', 'GarageArea') have a correlation of: 0.882475414281462
('GarageArea', 'GarageCars') have a correlation of: 0.882475414281462
('YearBuilt', 'GarageYrBlt') have a correlation of: 0.8256674841743408
('GarageYrBlt', 'YearBuilt') have a correlation of: 0.8256674841743408
('GrLivArea', 'TotRmsAbvGrd') have a correlation of: 0.8254893743088425
('TotRmsAbvGrd', 'GrLivArea') have a correlation of: 0.8254893743088425
('TotalBsmtSF', '1stFlrSF') have a correlation of: 0.8195299750050339
('1stFlrSF', 'TotalBsmtSF') have a correlation of: 0.8195299750050339
('SalePrice', 'OverallQual') have a correlation of: 0.7909816005838053
('OverallQual', 'SalePrice') have a correlation of: 0.7909816005838053
('GrLivArea', 'SalePrice') have a correlation of: 0.7086244776126515
('SalePrice', 'GrLivArea') have a correlation of: 0.7086244776126515
('2ndFlrSF', 'GrLivArea') have a correlation of: 0.6875010641666033
('GrLivArea', '2ndFlrSF') h

In [8]:
X = df.drop("SalePrice", axis=1)
y = np.log1p(df["SalePrice"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('yeojohnson', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

models = {
    'Linear': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge()
}

for name, model in models.items():
    bundle = Pipeline(steps=[
        ('preprocessor', preprocessor),
        (name.lower(), model)
    ])
    scores = cross_val_score(bundle, X, y, cv=5, scoring='neg_mean_absolute_error')
    print(f"{name} CV MAE: {-scores.mean():.5f}")


Linear CV MAE: 4416192730.88138
Lasso CV MAE: 0.30989
Lasso CV MAE: 0.30989
Ridge CV MAE: 5228989933.89106
Ridge CV MAE: 5228989933.89106


In [None]:

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    maes = []
    for train_idx, test_idx in kf.split(X):
        X_train_cv, X_test_cv = X.iloc[train_idx], X.iloc[test_idx]
        y_train_cv, y_test_cv = y.iloc[train_idx], y.iloc[test_idx]
        bundle = Pipeline(steps=[
            ('preprocessor', preprocessor),
            (name.lower(), model)
        ])
        bundle.fit(X_train_cv, y_train_cv)
        preds = bundle.predict(X_test_cv)
        mae = mean_absolute_error(y_test_cv, preds)
        maes.append(mae)
    print(f"{name} MAE (CV splits): {maes}")
    print(f"{name} Mean MAE: {np.mean(maes):.5f}")

Linear MAE (CV splits): [0.1123580990866465, 0.08137400741729582, 0.09298220206107188, 0.08535709926890397, 25748928577.65387]
Linear Mean MAE: 5149785715.60519
Lasso MAE (CV splits): [0.33713389241665187, 0.3021658789730622, 0.2925313333043492, 0.3293569525343518, 0.28902746976722204]
Lasso Mean MAE: 0.31004
Lasso MAE (CV splits): [0.33713389241665187, 0.3021658789730622, 0.2925313333043492, 0.3293569525343518, 0.28902746976722204]
Lasso Mean MAE: 0.31004
Ridge MAE (CV splits): [0.08552965895841952, 0.0809379274260034, 0.0882071707901046, 0.08604581604691014, 24449369851.541565]
Ridge Mean MAE: 4889873970.37646
Ridge MAE (CV splits): [0.08552965895841952, 0.0809379274260034, 0.0882071707901046, 0.08604581604691014, 24449369851.541565]
Ridge Mean MAE: 4889873970.37646
