## Baseline vs Model

In [1]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the cleaned, encoded train and test tables from the working directory
train3, test3 = (
    pd.read_csv(csv_path)
    for csv_path in ('./clean_train.csv', './clean_test.csv')
)

FileNotFoundError: [Errno 2] No such file or directory: './clean_train.csv'

## Train Test Split

In [None]:
# Separate the predictors from the target; 'Unnamed: 0' is dropped along with SalePrice
X = train3.drop(columns=['SalePrice', 'Unnamed: 0'])
y = train3['SalePrice']

# Hold out 20% of the rows for evaluation (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the dimensions of each partition
for _label, _part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(_label, _part.shape)

In [None]:
# Display the column labels of the loaded training frame
train3.columns

## Scaling The Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Column groups by dtype: int64/float64 columns get scaled, object columns are only listed
numerical_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X_train.select_dtypes(include=['object']).columns)

# Learn the scaling parameters from the training rows only
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])

# Work on copies so X_train / X_test keep their unscaled values
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

# Apply the training-set scaling to both partitions
X_train_scaled[numerical_features] = scaler.transform(X_train[numerical_features])
X_test_scaled[numerical_features] = scaler.transform(X_test[numerical_features])

## Establishing A Baseline Model

In [None]:
from sklearn.metrics import mean_squared_error

# Naive baseline: predict the training-set mean SalePrice for every held-out row
mean_sale_price = y_train.mean()
baseline_predictions = [mean_sale_price for _ in range(len(y_test))]

# Score the constant prediction; RMSE is the square root of the MSE
baseline_mse = mean_squared_error(y_test, baseline_predictions)
baseline_rmse = baseline_mse ** 0.5

print("Baseline RMSE:", baseline_rmse)

## Fitting Linear, Lasso & Ridge Regression Models

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Build each estimator and train it on the scaled training features
linear_reg_model = LinearRegression().fit(X_train_scaled, y_train)
lasso_model = Lasso(alpha=10, max_iter=10000).fit(X_train_scaled, y_train)
ridge_model = Ridge(alpha=10, max_iter=10000).fit(X_train_scaled, y_train)

# Predict on the held-out rows
linear_reg_predictions, lasso_predictions, ridge_predictions = (
    fitted.predict(X_test_scaled)
    for fitted in (linear_reg_model, lasso_model, ridge_model)
)

# Test-set RMSE (square root of the MSE) for each model
linear_reg_rmse, lasso_rmse, ridge_rmse = (
    mean_squared_error(y_test, predicted) ** .5
    for predicted in (linear_reg_predictions, lasso_predictions, ridge_predictions)
)

# Test-set R-squared for each model
linear_reg_r2, lasso_r2, ridge_r2 = (
    r2_score(y_test, predicted)
    for predicted in (linear_reg_predictions, lasso_predictions, ridge_predictions)
)

# Print one section per model, separated by a blank line
_test_report = (
    ("Linear Regression (Test Set):", linear_reg_rmse, linear_reg_r2),
    ("Lasso Regression (Test Set):", lasso_rmse, lasso_r2),
    ("Ridge Regression (Test Set):", ridge_rmse, ridge_r2),
)
for _position, (_heading, _rmse, _r2) in enumerate(_test_report):
    if _position:
        print()
    print(_heading)
    print("RMSE:", _rmse)
    print("R-squared Score:", _r2)

In [None]:
# Count missing values per column in the scaled training features
X_train_scaled.isna().sum()

In [None]:
# Score the already-fitted models on the TRAINING rows.
# NOTE: this reuses the same variable names as the test-set cell, so after this
# cell runs the *_predictions, *_rmse and *_r2 names hold training-set values.
linear_reg_predictions, lasso_predictions, ridge_predictions = (
    fitted.predict(X_train_scaled)
    for fitted in (linear_reg_model, lasso_model, ridge_model)
)

# Training-set RMSE (square root of the MSE) for each model
linear_reg_rmse, lasso_rmse, ridge_rmse = (
    mean_squared_error(y_train, predicted) ** .5
    for predicted in (linear_reg_predictions, lasso_predictions, ridge_predictions)
)

# Training-set R-squared for each model
linear_reg_r2, lasso_r2, ridge_r2 = (
    r2_score(y_train, predicted)
    for predicted in (linear_reg_predictions, lasso_predictions, ridge_predictions)
)

# Print one section per model, separated by a blank line
_train_report = (
    ("Linear Regression: (Training Set)", linear_reg_rmse, linear_reg_r2),
    ("Lasso Regression (Training Set):", lasso_rmse, lasso_r2),
    ("Ridge Regression (Training Set):", ridge_rmse, ridge_r2),
)
for _position, (_heading, _rmse, _r2) in enumerate(_train_report):
    if _position:
        print()
    print(_heading)
    print("RMSE:", _rmse)
    print("R-squared Score:", _r2)

In [None]:
# Display the coefficients learned by the fitted LinearRegression model
linear_reg_model.coef_

## Model Analysis

All three models, Linear Regression, Lasso Regression, and Ridge Regression, achieved similar performance in terms of RMSE and R-squared score, indicating they fit the training data well and explained around 84% of the variance in the target variable.

Similarly, all models exhibit similar performance on the test set, with slightly higher RMSE values and slightly lower R-squared scores than on the training set. This suggests that the models generalize reasonably well to unseen data, though there is a slight drop in performance compared to the training set.

**Comparison:**

There is little difference in performance between Linear Regression, Lasso Regression, and Ridge Regression, with each method yielding comparable results. However, Ridge Regression slightly outperforms the other methods on the test set in terms of RMSE and R-squared score. Overall, the models provide a reasonable fit to the data, but there may be opportunities for further refinement or exploration of different modeling techniques to potentially improve predictive performance.


## Final Recommendations

Assessing the value of a house is a challenging task, but I would recommend that buyers and sellers take into account the following factors. 

* The quality of materials that went into constructing the home
* The square footage of the above ground living area
* The number of cars the garage can hold
* The neighborhood the home is located in as well as the year the home was built

#### Creating Kaggle CSV

In [None]:
# Feature columns selected from the test frame for the Kaggle predictions
final_columns = [
    # overall ratings and age
    'Overall Qual',
    'Overall Cond',
    'Year Built',
    'Year Remod/Add',
    # exterior / floor areas
    'Mas Vnr Area',
    '1st Flr SF',
    '2nd Flr SF',
    'Gr Liv Area',
    # rooms and baths
    'Full Bath',
    'Half Bath',
    'Bedroom AbvGr',
    'TotRms AbvGrd',
    'Fireplaces',
    # garage
    'Garage Cars',
    'Garage Area',
    # outdoor areas
    'Wood Deck SF',
    'Open Porch SF',
]

In [None]:
# Take an independent copy of the selected feature columns. Later cells assign new
# columns into final_test; doing that on a bare slice of test3 triggers pandas'
# SettingWithCopyWarning and makes it ambiguous whether the write sticks.
final_test = test3[final_columns].copy()

In [None]:
# Reuse the scaler that was fitted on X_train instead of fitting a new one on the
# Kaggle test data: the models were trained on features standardised with the
# training statistics, so the test features must go through the same transformation.
# Reading the unscaled values from test3 keeps this cell idempotent (re-running it
# does not scale already-scaled values a second time).
final_test[numerical_features] = scaler.transform(test3[numerical_features])

In [None]:
# Predict from the feature columns only. Selecting final_columns explicitly keeps the
# cell re-runnable: once 'SalePrice' (and later 'Unnamed: 0') have been added to
# final_test, passing the whole frame would hand the model extra columns.
final_test['SalePrice'] = linear_reg_model.predict(final_test[final_columns])

In [None]:
# Display the predicted SalePrice column
final_test['SalePrice']

In [None]:
# Copy test3's 'Unnamed: 0' column onto final_test
# NOTE(review): presumably this is the row identifier the submission expects — confirm
final_test['Unnamed: 0'] = test3['Unnamed: 0']

In [None]:
# Preview the two columns that go into the submission
final_test[['Unnamed: 0', 'SalePrice']]

In [None]:
# Keep only the 'Unnamed: 0' and predicted 'SalePrice' columns for the submission
submission = final_test[['Unnamed: 0', 'SalePrice']]

In [None]:
# Write the submission to KaggleSubmission.csv without the DataFrame index
submission.to_csv('KaggleSubmission.csv', index = False)