In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tabulate import tabulate
 
# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
# Read the raw housing table from disk.
df = pd.read_csv("C:/Users/maths/Desktop/FML-AVI-230957164/Week 4/Housing.csv")

# Fail fast when the regression target is absent from the loaded table.
has_target = 'price' in df.columns
if not has_target:
    raise KeyError("The 'price' column is not found in the DataFrame.")

# ---------------------------------------------------------------------------
# Feature preparation
# ---------------------------------------------------------------------------
# One-hot encode the categorical columns (drop_first=True removes one level
# per category so the dummy columns are not perfectly collinear).
df = pd.get_dummies(df, drop_first=True)

# Target is the 'price' column; every remaining column is a feature.
y = df["price"]
X = df.drop(columns=["price"])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Persist the held-out rows (features followed by the target) as a CSV file.
test_df = pd.concat([X_test, y_test], axis=1)
test_df.to_csv(
    "C:\\Users\\maths\\Desktop\\FML-AVI-230957164\\Week 4\\HousePriceTest.csv",
    index=False,
)

# ---------------------------------------------------------------------------
# Scaling
# ---------------------------------------------------------------------------
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------------------------------------------------------
# Model fitting and test-set predictions
# ---------------------------------------------------------------------------
# Both regularised linear models use alpha=1.0; fit() returns the estimator itself.
ridge = Ridge(alpha=1.0).fit(X_train_scaled, y_train)
lasso = Lasso(alpha=1.0).fit(X_train_scaled, y_train)

# Predict on the scaled test features.
y_pred_ridge = ridge.predict(X_test_scaled)
y_pred_lasso = lasso.predict(X_test_scaled)
 
# Helper that summarises regression quality for one model
def evaluate_model(y_true, y_pred, model_name):
    """Compute standard regression metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    model_name : str
        Label stored under the ``"Model"`` key of the result.

    Returns
    -------
    dict
        Keys, in order: ``"Model"``, ``"MAE"`` (mean absolute error),
        ``"MSE"`` (mean squared error), ``"RMSE"`` (square root of the MSE)
        and ``"R2 Score"`` (coefficient of determination).
    """
    squared_error = mean_squared_error(y_true, y_pred)

    # Build the report key by key so the column order of any DataFrame
    # created from it follows the insertion order below.
    report = {"Model": model_name}
    report["MAE"] = mean_absolute_error(y_true, y_pred)
    report["MSE"] = squared_error
    report["RMSE"] = np.sqrt(squared_error)
    report["R2 Score"] = r2_score(y_true, y_pred)
    return report
 
# Evaluate Ridge and Lasso models on the held-out test targets
ridge_results = evaluate_model(y_test, y_pred_ridge, "Ridge Regression")
lasso_results = evaluate_model(y_test, y_pred_lasso, "Lasso Regression")

# Tabulate performance; performance_df keeps the raw numeric metrics
performance_df = pd.DataFrame([ridge_results, lasso_results])

# tabulate's 'pretty' format disables number parsing, so a floatfmt argument
# is silently ignored and the metrics print at full precision. Format the
# numeric columns to two decimals on a display-only copy instead.
performance_table = performance_df.copy()
for metric in performance_df.select_dtypes(include="number").columns:
    performance_table[metric] = performance_df[metric].map("{:.2f}".format)

print("\nModel Performance:\n")
print(tabulate(performance_table, headers='keys', tablefmt='pretty'))

# Align the test frame with the training feature layout: keep exactly the
# columns of X, in the same order, and fill any feature column that is absent
# with 0. This also removes 'price', because X does not contain the target.
test_df = test_df.reindex(columns=X.columns, fill_value=0)

# Standardize with the scaler that was fitted on the training features.
# NOTE(review): test_df is built from the same held-out split (X_test), not
# from a separately loaded file, so these predictions are expected to match
# the test-set predictions computed earlier — confirm this is intended.
X_unseen_scaled = scaler.transform(test_df)

# Make predictions on the aligned data
y_pred_ridge_unseen = ridge.predict(X_unseen_scaled)
y_pred_lasso_unseen = lasso.predict(X_unseen_scaled)

# Save predictions to CSV (one row per input row, no index column)
predictions_df = pd.DataFrame({
    "Ridge Predictions": y_pred_ridge_unseen,
    "Lasso Predictions": y_pred_lasso_unseen
})
predictions_df.to_csv("C:\\Users\\maths\\Desktop\\FML-AVI-230957164\\Week 4\\HousePricePredictions.csv", index=False)


Model Performance:

+---+------------------+-------------------+--------------------+--------------------+--------------------+
|   |      Model       |        MAE        |        MSE         |        RMSE        |      R2 Score      |
+---+------------------+-------------------+--------------------+--------------------+--------------------+
| 0 | Ridge Regression | 920170.5517600026 | 1522881286264.3906 | 1234050.7632445234 | 0.6463671758328557 |
| 1 | Lasso Regression | 920392.7500197398 | 1523019775772.074  | 1234106.8737236958 | 0.6463350167695321 |
+---+------------------+-------------------+--------------------+--------------------+--------------------+
