In [1]:
import numpy as np
import pandas as pd

# Load the dataset
data = pd.read_csv('/Users/arnav/Downloads/CarPrices/CarPrice_Assignment.csv')

# Preprocess the data
data = data.drop(columns=['car_ID'])

# Convert categorical columns to numerical using one-hot encoding
data = pd.get_dummies(data, drop_first=True)

# Separate the target variable (price) and features (X)
X = data.drop(columns=['price'])
y = data['price']

# Convert all features to numeric, coercing errors to NaN
X = X.apply(pd.to_numeric, errors='coerce')

# Fill missing values with column means
X.fillna(X.mean(), inplace=True)
y = pd.to_numeric(y, errors='coerce').fillna(y.mean())

# Remove outliers based on price (e.g., prices above 3 standard deviations)
mean_price = y.mean()
std_price = y.std()
mask = (y > mean_price - 3 * std_price) & (y < mean_price + 3 * std_price)
X = X[mask]
y = y[mask]

# Drop any rows with NaN values after filtering
X = X.dropna()
y = y.loc[X.index]  # Align y with the filtered X

# Feature Engineering Enhancements
if 'engine_size' in X.columns:
    X['log_engine_size'] = np.log(X['engine_size'] + 1)

if 'horsepower' in X.columns:
    X['horsepower_squared'] = np.log(X['horsepower'] + 1) ** 2

if 'log_engine_size' in X.columns and 'horsepower' in X.columns:
    X['engine_horsepower_interaction'] = X['log_engine_size'] * np.log(X['horsepower'] + 1)

if 'year' in X.columns:
    X['year_bins'] = pd.cut(X['year'], bins=5, labels=False)

# Remove features with zero variance
zero_variance_columns = X.columns[X.var() == 0]
X.drop(columns=zero_variance_columns, inplace=True)

# Remove highly correlated features
def remove_highly_correlated_features(X, threshold=0.9):
    correlation_matrix = X.corr().abs()
    features_to_drop = set()
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if correlation_matrix.iloc[i, j] > threshold:
                colname = correlation_matrix.columns[i]
                features_to_drop.add(colname)
    X = X.drop(columns=features_to_drop)
    return X

X = remove_highly_correlated_features(X, threshold=0.9)

# Remove features with low variance
low_variance_columns = X.columns[X.var() < 1e-3]
X.drop(columns=low_variance_columns, inplace=True)

# Split the dataset into training and testing sets (80% train, 20% test)
train_size = int(0.8 * X.shape[0])
X_train = X[:train_size]
X_test = X[train_size:]
y_train = y[:train_size]
y_test = y[train_size:]

# Feature Scaling: Standardize the data manually (mean=0, std=1) for each feature
def standardize(X_train, X_test):
    X_train = X_train.astype(float)
    X_test = X_test.astype(float)

    X_train_mean = np.mean(X_train, axis=0)
    X_train_std = np.std(X_train, axis=0)
    
    X_train_std[X_train_std == 0] = 1

    X_train_scaled = (X_train - X_train_mean) / X_train_std
    X_test_scaled = (X_test - X_train_mean) / X_train_std
    return X_train_scaled, X_test_scaled

X_train_scaled, X_test_scaled = standardize(X_train.to_numpy(), X_test.to_numpy())

# Ridge Regression with cross-validation to find optimal lambda
def ridge_regression_cv(X, y, alphas, k_folds=5):
    n_samples, n_features = X.shape
    fold_size = n_samples // k_folds
    mse_scores = np.zeros((len(alphas), k_folds))

    for i, alpha in enumerate(alphas):
        for k in range(k_folds):
            start_idx = k * fold_size
            end_idx = (k + 1) * fold_size
            X_train_fold = np.concatenate([X[:start_idx], X[end_idx:]], axis=0)
            y_train_fold = np.concatenate([y[:start_idx], y[end_idx:]], axis=0)
            X_val_fold = X[start_idx:end_idx]
            y_val_fold = y[start_idx:end_idx]

            theta = ridge_regression(X_train_fold, y_train_fold, alpha)
            y_pred_val = X_val_fold.dot(theta)
            mse_scores[i, k] = np.mean((y_pred_val - y_val_fold) ** 2)

    mean_mse_scores = np.mean(mse_scores, axis=1)
    best_alpha_index = np.argmin(mean_mse_scores)
    best_alpha = alphas[best_alpha_index]

    return best_alpha

# Ridge Regression
def ridge_regression(X, y, lambda_=1.0):
    identity_matrix = np.eye(X.shape[1])
    identity_matrix[0, 0] = 0  # Don't regularize the intercept term
    return np.linalg.pinv(X.T.dot(X) + lambda_ * identity_matrix).dot(X.T).dot(y)

# RMSE Calculation
def calculate_rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_pred - y_true) ** 2))

# R-squared Calculation
def calculate_r_squared(y_true, y_pred):
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    ss_residual = np.sum((y_true - y_pred) ** 2)
    return 1 - (ss_residual / ss_total)

# Try to train the model and catch potential errors
try:
    alphas = np.logspace(-6, 6, 10)
    best_alpha = ridge_regression_cv(X_train_scaled, y_train, alphas)
    theta = ridge_regression(X_train_scaled, y_train, best_alpha)
except np.linalg.LinAlgError as e:
    print(f"Error during training: {e}")
    theta = None

# If theta was computed successfully, predict and evaluate
if theta is not None:
    y_pred = X_test_scaled.dot(theta)
    mse = np.mean((y_pred - y_test) ** 2)
    rmse = calculate_rmse(y_test, y_pred)
    r_squared = calculate_r_squared(y_test, y_pred)

    print(f'Mean Squared Error: {mse}')
    print(f'Root Mean Squared Error: {rmse}')
    print(f'R-squared: {r_squared}')
    print(f'Best Lambda: {best_alpha}')
    print(f'Coefficients (theta): {theta}')

Mean Squared Error: 185331753.1796174
Root Mean Squared Error: 13613.660535639097
R-squared: -9.29935215423068
Best Lambda: 46415.888336127726
Coefficients (theta): [ 1.43130971e+02  1.38099170e+01  1.73091688e+01  1.95393803e+01
  2.24332253e+00  2.17711294e+01  2.27611771e+01  1.45996073e+01
  2.70647556e+00  4.19474356e+00  2.14930840e+01 -2.34776279e+00
 -1.78014096e+01  9.54938071e-01  1.05994988e-01  9.27160134e-01
  2.73354329e-01  1.74480431e+00  2.95112668e+00  1.60946756e+00
  1.35540496e+00  5.96679549e-01  1.48015575e+00  2.21044901e+00
  6.20141254e+00  4.84519873e+00  3.14106796e+00  4.13611345e+00
  4.19648736e+00  3.45976737e+00  5.78542938e+00  5.09823899e+00
  5.91150596e+00 -2.04579064e+00 -1.71582130e+00 -1.61220561e+00
 -1.69756238e+00 -1.36858392e+00 -1.61916909e+00 -1.14362295e+00
 -4.42391606e-02 -1.30129639e+00 -9.79094645e-01 -1.73954001e+00
 -1.91322984e+00 -1.35541861e+00 -1.66399802e+00 -1.43421114e+00
 -2.07800785e+00 -6.50913845e-01 -9.53733304e-01 -1.429