# Lab Assignment 3: Machine Learning (UML501)
This notebook contains solutions for all the questions of the lab assignment.

## Q1: K-Fold Cross Validation for Multiple Linear Regression (Least Square Error Fit)

In [11]:

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Load dataset from file (USA_Housing.csv)
url = "USA_Housing.csv"
df = pd.read_csv(url)

# Split features and target
X = df.drop('Price', axis=1).values
y = df['Price'].values

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores = []
betas = []

for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Least squares solution: beta = (X^T X)^-1 X^T y
    X_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    beta = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y_train
    
    X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
    y_pred = X_test_b @ beta
    
    score = r2_score(y_test, y_pred)
    r2_scores.append(score)
    betas.append(beta)

# Best beta
best_idx = np.argmax(r2_scores)
best_beta = betas[best_idx]

print("R2 Scores for each fold:", r2_scores)
print("Best Beta (max R2):", best_beta)


R2 Scores for each fold: [0.9179971706985147, 0.9145677884802819, 0.9116116385364478, 0.9193091764960816, 0.9243869413350316]
Best Beta (max R2): [1.23161736e+06 2.30225051e+05 1.63956839e+05 1.21115120e+05
 7.83467170e+02 1.50662447e+05]


In [12]:

# Train with 70% data and test with 30%
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]

y_pred = X_test_b @ best_beta
print("Final R2 score using best beta:", r2_score(y_test, y_pred))


Final R2 score using best beta: 0.9147458156636434


## Q2: Concept of Validation set for Multiple Linear Regression (Gradient Descent Optimization)

In [13]:

# Split into training (56%), validation (14%), test (30%)
X_train_val, X_test, y_train_val, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

def gradient_descent(X, y, lr, n_iter=1000):
    m, n = X.shape
    X_b = np.c_[np.ones((m, 1)), X]
    beta = np.zeros(X_b.shape[1])
    
    for _ in range(n_iter):
        gradient = -2/m * X_b.T @ (y - X_b @ beta)
        beta -= lr * gradient
    return beta

learning_rates = [0.001, 0.01, 0.1, 1]
results = {}

for lr in learning_rates:
    beta = gradient_descent(X_train, y_train, lr)
    
    X_val_b = np.c_[np.ones((X_val.shape[0], 1)), X_val]
    y_val_pred = X_val_b @ beta
    val_score = r2_score(y_val, y_val_pred)
    
    X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
    y_test_pred = X_test_b @ beta
    test_score = r2_score(y_test, y_test_pred)
    
    results[lr] = {"beta": beta, "val_r2": val_score, "test_r2": test_score}

results


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


{0.001: {'beta': array([1065444.29940522,  198687.06076461,  140098.69079504,
          103050.27041846,   25950.4759087 ,  125942.09158332]),
  'val_r2': 0.6820187423659017,
  'test_r2': 0.6490453443347961},
 0.01: {'beta': array([1232618.31836202,  230067.95333238,  163710.26584918,
          121680.22876975,    2833.37135223,  150657.57448494]),
  'val_r2': 0.909799626728122,
  'test_r2': 0.9147569598865972},
 0.1: {'beta': array([1232618.32011841,  230067.9889464 ,  163710.33259401,
          121681.42752283,    2832.15066521,  150657.52262836]),
  'val_r2': 0.9097995626742027,
  'test_r2': 0.9147570103083724},
 1: {'beta': array([-1.27984491e+284, -7.93780641e+283, -1.89878064e+283,
         -1.21190221e+285, -1.19690539e+285,  7.62932836e+283]),
  'val_r2': -inf,
  'test_r2': -inf}}

## Q3: Pre-processing and Multiple Linear Regression

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load dataset directly from car_details.csv file
columns = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration","num_doors",
           "body_style", "drive_wheels", "engine_location", "wheel_base", "length", "width",
           "height", "curb_weight", "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm", "city_mpg",
           "highway_mpg", "price"]
df = pd.read_csv('car_details.csv', names=columns)

# Replace ? with NaN
df.replace("?", np.nan, inplace=True)

# Impute missing values (central tendency)
for col in df.columns:
    if df[col].dtype == "object":
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col] = df[col].astype(float)
        df[col].fillna(df[col].median(), inplace=True)

# Drop rows with NaN in price
df = df.dropna(subset=["price"])
df["price"] = df["price"].astype(float)

# Preprocessing for categorical variables
num_map = {"two": 2, "three": 3, "four": 4, "five": 5, "six": 6, "eight": 8, "twelve": 12}
df["num_doors"].replace({"two": 2, "four": 4}, inplace=True)
df["num_cylinders"].replace(num_map, inplace=True)

# Dummy encoding
df = pd.get_dummies(df, columns=["body_style", "drive_wheels"], drop_first=True)

# Label encoding
for col in ["make", "aspiration", "engine_location", "fuel_type"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Binary encoding
df["fuel_system"] = df["fuel_system"].apply(lambda x: 1 if "pfi" in x else 0)
df["engine_type"] = df["engine_type"].apply(lambda x: 1 if "ohc" in x else 0)

# Split into features and target
X = df.drop("price", axis=1).values
y = df["price"].values

# Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("R2 score without PCA:", r2_score(y_test, model.predict(X_test)))

# PCA decomposition
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_scaled)
X_train_r, X_test_r, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=42)
model_r = LinearRegression()
model_r.fit(X_train_r, y_train)
print("R2 score with PCA:", r2_score(y_test, model_r.predict(X_test_r)))

R2 score without PCA: 0.789504557673385
R2 score with PCA: 0.7478420860380317


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value