                                                           ML LAB ASSIGNMENT 3
                                                   SUBMITTED BY : ABHAYJEET(102303761)

Q1

In [None]:
# Q1: K-Fold Cross Validation (Least Squares)
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split


df = pd.read_csv("USA_Housing.csv")   

X = df.drop("Price", axis=1).values
y = df["Price"].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

r2_scores = []
betas = []

for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    X_train_b = np.c_[np.ones(X_train.shape[0]), X_train]
    X_test_b = np.c_[np.ones(X_test.shape[0]), X_test]
    
    beta = np.linalg.inv(X_train_b.T @ X_train_b) @ X_train_b.T @ y_train
    y_pred = X_test_b @ beta
    
    score = r2_score(y_test, y_pred)
    
    r2_scores.append(score)
    betas.append(beta)

best_idx = np.argmax(r2_scores)
best_beta = betas[best_idx]

print("R2 scores from folds:", r2_scores)
print("Best R2 score:", r2_scores[best_idx])

# Training on 70% with best beta and test on 30%
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train_b = np.c_[np.ones(X_train.shape[0]), X_train]
X_test_b = np.c_[np.ones(X_test.shape[0]), X_test]

beta_final = np.linalg.inv(X_train_b.T @ X_train_b) @ X_train_b.T @ y_train
y_pred_final = X_test_b @ beta_final
print("Final R2 score on 30% test data:", r2_score(y_test, y_pred_final))


R2 scores from folds: [0.9179971706985147, 0.9145677884802819, 0.9116116385364478, 0.9193091764960816, 0.9243869413350316]
Best R2 score: 0.9243869413350316
Final R2 score on 30% test data: 0.9146818498916266


Q2

In [4]:
# (Gradient Descent Optimization)
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


df = pd.read_csv("USA_Housing.csv")

X = df.drop("Price", axis=1).values
y = df["Price"].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Spliting into 56% train, 14% validation, 30% test
X_temp, X_test, y_temp, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.20, random_state=42)  # 0.20 of 70% = 14%

# Adding bias
X_train_b = np.c_[np.ones(X_train.shape[0]), X_train]
X_val_b = np.c_[np.ones(X_val.shape[0]), X_val]
X_test_b = np.c_[np.ones(X_test.shape[0]), X_test]

def gradient_descent(X, y, lr, iters=1000):
    m, n = X.shape
    beta = np.zeros(n)
    for _ in range(iters):
        grad = (1/m) * X.T @ (X @ beta - y)
        beta -= lr * grad
    return beta

learning_rates = [0.001, 0.01, 0.1, 1]
results = []

for lr in learning_rates:
    beta = gradient_descent(X_train_b, y_train, lr=lr, iters=1000)
    
    y_val_pred = X_val_b @ beta
    y_test_pred = X_test_b @ beta
    
    r2_val = r2_score(y_val, y_val_pred)
    r2_tst = r2_score(y_test, y_test_pred)
    
    results.append((lr, beta, r2_val, r2_tst))

for lr, beta, r2_val, r2_tst in results:
    print(f"LR={lr}, Validation R2={r2_val:.4f}, Test R2={r2_tst:.4f}")

best = max(results, key=lambda x: x[2])
print("\nBest learning rate:", best[0])
print("Best beta coefficients:", best[1][:5], "...")  # showing first 5


LR=0.001, Validation R2=-0.8125, Test R2=-0.9914
LR=0.01, Validation R2=0.9098, Test R2=0.9147
LR=0.1, Validation R2=0.9098, Test R2=0.9148
LR=1, Validation R2=0.9098, Test R2=0.9148

Best learning rate: 0.01
Best beta coefficients: [1232562.51254919  230048.76664688  163686.93503606  121406.94107918
    3117.47363933] ...


In [None]:
# Q3: Pre-processing and Multiple Linear Regression

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# ---------------- Step 1: Load dataset ----------------
cols = ["symboling","normalized_losses","make","fuel_type","aspiration","num_doors",
        "body_style","drive_wheels","engine_location","wheel_base","length","width",
        "height","curb_weight","engine_type","num_cylinders","engine_size","fuel_system",
        "bore","stroke","compression_ratio","horsepower","peak_rpm","city_mpg",
        "highway_mpg","price"]

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(url, names=cols)

# Replacing  "?" with NaN
df.replace("?", np.nan, inplace=True)


df = df.dropna(subset=["price"])

# Converting  numeric columns to numeric type
numeric_cols = ["symboling","normalized_losses","wheel_base","length","width","height",
                "curb_weight","engine_size","bore","stroke","compression_ratio",
                "horsepower","peak_rpm","city_mpg","highway_mpg","price"]

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")


imputer = SimpleImputer(strategy="mean")
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

word_to_num = {
    "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "eight": 8, "twelve": 12
}
df["num_doors"] = df["num_doors"].map(word_to_num)
df["num_cylinders"] = df["num_cylinders"].map(word_to_num)


df[["num_doors","num_cylinders"]] = df[["num_doors","num_cylinders"]].fillna(df[["num_doors","num_cylinders"]].mode().iloc[0])


df = pd.get_dummies(df, columns=["body_style","drive_wheels"], drop_first=True)


label_cols = ["make","aspiration","engine_location","fuel_type"]
le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])

df["fuel_system"] = df["fuel_system"].apply(lambda x: 1 if isinstance(x, str) and "pfi" in x else 0)

df["engine_type"] = df["engine_type"].apply(lambda x: 1 if isinstance(x, str) and "ohc" in x else 0)

X = df.drop("price", axis=1)
y = df["price"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

reg = LinearRegression()
reg.fit(X_train, y_train)
print("R² Score without PCA:", reg.score(X_test, y_test))

pca = PCA(n_components=10) 
X_pca = pca.fit_transform(X_scaled)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.3, random_state=42)

reg_pca = LinearRegression()
reg_pca.fit(X_train_pca, y_train_pca)
print("R² Score with PCA:", reg_pca.score(X_test_pca, y_test_pca))


R² Score without PCA: 0.8732775682086301
R² Score with PCA: 0.8418180342766173
