NAME : BALKARAN SINGH BATCH : 3Q11 ROLL NO : 102317009

ASSIGNMENT - 3

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
import requests

Q1. K-Fold Cross Validation for Multiple Linear Regression (Least Square Error Fit)

In [None]:
url1 = "https://drive.google.com/uc?export=download&id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX"
house_data = pd.read_csv(url1)

X = house_data.drop(columns=["Price"]).values
y = house_data["Price"].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_beta = None
best_r2 = -np.inf

fold = 1
for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    beta = np.linalg.inv(X_train.T @ X_train) @ (X_train.T @ y_train)

    y_pred = X_test @ beta
    r2 = r2_score(y_test, y_pred)

    print(f"Fold {fold} R2 Score: {r2:.4f}")
    fold += 1

    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

print("\nBest R2 Score from CV:", best_r2)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
beta_final = np.linalg.inv(X_train.T @ X_train) @ (X_train.T @ y_train)

y_pred_final = X_test @ beta_final
print("Final R2 Score (70/30 split):", r2_score(y_test, y_pred_final))

Fold 1 R2 Score: -11.4420
Fold 2 R2 Score: -11.8285
Fold 3 R2 Score: -11.6144
Fold 4 R2 Score: -10.9997
Fold 5 R2 Score: -10.5798

Best R2 Score from CV: -10.579758582041713
Final R2 Score (70/30 split): -12.003903385562284


Q2. Concept of Validation set for Multiple Linear Regression (Gradient Descent
Optimization)

In [None]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

def gradient_descent(X, y, lr, iterations=1000):
    m, n = X.shape
    beta = np.zeros(n)
    for _ in range(iterations):
        gradient = -(2/m) * (X.T @ (y - X @ beta))
        beta -= lr * gradient
    return beta

learning_rates = [0.001, 0.01, 0.1, 1]
best_beta = None
best_val_r2 = -np.inf

for lr in learning_rates:
    beta = gradient_descent(X_train, y_train, lr=lr, iterations=1000)
    y_val_pred = X_val @ beta
    y_test_pred = X_test @ beta

    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)

    print(f"\nLearning Rate {lr}:")
    print("Validation R2:", r2_val)
    print("Test R2:", r2_test)

    if r2_val > best_val_r2:
        best_val_r2 = r2_val
        best_beta = beta

print("\nBest Beta (from validation) coefficients:\n", best_beta)


Learning Rate 0.001:
Validation R2: -11.309840188168787
Test R2: -12.05506738296265

Learning Rate 0.01:
Validation R2: -11.318069140536533
Test R2: -12.010979529759709

Learning Rate 0.1:
Validation R2: -11.31806970348623
Test R2: -12.010979232167713

Learning Rate 1:
Validation R2: -inf
Test R2: -inf

Best Beta (from validation) coefficients:
 [196011.02066971 138171.30668778 117004.69583016  39664.25921488
 116553.34299286]


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


Q3. Pre-processing and Multiple Linear Regression

In [None]:
url2 = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

columns = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration", "num_doors",
           "body_style", "drive_wheels", "engine_location", "wheel_base", "length", "width",
           "height", "curb_weight", "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm", "city_mpg",
           "highway_mpg", "price"]

car_data = pd.read_csv(url2, names=columns)

car_data = car_data.replace("?", np.nan)
car_data = car_data.dropna(subset=["price"])

numeric_cols = ["symboling","normalized_losses","wheel_base","length","width","height",
                "curb_weight","engine_size","bore","stroke","compression_ratio","horsepower",
                "peak_rpm","city_mpg","highway_mpg","price"]

for col in numeric_cols:
    car_data[col] = pd.to_numeric(car_data[col], errors="coerce")

car_data[numeric_cols] = car_data[numeric_cols].fillna(car_data[numeric_cols].median())

car_data["num_doors"] = car_data["num_doors"].replace({"two": 2, "four": 4}).astype(float)
car_data["num_cylinders"] = car_data["num_cylinders"].replace(
    {"two": 2, "three": 3, "four": 4, "five": 5, "six": 6,
     "eight": 8, "twelve": 12}).astype(float)

car_data["num_doors"].fillna(car_data["num_doors"].median(), inplace=True)
car_data["num_cylinders"].fillna(car_data["num_cylinders"].median(), inplace=True)

car_data = pd.get_dummies(car_data, columns=["body_style", "drive_wheels"], drop_first=True)

label_cols = ["make", "aspiration", "engine_location", "fuel_type"]
for col in label_cols:
    le = LabelEncoder()
    car_data[col] = le.fit_transform(car_data[col].astype(str))

car_data["fuel_system"] = car_data["fuel_system"].apply(lambda x: 1 if pd.notna(x) and "pfi" in str(x) else 0)
car_data["engine_type"] = car_data["engine_type"].apply(lambda x: 1 if pd.notna(x) and "ohc" in str(x) else 0)

X_car = car_data.drop(columns=["price"])
y_car = car_data["price"].astype(float)

scaler = StandardScaler()
X_car_scaled = scaler.fit_transform(X_car)

X_train, X_test, y_train, y_test = train_test_split(X_car_scaled, y_car, test_size=0.3, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("\nCar Price Prediction R2 Score:", r2_score(y_test, y_pred))

pca = PCA(n_components=10)
X_car_pca = pca.fit_transform(X_car_scaled)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_car_pca, y_car, test_size=0.3, random_state=42)

lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train_pca)
y_pred_pca = lr_pca.predict(X_test_pca)
print("Car Price Prediction with PCA R2 Score:", r2_score(y_test_pca, y_pred_pca))



Car Price Prediction R2 Score: 0.8734104772978124
Car Price Prediction with PCA R2 Score: 0.8436522203644288


  car_data["num_doors"] = car_data["num_doors"].replace({"two": 2, "four": 4}).astype(float)
  car_data["num_cylinders"] = car_data["num_cylinders"].replace(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car_data["num_doors"].fillna(car_data["num_doors"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


 