## Q1

In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split

### Step a: Load dataset

In [22]:
df = pd.read_csv(f"D:/Courses/Machine Learning/Class Assignment/USA_Housing.csv")
X = df.drop("Price", axis=1).values
y = df["Price"].values.reshape(-1, 1)

### Step b: Scale the input features

In [23]:
scaler = StandardScaler()     
X_scaled = scaler.fit_transform(X)

### Step c: 5-fold cross-validation setup

In [24]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_beta = None
best_r2 = -np.inf    
r2_scores = []

### Step d: Perform 5-fold CV

In [25]:
for fold, (train_idx, test_idx) in enumerate(kf.split(X_scaled)):      #kf.split(X_scaled) generates 5 splits (since we used KFold(n_splits=5)).
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Add bias column of ones for intercept
    X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

    # Compute beta using Least Squares: β = (XᵀX)^(-1) Xᵀy
    beta = np.linalg.inv(X_train_bias.T @ X_train_bias) @ (X_train_bias.T @ y_train)
     # Predictions
    y_pred = X_test_bias @ beta

    # R2 score
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    print(f"Fold {fold+1}: R2 Score = {r2:.4f}")

    # Track best beta
    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

print("\nAverage R2 Score across 5 folds:", np.mean(r2_scores))
print("Best R2 Score:", best_r2)

Fold 1: R2 Score = 0.9180
Fold 2: R2 Score = 0.9146
Fold 3: R2 Score = 0.9116
Fold 4: R2 Score = 0.9193
Fold 5: R2 Score = 0.9244

Average R2 Score across 5 folds: 0.9175745431092714
Best R2 Score: 0.9243869413350316


#### Step e: Train on 70% using best beta and test on 30%

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

# Train using best beta on 70% data
y_train_pred = X_train_bias @ best_beta
y_test_pred = X_test_bias @ best_beta

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\nFinal Model Performance using Best Beta:")
print(f"Train R2 Score: {train_r2:.4f}")
print(f"Test R2 Score: {test_r2:.4f}")


Final Model Performance using Best Beta:
Train R2 Score: 0.9193
Test R2 Score: 0.9147


## Q2

In [4]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import pandas as pd

# Load dataset
df = pd.read_csv("USA_Housing.csv")
X = df.drop("Price", axis=1).values
y = df["Price"].values

# Split dataset (56% train, 14% validation, 30% test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.44, random_state=42
)

# Now split temp into validation (14%) and test (30%)
val_size = 14 / (14 + 30)  # proportion of validation in remaining 44%
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=1 - val_size, random_state=42
)

# Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Train with different learning rates
learning_rates = [0.001, 0.01, 0.1, 1]
results = {}

for lr in learning_rates:
    model = SGDRegressor(
        learning_rate="constant", eta0=lr, max_iter=1000, tol=None, random_state=42
    )
    model.fit(X_train, y_train)
    
    # Predictions
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
    
    # R² scores
    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)
    
    results[lr] = (r2_val, r2_test)

print("\n--- R² Scores for Different Learning Rates ---")
for lr, (r2_val, r2_test) in results.items():
    print(f"Learning Rate = {lr} | Validation R² = {r2_val:.4f} | Test R² = {r2_test:.4f}")

# Best learning rate based on validation score
best_lr = max(results, key=lambda k: results[k][0])
print(f"\nBest Learning Rate: {best_lr}")
print(f"Best Validation R²: {results[best_lr][0]:.4f}")
print(f"Test R² (with best model): {results[best_lr][1]:.4f}")


--- R² Scores for Different Learning Rates ---
Learning Rate = 0.001 | Validation R² = 0.9195 | Test R² = 0.9135
Learning Rate = 0.01 | Validation R² = 0.9161 | Test R² = 0.9105
Learning Rate = 0.1 | Validation R² = 0.9058 | Test R² = 0.8993
Learning Rate = 1 | Validation R² = -89112476993756.3906 | Test R² = -85051867046715.2500

Best Learning Rate: 0.001
Best Validation R²: 0.9195
Test R² (with best model): 0.9135


## Q3

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.decomposition import PCA

# Step 1: Load dataset
columns = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system", "bore",
           "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(url, names=columns)
df
# Replace "?" with NaN
df = df.mask(df == "?", np.nan)
#df

# Step 2: Replace NaN with central tendency (mean for numeric, mode for categorical)
for col in df.columns:
    # try to treat column as numeric first (safe conversion)
    col_num = pd.to_numeric(df[col], errors='coerce')  # non-convertible -> NaN

    # if enough values converted to numeric, treat as numeric; otherwise categorical
    if col_num.notna().sum() >= (len(df) / 2):   # heuristic: majority numeric
        mean_val = col_num.mean()
        df[col] = col_num.fillna(mean_val)      # assign back numeric series
    else:
        # categorical: use mode if available, else leave NaN
        modes = df[col].mode(dropna=True)
        if not modes.empty:
            df[col] = df[col].fillna(modes[0])
        else:
            df[col] = df[col]  # nothing to do; leave as-is

# Drop rows with NaN in price
df = df[df["price"].notna()]
#df
# Step 3: Encoding categorical features
# (i) num_doors & num_cylinders: convert text to numbers
num_map = {
    "two": 2, "three": 3, "four": 4, "five": 5, "six": 6,
    "eight": 8, "twelve": 12
}
df["num_doors"] = df["num_doors"].replace(num_map).astype("Int64")   # nullable integer
df["num_cylinders"] = df["num_cylinders"].replace(num_map).astype("Int64")

# (ii) body_style & drive_wheels: Dummy Encoding
df = pd.get_dummies(df, columns=["body_style", "drive_wheels"], drop_first=True)

# (iii) make, aspiration, engine_location, fuel_type: Label Encoding
label_cols = ["make", "aspiration", "engine_location", "fuel_type"]
le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])

# (iv) fuel_system: pfi → 1, else 0
df["fuel_system"] = df["fuel_system"].apply(lambda x: 1 if "pfi" in x else 0)

# (v) engine_type: ohc → 1, else 0
df["engine_type"] = df["engine_type"].apply(lambda x: 1 if "ohc" in x else 0)

# Convert all numeric columns properly
df = df.apply(pd.to_numeric)

# Step 4: Divide features (X) and target (y)
X = df.drop("price", axis=1)
y = df["price"]

# Scale input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 5: Train/Test split and Linear Regression
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
print("Linear Regression Performance:")
print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

# Step 6: PCA + Linear Regression
pca = PCA(n_components=0.95)  # retain 95% variance
X_pca = pca.fit_transform(X_scaled)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca, y, test_size=0.3, random_state=42
)

lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train_pca)

y_pred_pca = lr_pca.predict(X_test_pca)
print("\nLinear Regression with PCA Performance:")
print("R2 Score:", r2_score(y_test_pca, y_pred_pca))
print("MSE:", mean_squared_error(y_test_pca, y_pred_pca))

Linear Regression Performance:
R2 Score: 0.8044422435762588
MSE: 13422229.591732565

Linear Regression with PCA Performance:
R2 Score: 0.7500675882701553
MSE: 17154268.253029242


  df["num_doors"] = df["num_doors"].replace(num_map).astype("Int64")   # nullable integer
  df["num_cylinders"] = df["num_cylinders"].replace(num_map).astype("Int64")
