<a href="https://colab.research.google.com/github/arpit-devop/machine-learning-assignment/blob/main/ml3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Q1: USA House Price Prediction - 5-fold Cross Validation with Least Squares
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Read the housing table from the Colab working directory.
data = pd.read_csv('/content/USA_Housing.csv')

# 'Price' is the regression target; every remaining column is a predictor.
y = data['Price'].to_numpy()
X = data.drop(columns=['Price']).to_numpy()

# Standardise the predictors. Note the scaler is fitted on the full table,
# i.e. before any fold or train/test partition is made.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# K-Fold Cross Validation: fit ordinary least squares on each training fold
# and score it (R2) on the matching held-out fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores = []
betas = []

for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Prepend a column of ones so beta[0] plays the role of the intercept.
    X_train_ = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
    X_test_ = np.hstack([np.ones((X_test.shape[0], 1)), X_test])
    # Solve min ||X beta - y||^2 directly instead of forming inv(X'X) @ X'y.
    # For a full-rank design the solution is the same, but lstsq does not
    # square the condition number and still returns a (minimum-norm) answer
    # when X'X is singular, where np.linalg.inv would raise or lose precision.
    beta, *_ = np.linalg.lstsq(X_train_, y_train, rcond=None)
    y_pred = X_test_ @ beta
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    betas.append(beta)

# Keep the coefficients from the fold with the highest held-out R2.
best_idx = np.argmax(r2_scores)
best_beta = betas[best_idx]

# Score the best cross-validation coefficients on a 70/30 partition.
# best_beta is reused as-is (it is not refitted on X_tr), so both numbers
# below are evaluations of the fold model rather than of a new fit.
X_tr, X_te, y_tr, y_te = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Design matrices with a leading column of ones for the intercept term.
X_tr_ = np.column_stack([np.ones(len(X_tr)), X_tr])
X_te_ = np.column_stack([np.ones(len(X_te)), X_te])

y_tr_pred = X_tr_.dot(best_beta)
y_te_pred = X_te_.dot(best_beta)

r2_train = r2_score(y_tr, y_tr_pred)
r2_test = r2_score(y_te, y_te_pred)

print("Best R2 (CV):", r2_scores[best_idx])
print("Train R2:", r2_train, "Test R2:", r2_test)

Best R2 (CV): 0.9243869413350316
Train R2: 0.9192672043633922 Test R2: 0.9147458156636434


In [None]:
# Q2: Train/Val/Test split and Gradient Descent for Multiple Linear Regression
from sklearn.model_selection import train_test_split

# Overall target proportions: 56% train, 14% validation, 30% test.
train_frac, val_frac, test_frac = 0.56, 0.14, 0.30

# First peel off the test portion ...
X_temp, X_test, y_temp, y_test = train_test_split(
    X_scaled, y, test_size=test_frac, random_state=42
)

# ... then take the validation set out of what remains. val_size is the
# validation share of the non-test data, not of the whole dataset.
val_size = val_frac / (train_frac + val_frac)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_size, random_state=42
)

def gradient_descent(X, y, lr=0.01, n_iters=1000):
    """Fit linear-regression coefficients by batch gradient descent.

    Each iteration takes one step of size ``lr`` along the negative gradient
    of the mean squared error computed over all rows of ``X``.

    Parameters
    ----------
    X : 2-D array, one row per sample and one column per coefficient.
        No intercept column is added here; include a column of ones in ``X``
        if an intercept is wanted.
    y : 1-D array of targets, one per row of ``X``.
    lr : step size applied to the gradient.
    n_iters : number of update steps to perform.

    Returns
    -------
    1-D array with one coefficient per column of ``X``. The search starts
    from all zeros, so ``n_iters=0`` returns a zero vector.
    """
    n_rows, n_cols = X.shape
    # Constant factor of the MSE gradient: d/d(beta) mean((y - X beta)^2)
    # equals (-2 / n) * X.T @ (y - X beta).
    scale = -2 / n_rows
    coef = np.zeros(n_cols)
    for _step in range(n_iters):
        residual = y - X @ coef
        coef = coef - lr * (scale * (X.T @ residual))
    return coef

# Step sizes to compare; each one is run for the same 1000 iterations.
learning_rates = [0.001, 0.01, 0.1, 1]
r2_val_scores, r2_test_scores, betas = [], [], []

# Prepend a column of ones so the first coefficient acts as the intercept.
X_train_ = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_val_ = np.hstack([np.ones((X_val.shape[0], 1)), X_val])
X_test_ = np.hstack([np.ones((X_test.shape[0], 1)), X_test])


def _finite_r2(y_true, y_hat):
    """Return R2 of ``y_hat`` against ``y_true``, or -inf for a diverged fit.

    A step size that is too large makes gradient descent blow up. The
    predictions then either overflow inside the R2 computation or are
    already inf/NaN, which r2_score rejects with an exception. Both cases are
    mapped to -inf so the sweep finishes and such a rate can never be picked.
    """
    if not np.all(np.isfinite(y_hat)):
        return -np.inf
    with np.errstate(over='ignore', invalid='ignore'):
        score = r2_score(y_true, y_hat)
    return score if np.isfinite(score) else -np.inf


for lr in learning_rates:
    # Overflow during a diverging run is expected for the largest rates;
    # silence the floating-point warnings and let _finite_r2 handle the result.
    with np.errstate(over='ignore', invalid='ignore'):
        beta = gradient_descent(X_train_, y_train, lr, 1000)
        y_val_pred = X_val_ @ beta
        y_test_pred = X_test_ @ beta
    betas.append(beta)
    r2_val = _finite_r2(y_val, y_val_pred)
    r2_test = _finite_r2(y_test, y_test_pred)
    r2_val_scores.append(r2_val)
    r2_test_scores.append(r2_test)

# Model selection uses the validation score only; the test score of the
# selected rate is reported but plays no part in the choice.
best_idx = np.argmax(r2_val_scores)
best_beta = betas[best_idx]

print(f"Best learning rate: {learning_rates[best_idx]}")
print(f"Validation R2: {r2_val_scores[best_idx]}, Test R2: {r2_test_scores[best_idx]}")

Best learning rate: 0.01
Validation R2: 0.909799626728122, Test R2: 0.9147569598865972


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


In [None]:
# Q3: Car Price Prediction - Preprocessing and Multiple Linear Regression

# Column labels passed to read_csv via `names`, in file order.
columns = [
    "symboling", "normalized_losses", "make", "fuel_type",
    "aspiration", "num_doors", "body_style", "drive_wheels",
    "engine_location", "wheel_base", "length", "width",
    "height", "curb_weight", "engine_type", "num_cylinders",
    "engine_size", "fuel_system", "bore", "stroke",
    "compression_ratio", "horsepower", "peak_rpm", "city_mpg",
    "highway_mpg", "price",
]

# The file is fetched over HTTPS on every run; "?" entries are read as NaN.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
car_df = pd.read_csv(url, header=None, names=columns, na_values=["?"])

# Rows without a price have no usable target, so remove them BEFORE imputing.
# Imputing first would fill the missing prices with the median, leaving the
# dropna with nothing to drop and keeping rows whose target was made up.
# .copy() detaches the result so the column assignments below act on an
# independent frame.
car_df = car_df.dropna(subset=['price']).copy()

# Central-tendency imputation for the remaining gaps:
# median for numeric columns, most frequent value for everything else.
for col in car_df.columns:
    if pd.api.types.is_numeric_dtype(car_df[col]):
        car_df[col] = car_df[col].fillna(car_df[col].median())
    else:
        car_df[col] = car_df[col].fillna(car_df[col].mode()[0])

from sklearn.preprocessing import LabelEncoder

# Spelled-out counts -> integers. Series.map is used instead of replace:
# replacing strings with ints in an object column makes pandas emit a
# "downcasting" FutureWarning, and map lets unmapped values be detected.
num_map = {'four':4,'two':2,'three':3,'five':5,'six':6,'eight':8,'twelve':12}
for col in ('num_doors', 'num_cylinders'):
    as_number = car_df[col].map(num_map)
    unknown = car_df.loc[as_number.isna(), col].unique()
    if len(unknown):
        # Fail here with a clear message rather than later as NaN features.
        raise ValueError(f"Unmapped values in {col!r}: {list(unknown)}")
    car_df[col] = as_number.astype(int)

# One-hot encode the two multi-level categoricals directly as 0/1 integers
# (no separate bool -> int pass needed).
car_df = pd.get_dummies(car_df, columns=["body_style", "drive_wheels"], dtype=int)

# Integer codes for the remaining categorical columns.
for col in ["make", "aspiration", "engine_location","fuel_type"]:
    car_df[col] = LabelEncoder().fit_transform(car_df[col])

# Binary flags: 1 when the value contains the given substring, else 0.
car_df['fuel_system'] = (
    car_df['fuel_system'].astype(str).str.contains("pfi", regex=False).astype(int)
)
car_df['engine_type'] = (
    car_df['engine_type'].astype(str).str.contains("ohc", regex=False).astype(int)
)

# Show the column dtypes as they stand just before the float cast below.
print("\nDataFrame dtypes before converting to NumPy array:")
print(car_df.dtypes)

# Cast every column to float so the frame converts to a purely numeric array.
car_df = car_df.astype('float64')

# Per-column count of missing entries after the cast.
print("\nMissing values in DataFrame after converting to float:")
print(car_df.isna().sum())

# Target is 'price'; all other columns are features.
y = car_df['price'].to_numpy()
X = car_df.drop(columns=['price']).to_numpy()

# Standardise the features (scaler fitted on the full dataset).
# Note: this rebinds X_scaled and y, which earlier cells also use.
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

# 70/30 hold-out partition of the scaled car features.
X_tr, X_te, y_tr, y_te = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Baseline: linear regression on every scaled feature.
reg = LinearRegression().fit(X_tr, y_tr)
y_pred_te = reg.predict(X_te)
r2_full = r2_score(y_te, y_pred_te)

# Comparison: project onto at most 10 principal components, learned from the
# training rows only, then fit the same regressor in the reduced space.
n_components = min(X_tr.shape[1], 10)
pca = PCA(n_components=n_components)
X_tr_pca = pca.fit_transform(X_tr)
X_te_pca = pca.transform(X_te)

reg_pca = LinearRegression().fit(X_tr_pca, y_tr)
y_pred_te_pca = reg_pca.predict(X_te_pca)
r2_pca = r2_score(y_te, y_pred_te_pca)

print("Normal Linear Regression Test R2:", r2_full)
print("PCA-based Regression Test R2:", r2_pca)


DataFrame dtypes before converting to NumPy array:
symboling                   int64
normalized_losses         float64
make                        int64
fuel_type                   int64
aspiration                  int64
num_doors                   int64
engine_location             int64
wheel_base                float64
length                    float64
width                     float64
height                    float64
curb_weight                 int64
engine_type                 int64
num_cylinders               int64
engine_size                 int64
fuel_system                 int64
bore                      float64
stroke                    float64
compression_ratio         float64
horsepower                float64
peak_rpm                  float64
city_mpg                    int64
highway_mpg                 int64
price                     float64
body_style_convertible      int64
body_style_hardtop          int64
body_style_hatchback        int64
body_style_sedan            in

  car_df['num_doors'] = car_df['num_doors'].replace(num_map)
  car_df['num_cylinders'] = car_df['num_cylinders'].replace(num_map)
