In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

In [None]:
df = pd.read_csv("USA_Housing.csv") 
df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


In [None]:
# Q1. 
from sklearn.preprocessing import StandardScaler
X = df.drop(columns=['Price']).values   
y = df['Price'].values.reshape(-1, 1)  

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [20]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores = []
betas = []

for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    XTX_inv = np.linalg.inv(X_train.T @ X_train)
    beta = XTX_inv @ X_train.T @ y_train
    
    y_pred = X_test @ beta
    
    r2 = r2_score(y_test, y_pred)
    
    r2_scores.append(r2)
    betas.append(beta)

print("R² Scores:", np.round(r2_scores, 3))
print("Average R²:", np.mean(r2_scores))

R² Scores: [-11.442 -11.828 -11.614 -11.    -10.58 ]
Average R²: -11.292840001228193


In [24]:
best_beta = betas[np.argmax(r2_scores)]
split = int(0.7 * len(X_scaled))

X_train, X_test = X_scaled[:split], X_scaled[split:]
y_train, y_test = y[:split], y[split:]

y_pred_final = X_test @ best_beta
final_r2 = r2_score(y_test, y_pred_final)

print("Best R² (cross-validation):", np.max(r2_scores))
print("Final R² (70/30 test):", round(final_r2, 3))


Best R² (cross-validation): -10.579758582041713
Final R² (70/30 test): -11.281


In [27]:
# Q2.
from sklearn.model_selection import train_test_split

X_temp, X_test, y_temp, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.20, random_state=42)

In [29]:
def gradient_descent(X, y, alpha, iterations=1000):
    m, n = X.shape
    beta = np.zeros((n, 1))
    for _ in range(iterations):
        gradient = (1/m) * (X.T @ (X @ beta - y))
        beta -= alpha * gradient
    return beta

In [35]:
learning_rates = [0.001, 0.01, 0.1, 1]
results = []

for lr in learning_rates:
    beta = gradient_descent(X_train, y_train, alpha=lr)
    
    y_val_pred = X_val @ beta
    y_test_pred = X_test @ beta
    
    val_r2 = r2_score(y_val, y_val_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    results.append((lr, val_r2, test_r2))

results_df = pd.DataFrame(results, columns=['Learning Rate', 'R2_Validation', 'R2_Test'])
print(results_df)

   Learning Rate  R2_Validation    R2_Test
0          0.001     -11.339345 -12.187213
1          0.010     -11.317942 -12.011057
2          0.100     -11.318070 -12.010979
3          1.000     -11.318070 -12.010979


In [39]:
# Q3.
cols = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration", "num_doors", "body_style",
        "drive_wheels", "engine_location", "wheel_base", "length", "width", "height", "curb_weight",
        "engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke", "compression_ratio",
        "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

df = pd.read_csv("imports-85.data.txt", names=cols)
df.replace("?", np.nan, inplace=True)


In [41]:
df = df.apply(pd.to_numeric, errors='ignore')
df = df.dropna(subset=['price'])
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)

  df = df.apply(pd.to_numeric, errors='ignore')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the in

In [None]:
df['num_doors'] = df['num_doors'].replace({'two': 2, 'four': 4})
df['num_cylinders'] = df['num_cylinders'].replace(
    {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12})

df = pd.get_dummies(df, columns=['body_style', 'drive_wheels'], drop_first=True)

from sklearn.preprocessing import LabelEncoder
for col in ['make', 'aspiration', 'engine_location', 'fuel_type']:
    df[col] = LabelEncoder().fit_transform(df[col])

df['fuel_system'] = df['fuel_system'].apply(lambda x: 1 if 'pfi' in str(x) else 0)
df['engine_type'] = df['engine_type'].apply(lambda x: 1 if 'ohc' in str(x) else 0)


KeyError: "None of [Index(['body_style', 'drive_wheels'], dtype='object')] are in the [columns]"

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

X = df.drop(columns=['price'])
y = df['price']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("R² (Original):", r2_score(y_test, y_pred))


R² (Original): 0.8734104772978123


In [57]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)  
X_pca = pca.fit_transform(X_scaled)

X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train)
y_pred_pca = lr_pca.predict(X_test_pca)

print("R² (After PCA):", r2_score(y_test, y_pred_pca))


R² (After PCA): 0.8617116799738689
