In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv("USA_Housing.csv")

# Split into input features and output variable
X = data.drop("Price", axis=1).values
y = data["Price"].values

# Scale karengee input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Function to calculate beta using Least Squares
def least_squares_beta(X_train, y_train):
    # Beta = (X^T * X)^-1 * X^T * y
    return np.linalg.inv(X_train.T @ X_train) @ X_train.T @ y_train

# 5 fold cross validation
k = 5
n = X_scaled.shape[0]
fold_size = n // k
r2_scores = []
betas = []

for i in range(k):
    start = i * fold_size
    end = start + fold_size if i != k-1 else n

    X_test = X_scaled[start:end]
    y_test = y[start:end]

    X_train = np.concatenate((X_scaled[:start], X_scaled[end:]), axis=0)
    y_train = np.concatenate((y[:start], y[end:]), axis=0)

    beta = least_squares_beta(X_train, y_train)
    y_pred = X_test @ beta
    score = r2_score(y_test, y_pred)

    betas.append(beta)
    r2_scores.append(score)

# best beta on maximum R2 score
best_index = np.argmax(r2_scores)
best_beta = betas[best_index]
print("R2 Scores for each fold:", r2_scores)
print("Best Beta matrix:", best_beta)

# Train on 70% of data using best beta
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)
# Predicted values using best beta
y_pred_final = X_test_final @ best_beta
r2_final = r2_score(y_test_final, y_pred_final)
print("R2 Score on 30% test data:", r2_final)


R2 Scores for each fold: [-11.566262890130577, -10.327833022754156, -12.218871237545434, -10.951296474620897, -11.593510618998712]
Best Beta matrix: [231008.84527364 174703.35249181 121834.54293259  -2872.53491108
 152806.89864888]
R2 Score on 30% test data: -11.983806002911246


In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Load karo dataset
data = pd.read_csv("USA_Housing.csv")

# Split into input features/  output variable
X = data.drop("Price", axis=1).values
y = data["Price"].values.reshape(-1, 1)

# Scale input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Add bias term to X
X_scaled = np.hstack((np.ones((X_scaled.shape[0], 1)), X_scaled))

# Split data: 56% train 14 validation 30 test
n = X_scaled.shape[0]
train_end = int(0.56 * n)
val_end = int(0.70 * n)

X_train, y_train = X_scaled[:train_end], y[:train_end]
X_val, y_val = X_scaled[train_end:val_end], y[train_end:val_end]
X_test, y_test = X_scaled[val_end:], y[val_end:]

# Gradient Descent
def gradient_descent(X, y, lr, iterations):
    m, n = X.shape
    beta = np.zeros((n, 1))
    for i in range(iterations):
        gradients = (1/m) * X.T @ (X @ beta - y)
        beta = beta - lr * gradients
    return beta

# Learning rates to test
learning_rates = [0.001, 0.01, 0.1, 1]
iterations = 1000

best_beta = None
best_r2 = -np.inf

# Train for learning rate and check validation R2
for lr in learning_rates:
    beta = gradient_descent(X_train, y_train, lr, iterations)
    y_val_pred = X_val @ beta
    r2 = r2_score(y_val, y_val_pred)
    print(f"Learning Rate: {lr}, Validation R2: {r2}")
    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

# Evaluate best beta on test set
y_test_pred = X_test @ best_beta
r2_test = r2_score(y_test, y_test_pred)
print("Best Beta coefficients:\n", best_beta)
print("R2 Score on Test Set:", r2_test)


Learning Rate: 0.001, Validation R2: -0.9353469873109566
Learning Rate: 0.01, Validation R2: 0.9150931093041854
Learning Rate: 0.1, Validation R2: 0.9151040123364315
Learning Rate: 1, Validation R2: 0.9151040123364313
Best Beta coefficients:
 [[ 1.23244775e+06]
 [ 2.31682635e+05]
 [ 1.63635272e+05]
 [ 1.19025219e+05]
 [-2.74956842e+02]
 [ 1.50705906e+05]]
R2 Score on Test Set: 0.917477081644098


In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

# Load dataset with column names
columns = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration","num_doors",
           "body_style", "drive_wheels", "engine_location", "wheel_base", "length", "width",
           "height", "curb_weight", "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm", "city_mpg",
           "highway_mpg", "price"]

data = pd.read_csv("imports-85.data.txt", names=columns, na_values="?")

# Central tendency
for col in data.columns:
    if data[col].dtype == "object":
        data[col] = data[col].fillna(data[col].mode()[0])
    else:
        data[col] = data[col].fillna(data[col].median())

# Drop rows with NaN in price
data = data.dropna(subset=['price'])
data['price'] = data['price'].astype(float)

# Convert word numbers to numeric
num_map = {"two": 2, "three": 3, "four": 4, "five": 5, "six": 6, "eight": 8, "twelve": 12}
data['num_doors'] = data['num_doors'].map(num_map)
data['num_cylinders'] = data['num_cylinders'].map(num_map)

# Dummy encoding for body_style and drive_wheels
data = pd.get_dummies(data, columns=['body_style', 'drive_wheels'], drop_first=True)

# Label encoding for make, aspiration, engine_location, fuel_type
le = LabelEncoder()
for col in ['make', 'aspiration', 'engine_location', 'fuel_type']:
    data[col] = le.fit_transform(data[col])

# Custom encoding for fuel_system and engine_type
data['fuel_system'] = data['fuel_system'].apply(lambda x: 1 if 'pfi' in x.lower() else 0)
data['engine_type'] = data['engine_type'].apply(lambda x: 1 if 'ohc' in x.lower() else 0)

# Split karo features and target
X = data.drop('price', axis=1).values
y = data['price'].values

# Scalingggg the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split 70% train/ 30% test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Linear Regression mein train
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("R2 Score without PCA:", r2_score(y_test, y_pred))

# Apply karna hai PCA dimensionality reduction
pca = PCA(n_components=0.95)  # preserve 95% variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train Linear Regression on reduced features
lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train)
y_pred_pca = lr_pca.predict(X_test_pca)
print("R2 Score with PCA:", r2_score(y_test, y_pred_pca))


R2 Score without PCA: 0.7962231220908701
R2 Score with PCA: 0.751700022926359
