# Problem Set 2
## Aakanksha Dutta & Aabha Pandit

# q3



In [10]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Load dataset
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
Y = data.target


features = ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude"]
X = X[features]

# 0-1 normalization
X = (X - X.min()) / (X.max() - X.min())
Y = (Y - Y.min()) / (Y.max() - Y.min())


X = X.to_numpy()
Y = Y.reshape(-1, 1)

# bias terms
X = np.c_[np.ones(X.shape[0]), X]


X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=265)


def stochastic_gradient_descent(X, Y, learning_rate=0.01, epochs=1000):
    m, n = X.shape
    weights = np.zeros((n, 1))  # Initialize weights
    
    for epoch in range(epochs):
        for i in range(m):
            idx = np.random.randint(0, m)  
            x_i = X[idx].reshape(1, -1)
            y_i = Y[idx].reshape(1, -1)
            
            prediction = np.dot(x_i, weights)
            error = prediction - y_i
            
            # Update weights using SGD update rule
            weights -= learning_rate * x_i.T @ error
    
    return weights

In [11]:

# Train the model
learning_rate = 0.01
epochs = 1000
weights = stochastic_gradient_descent(X_train, Y_train, learning_rate, epochs)

# Predict function
def predict(X, weights):
    return np.dot(X, weights)

# Compute Mean Squared Error (MSE)
def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

# Evaluate the model
y_train_pred = predict(X_train, weights)
y_test_pred = predict(X_test, weights)

mse_train = mse(Y_train, y_train_pred)
mse_test = mse(Y_test, y_test_pred)

print(f"Training MSE: {mse_train}")
print(f"Testing MSE: {mse_test}")
print(f"Learned Weights: {weights.ravel()}")


Training MSE: 0.02204339920941451
Testing MSE: 0.023050721944464954
Learned Weights: [ 0.73996337  1.30962198  0.10408374 -3.01055051  4.72020398 -0.06997644
 -0.82964972 -0.82005066 -0.91684462]


Important Factors:
* population (+4.7874) → strong positive effect.
* average bedrooms per dwelling (-2.9529) → more bedrooms seem to lower house prices, likely due to smaller rooms.
* house age (+1.3161) → older houses have higher prices, possibly due to location.
* median income (+0.7355) → wealthier areas have more expensive homes.
* latitude & longitude (-0.83, -0.82) → location matters significantly.

_training mse_ = 0.0220, _testing mse_ = 0.0231
* since the training and testing mse values are very close, this means our model generalizes well.
* the small mse values suggest that our model is making relatively accurate predictions.

# q4


In [12]:
from sklearn.linear_model import SGDRegressor

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=265)
X_train = np.c_[np.ones(X_train.shape[0]), X_train]
X_test = np.c_[np.ones(X_test.shape[0]), X_test]

# Hyperparameters 
learning_rate = 0.01
num_epochs = 1000
n_samples, n_features = X_train.shape 
weights = np.random.randn(n_features, 1)

# Using Scikit-Learn's SGDRegressor
sgd_reg = SGDRegressor(loss='squared_error', learning_rate='constant', eta0=learning_rate, max_iter=num_epochs, random_state=265)
sgd_reg.fit(X_train, y_train.ravel())

# Compute MSE
train_mse_sklearn = np.mean((sgd_reg.predict(X_train) - y_train.ravel()) ** 2)
test_mse_sklearn = np.mean((sgd_reg.predict(X_test) - y_test.ravel()) ** 2)

print("\nSGDRegressor Results:")
print(f"Training MSE: {train_mse_sklearn}")
print(f"Testing MSE: {test_mse_sklearn}")
print(f"Learned Weights: {np.hstack((sgd_reg.intercept_.reshape(-1), sgd_reg.coef_))}")




SGDRegressor Results:
Training MSE: 0.022833223249611876
Testing MSE: 0.023289385971676832
Learned Weights: [ 0.25506976  0.23673588  0.23673588  1.13991477  0.11819594  0.07999486
  0.15911785 -0.01362413 -0.07618674 -0.77718429 -0.85884423]


Interpreting the Weights:
AveBedrms (4.787) has the strongest positive impact on house prices, meaning more bedrooms per household correlate with higher prices.
HouseAge (1.316) also has a strong positive effect, meaning older houses might be in established neighborhoods with higher demand.
Population (-2.952) has the strongest negative impact, indicating that densely populated areas tend to have lower house prices.
Longitude (-0.904) and Latitude (-0.820) have negative coefficients, suggesting a geographical pricing pattern (e.g., inland locations may be less expensive).

This is a very similar result as our custom Stochastic Gradient Descent. 

# Q 5


In [None]:
import pandas as pd
import numpy as np 
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import load_breast_cancer

In [None]:
# function to calcuate cov of each entry
def cov(a,b):
    a_mean = a.mean()
    b_mean = b.mean()
    n = len(a) #number of rows to divide by
    return sum((a - a_mean) * (b - b_mean))/n #no need to transpose since each Mij being calculated

In [None]:
# creating the matrix

def covMatix(data):
    #collect number of rows and cols in data
    rows, col = data.shape

    #shape is feature x feature so initilising zero matrix
    matrix = np.zeros((col,col))

    for i in range(col):
        for j in range(col):
            matrix[i][j] = cov(data[:,i], data[:,j])

    return matrix

In [None]:
# Load the California Housing dataset
housing = fetch_california_housing(as_frame=True)
housing_df = housing.frame #for .cov() to check
data = housing_df.values
print(housing_df.head())

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


In [None]:
# Compute the covariance matrix
cov_matrix = covMatix(data)

# Print the covariance matrix
print("Variance-Covariance Matrix:")
print(cov_matrix)

Variance-Covariance Matrix:
[[ 3.60914769e+00 -2.84600238e+00  1.53649356e+00 -5.58548886e-02
   1.04004753e+01  3.70270956e-01 -3.23844062e-01 -5.77619034e-02
   1.50840174e+00]
 [-2.84600238e+00  1.58388586e+02 -4.77265121e+00 -4.63695945e-01
  -4.22206601e+03  1.72421442e+00  3.00330956e-01 -2.72811148e+00
   1.53391369e+00]
 [ 1.53649356e+00 -4.77265121e+00  6.12123614e+00  9.93819648e-01
  -2.02323909e+02 -1.24682825e-01  5.62208233e-01 -1.36511756e-01
   4.33804618e-01]
 [-5.58548886e-02 -4.63695945e-01  9.93819648e-01  2.24580619e-01
  -3.55255040e+01 -3.04227797e-02  7.05718662e-02  1.26698232e-02
  -2.55379820e-02]
 [ 1.04004753e+01 -4.22206601e+03 -2.02323909e+02 -3.55255040e+01
   1.28240832e+06  8.21672190e+02 -2.63125065e+02  2.26366871e+02
  -3.22109266e+01]
 [ 3.70270956e-01  1.72421442e+00 -1.24682825e-01 -3.04227797e-02
   8.21672190e+02  1.07864799e+02  5.24890984e-02  5.15162217e-02
  -2.84480199e-01]
 [-3.23844062e-01  3.00330956e-01  5.62208233e-01  7.05718662e-02


In [None]:
# checking using pandas cov function
covMatrix = housing_df.cov()
print(covMatrix)

                MedInc     HouseAge    AveRooms  AveBedrms    Population  \
MedInc        3.609323    -2.846140    1.536568  -0.055858  1.040098e+01   
HouseAge     -2.846140   158.396260   -4.772882  -0.463718 -4.222271e+03   
AveRooms      1.536568    -4.772882    6.121533   0.993868 -2.023337e+02   
AveBedrms    -0.055858    -0.463718    0.993868   0.224592 -3.552723e+01   
Population   10.400979 -4222.270582 -202.333712 -35.527225  1.282470e+06   
AveOccup      0.370289     1.724298   -0.124689  -0.030424  8.217120e+02   
Latitude     -0.323860     0.300346    0.562235   0.070575 -2.631378e+02   
Longitude    -0.057765    -2.728244   -0.136518   0.012670  2.263778e+02   
MedHouseVal   1.508475     1.533988    0.433826  -0.025539 -3.221249e+01   

               AveOccup    Latitude   Longitude  MedHouseVal  
MedInc         0.370289   -0.323860   -0.057765     1.508475  
HouseAge       1.724298    0.300346   -2.728244     1.533988  
AveRooms      -0.124689    0.562235   -0.136518   

#### Question 7

In [None]:
#sigmoid function
def sigmoid_f(z):
    return 1 / (1 + np.exp(-z))

In [None]:
#hypothesis
def classifer_f(X, theta):
    return sigmoid_f(np.dot(X, theta))

In [None]:
#entropy
def binary_loss(y, y_pred): #y_pred = classifer_f??
    m = len(y)
    return - (1 / m) * np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))


In [None]:
# stochastic gradient descent 
def gradient_f(X, y, theta, alpha, iterations):
    m = len(y)

    for i in range(iterations):
        y_pred = classifer_f(X, theta)
        gradient = np.dot(X.T, (y_pred - y)) / m
        theta -= alpha * gradient
    return theta

#### Question 8

In [None]:
# optimizer function

def optimizer_f(X, y, alpha, iterations):
    
    X = np.c_[np.ones(X.shape[0]), X] #col of 1s for intercept (for matrix multiplication)
    theta = np.zeros(X.shape[1]) #intialize theta vector
    
    # run gradient descent
    theta_opt = gradient_f(X, y, theta, alpha, iterations)
    
    return theta_opt, X, y

In [None]:
#running logit regression 

data = load_breast_cancer()
#print(data.data.shape), output was (569,30)

y = data.target 
X = data.data #other numerical features 

#normalising
y = (y - y.min()) / (y.max() - y.min())

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

#logit regression
theta_opt, X_norm, y_norm = optimizer_f(X, y, alpha=0.1, iterations=10000) #using max of 10000 iterations

#final equation
feature_names = ["Intercept"] + list(data.feature_names)
equation = " + ".join(f"{theta_opt[i]:.4f}*{feature_names[i]}" for i in range(len(theta_opt)))

#ranking coefficients from pos to neg
coef_ranking = sorted(zip(feature_names, theta_opt), key=lambda x: x[1], reverse=True)


print(f"\nFinal Logistic Regression Equation:\n{equation}\n")
print("Feature Coefficients (Ranked from Positive to Negative):")
for feature, coef in coef_ranking:
    print(f"{feature}: {coef:.4f}")



Final Logistic Regression Equation:
9.7400*Intercept + -1.2493*mean radius + -1.9910*mean texture + -1.3578*mean perimeter + -1.8667*mean area + 0.3557*mean smoothness + -0.4573*mean compactness + -2.9788*mean concavity + -4.1637*mean concave points + 0.3137*mean symmetry + 2.6421*mean fractal dimension + -2.4517*radius error + 0.3583*texture error + -1.8514*perimeter error + -1.7106*area error + 0.3479*smoothness error + 1.5651*compactness error + 0.7751*concavity error + 0.3495*concave points error + 0.8047*symmetry error + 1.3182*fractal dimension error + -3.0011*worst radius + -3.1849*worst texture + -2.7465*worst perimeter + -2.7538*worst area + -1.6942*worst smoothness + -1.0459*worst compactness + -2.0364*worst concavity + -3.9194*worst concave points + -1.6949*worst symmetry + -0.2300*worst fractal dimension

Feature Coefficients (Ranked from Positive to Negative):
Intercept: 9.7400
mean fractal dimension: 2.6421
compactness error: 1.5651
fractal dimension error: 1.3182
symmet

##### Final Logistic Regression Equation: 
9.7400*(Intercept) + -1.2493*(mean radius) + -1.9910*(mean texture) + -1.3578*(mean perimeter) + -1.8667*(mean area) + 0.3557*(mean smoothness) + -0.4573*(mean compactness) + -2.9788*(mean concavity) + -4.1637*(mean concave points) + 0.3137*(mean symmetry) + 2.6421*(mean fractal dimension) + -2.4517*(radius error) + 0.3583*(texture error) + -1.8514*(perimeter) error + -1.7106*(area error) + 0.3479*(smoothness error) + 1.5651*(compactness error) + 0.7751*(concavity error) + 0.3495*(concave points error) + 0.8047*(symmetry error) + 1.3182*(fractal dimension error) + -3.0011*(worst radius) + -3.1849*(worst texture) + -2.7465*(worst perimeter) + -2.7538*(worst area) + -1.6942*(worst smoothness) + -1.0459*(worst compactness) + -2.0364*(worst concavity) + -3.9194*(worst concave points) + -1.6949*(worst symmetry) + -0.2300*(worst fractal dimension)

#### Feature Coefficients (Ranked from Positive to Negative):

Intercept: 9.7400

mean fractal dimension: 2.6421

compactness error: 1.5651

fractal dimension error: 1.3182

symmetry error: 0.8047

concavity error: 0.7751

texture error: 0.3583

mean smoothness: 0.3557

concave points error: 0.3495

smoothness error: 0.3479

mean symmetry: 0.3137

worst fractal dimension: -0.2300

mean compactness: -0.4573

worst compactness: -1.0459

mean radius: -1.2493

mean perimeter: -1.3578

worst smoothness: -1.6942

worst symmetry: -1.6949

area error: -1.7106

perimeter error: -1.8514

mean area: -1.8667

mean texture: -1.9910

worst concavity: -2.0364

radius error: -2.4517

worst perimeter: -2.7465

worst area: -2.7538

mean concavity: -2.9788

worst radius: -3.0011

worst texture: -3.1849

worst concave points: -3.9194

mean concave points: -4.1637


# q10 


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut
from sklearn.preprocessing import KBinsDiscretizer
model = LinearRegression()
random_seed = 265
# K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)
kf_mse = cross_val_score(model, X, Y, cv=kf, scoring='neg_mean_squared_error')

# Leave-One-Out Cross Validation
loo = LeaveOneOut()
loo_mse = cross_val_score(model, X, Y, cv=loo, scoring='neg_mean_squared_error')


# Print MSE for each strategy
print(f'K-Fold CV MSE: {-kf_mse.mean()}')
print(f'Leave-One-Out CV MSE: {-loo_mse.mean()}')




K-Fold CV MSE: 0.022460523225369913
Leave-One-Out CV MSE: 0.022456875235560322


In [19]:
# train-test split cross-validation (repeat 10 times for stability)
from sklearn.metrics import mean_squared_error
num_splits = 10
test_size = 0.2

mse_scores = []

for _ in range(num_splits):
    # split data into training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=random_seed)

    # train the model
    model.fit(X_train, Y_train)

    # predict on the test set
    Y_pred = model.predict(X_test)

    # calculate MSE
    tt_mse = mean_squared_error(Y_test, Y_pred)
    mse_scores.append(tt_mse)

# average MSE across splits
train_test_split_mse = np.mean(mse_scores)
print(f'Train-Test Split (avg over {num_splits} runs) MSE: {train_test_split_mse:.6f}')


Train-Test Split (avg over 10 runs) MSE: 0.023056


In [21]:
model.fit(X, Y)
y_pred = model.predict(X)
final_mse = mean_squared_error(Y, y_pred)
print(f'Final model trained on full data MSE: {final_mse:.6f}')

Final model trained on full data MSE: 0.022290
