#### Question 5

In [1]:
import pandas as pd
import numpy as np 
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import load_breast_cancer

In [2]:
# function to calculate the covariance of one pair of columns (one matrix entry)
def cov(a, b, ddof=0):
    """Return the covariance between two equal-length 1-D samples.

    Parameters
    ----------
    a, b : array-like
        Observations of the two variables; both must be 1-D and of equal length.
    ddof : int, default 0
        Delta degrees of freedom. The divisor is ``n - ddof``: ``0`` keeps the
        population covariance this function has always returned, ``1`` gives
        the unbiased sample covariance.

    Returns
    -------
    float
        ``sum((a - mean(a)) * (b - mean(b))) / (n - ddof)``.

    Raises
    ------
    ValueError
        If the inputs are not 1-D, differ in length, or ``n - ddof <= 0``.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if a.ndim != 1 or a.shape != b.shape:
        raise ValueError("cov expects two 1-D inputs of the same length")

    n = a.size  # number of rows
    if n - ddof <= 0:
        raise ValueError("not enough observations for the requested ddof")

    # np.sum runs the reduction in C; the builtin sum() iterates element by element
    return np.sum((a - a.mean()) * (b - b.mean())) / (n - ddof)

In [3]:
# creating the matrix

def covMatix(data, ddof=0):
    """Return the feature-by-feature variance-covariance matrix of ``data``.

    Parameters
    ----------
    data : array-like of shape (rows, cols)
        One observation per row and one feature per column. Anything that
        ``np.asarray`` can turn into a 2-D numeric array is accepted
        (ndarray, nested lists, DataFrame).
    ddof : int, default 0
        Delta degrees of freedom; every entry is divided by ``rows - ddof``.
        The default keeps the population (divide-by-n) result.

    Returns
    -------
    numpy.ndarray of shape (cols, cols)
        Entry ``[i, j]`` is the covariance between column ``i`` and column ``j``.

    Raises
    ------
    ValueError
        If ``data`` is not 2-D or ``rows - ddof <= 0``.
    """
    values = np.asarray(data, dtype=float)
    if values.ndim != 2:
        raise ValueError("covMatix expects a 2-D array (rows x features)")

    # collect number of rows in data; the result is feature x feature
    rows = values.shape[0]
    if rows - ddof <= 0:
        raise ValueError("not enough rows for the requested ddof")

    # Centre every column once, then a single matrix product yields all
    # pairwise sums of products at the same time (no Python-level loops).
    centered = values - values.mean(axis=0)
    return centered.T @ centered / (rows - ddof)

In [4]:
# Fetch the California Housing dataset with a DataFrame attached
housing = fetch_california_housing(as_frame=True)
housing_df = housing.frame  # DataFrame kept so .cov() can serve as a cross-check
data = housing_df.to_numpy()  # plain array handed to the hand-written covariance code
print(housing_df.head())

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


In [5]:
# Build the variance-covariance matrix with the hand-written routine
cov_matrix = covMatix(data)

# Show the matrix under its heading (heading and matrix on separate lines)
print("Variance-Covariance Matrix:", cov_matrix, sep="\n")

Variance-Covariance Matrix:
[[ 3.60914769e+00 -2.84600238e+00  1.53649356e+00 -5.58548886e-02
   1.04004753e+01  3.70270956e-01 -3.23844062e-01 -5.77619034e-02
   1.50840174e+00]
 [-2.84600238e+00  1.58388586e+02 -4.77265121e+00 -4.63695945e-01
  -4.22206601e+03  1.72421442e+00  3.00330956e-01 -2.72811148e+00
   1.53391369e+00]
 [ 1.53649356e+00 -4.77265121e+00  6.12123614e+00  9.93819648e-01
  -2.02323909e+02 -1.24682825e-01  5.62208233e-01 -1.36511756e-01
   4.33804618e-01]
 [-5.58548886e-02 -4.63695945e-01  9.93819648e-01  2.24580619e-01
  -3.55255040e+01 -3.04227797e-02  7.05718662e-02  1.26698232e-02
  -2.55379820e-02]
 [ 1.04004753e+01 -4.22206601e+03 -2.02323909e+02 -3.55255040e+01
   1.28240832e+06  8.21672190e+02 -2.63125065e+02  2.26366871e+02
  -3.22109266e+01]
 [ 3.70270956e-01  1.72421442e+00 -1.24682825e-01 -3.04227797e-02
   8.21672190e+02  1.07864799e+02  5.24890984e-02  5.15162217e-02
  -2.84480199e-01]
 [-3.23844062e-01  3.00330956e-01  5.62208233e-01  7.05718662e-02


In [6]:
# checking using pandas cov function
# NOTE: DataFrame.cov() divides by n - 1 while the hand-written routine divides by n,
# which is why the printed values differ slightly from cov_matrix.
covMatrix = housing_df.cov()
print(covMatrix)

# Make the check explicit instead of visual: after rescaling the hand-written
# result by n / (n - 1) the two matrices must agree.
n_rows = len(housing_df)
np.testing.assert_allclose(
    cov_matrix * n_rows / (n_rows - 1), covMatrix.to_numpy(), rtol=1e-6
)

                MedInc     HouseAge    AveRooms  AveBedrms    Population  \
MedInc        3.609323    -2.846140    1.536568  -0.055858  1.040098e+01   
HouseAge     -2.846140   158.396260   -4.772882  -0.463718 -4.222271e+03   
AveRooms      1.536568    -4.772882    6.121533   0.993868 -2.023337e+02   
AveBedrms    -0.055858    -0.463718    0.993868   0.224592 -3.552723e+01   
Population   10.400979 -4222.270582 -202.333712 -35.527225  1.282470e+06   
AveOccup      0.370289     1.724298   -0.124689  -0.030424  8.217120e+02   
Latitude     -0.323860     0.300346    0.562235   0.070575 -2.631378e+02   
Longitude    -0.057765    -2.728244   -0.136518   0.012670  2.263778e+02   
MedHouseVal   1.508475     1.533988    0.433826  -0.025539 -3.221249e+01   

               AveOccup    Latitude   Longitude  MedHouseVal  
MedInc         0.370289   -0.323860   -0.057765     1.508475  
HouseAge       1.724298    0.300346   -2.728244     1.533988  
AveRooms      -0.124689    0.562235   -0.136518   

#### Question 7

In [7]:
#sigmoid function
def sigmoid_f(z):
    """Numerically stable logistic sigmoid, ``1 / (1 + exp(-z))``.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    float or numpy.ndarray
        The sigmoid of ``z``, element-wise; a scalar input gives a scalar back.

    Notes
    -----
    The direct formula evaluates ``exp(-z)``, which overflows for large
    negative ``z``. Here ``exp`` is only ever applied to non-positive values,
    so it cannot overflow.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))  # always in (0, 1]

    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    # [()] unwraps a 0-d result to a scalar and leaves real arrays unchanged
    return result[()]

In [8]:
#hypothesis
def classifer_f(X, theta):
    """Logistic-regression hypothesis for inputs ``X`` and coefficients ``theta``.

    Forms the linear score ``np.dot(X, theta)`` and squashes it with
    ``sigmoid_f``; the squashed score is returned.
    """
    linear_score = np.dot(X, theta)
    return sigmoid_f(linear_score)

In [9]:
#entropy
def binary_loss(y, y_pred, eps=1e-15):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    y_pred : array-like
        Predicted probabilities, e.g. the output of ``classifer_f``.
    eps : float, default 1e-15
        Predictions are clipped to ``[eps, 1 - eps]`` before taking logs so
        that a saturated prediction (exactly 0 or 1) yields a large finite
        loss instead of ``nan`` / ``inf``.

    Returns
    -------
    float
        ``-(1/m) * sum(y*log(p) + (1-y)*log(1-p))`` over the ``m`` samples.
    """
    y = np.asarray(y, dtype=float)
    y_pred = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    m = len(y)
    return - (1 / m) * np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))


In [10]:
# batch gradient descent (every update uses the full dataset, so it is not stochastic)
def gradient_f(X, y, theta, alpha, iterations):
    """Minimise the logistic loss with fixed-step batch gradient descent.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix, one row per sample.
    y : numpy.ndarray
        Target values, one per row of ``X``.
    theta : array-like
        Starting coefficients. The caller's array is left untouched.
    alpha : float
        Learning rate (step size).
    iterations : int
        Number of gradient steps to take.

    Returns
    -------
    numpy.ndarray
        The coefficients after ``iterations`` updates (a new float array).
    """
    m = len(y)

    # Work on a float copy: the in-place update below would otherwise modify
    # the caller's array, and would fail outright for an integer-typed theta.
    theta = np.array(theta, dtype=float)

    for _ in range(iterations):
        y_pred = classifer_f(X, theta)
        gradient = np.dot(X.T, (y_pred - y)) / m  # gradient averaged over the m samples
        theta -= alpha * gradient
    return theta

#### Question 8

In [11]:
# optimizer function

def optimizer_f(X, y, alpha, iterations):
    """Fit coefficients with ``gradient_f`` after adding an intercept column.

    Returns a tuple ``(theta_opt, X, y)`` where ``X`` is the input with a
    leading column of ones and ``y`` is passed through unchanged.
    """
    # Prepend a column of ones so the first coefficient plays the role of the intercept
    bias_column = np.ones(X.shape[0])
    X = np.c_[bias_column, X]

    # Start the search from the all-zero coefficient vector
    initial_theta = np.zeros(X.shape[1])

    # Run gradient descent from that starting point
    theta_opt = gradient_f(X, y, initial_theta, alpha, iterations)
    return theta_opt, X, y

In [12]:
#running logit regression

data = load_breast_cancer()  # data.data.shape was reported as (569, 30)

X = data.data  # numerical features
y = data.target

#normalising: min-max rescale the target, then the features
y = (y - y.min()) / (y.max() - y.min())

scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

#logit regression, capped at 10000 gradient steps
theta_opt, X_norm, y_norm = optimizer_f(
    X,
    y,
    alpha=0.1,
    iterations=10000,
)

#final equation: one "coefficient*name" term per fitted coefficient
feature_names = ["Intercept"] + list(data.feature_names)
equation = " + ".join(
    f"{weight:.4f}*{name}" for weight, name in zip(theta_opt, feature_names)
)

#ranking coefficients from pos to neg
coef_ranking = sorted(
    zip(feature_names, theta_opt),
    key=lambda pair: pair[1],
    reverse=True,
)


print(f"\nFinal Logistic Regression Equation:\n{equation}\n")
print("Feature Coefficients (Ranked from Positive to Negative):")
for feature, coef in coef_ranking:
    print(f"{feature}: {coef:.4f}")



Final Logistic Regression Equation:
9.7400*Intercept + -1.2493*mean radius + -1.9910*mean texture + -1.3578*mean perimeter + -1.8667*mean area + 0.3557*mean smoothness + -0.4573*mean compactness + -2.9788*mean concavity + -4.1637*mean concave points + 0.3137*mean symmetry + 2.6421*mean fractal dimension + -2.4517*radius error + 0.3583*texture error + -1.8514*perimeter error + -1.7106*area error + 0.3479*smoothness error + 1.5651*compactness error + 0.7751*concavity error + 0.3495*concave points error + 0.8047*symmetry error + 1.3182*fractal dimension error + -3.0011*worst radius + -3.1849*worst texture + -2.7465*worst perimeter + -2.7538*worst area + -1.6942*worst smoothness + -1.0459*worst compactness + -2.0364*worst concavity + -3.9194*worst concave points + -1.6949*worst symmetry + -0.2300*worst fractal dimension

Feature Coefficients (Ranked from Positive to Negative):
Intercept: 9.7400
mean fractal dimension: 2.6421
compactness error: 1.5651
fractal dimension error: 1.3182
symmet

##### Final Logistic Regression Equation: 
9.7400*(Intercept) + -1.2493*(mean radius) + -1.9910*(mean texture) + -1.3578*(mean perimeter) + -1.8667*(mean area) + 0.3557*(mean smoothness) + -0.4573*(mean compactness) + -2.9788*(mean concavity) + -4.1637*(mean concave points) + 0.3137*(mean symmetry) + 2.6421*(mean fractal dimension) + -2.4517*(radius error) + 0.3583*(texture error) + -1.8514*(perimeter error) + -1.7106*(area error) + 0.3479*(smoothness error) + 1.5651*(compactness error) + 0.7751*(concavity error) + 0.3495*(concave points error) + 0.8047*(symmetry error) + 1.3182*(fractal dimension error) + -3.0011*(worst radius) + -3.1849*(worst texture) + -2.7465*(worst perimeter) + -2.7538*(worst area) + -1.6942*(worst smoothness) + -1.0459*(worst compactness) + -2.0364*(worst concavity) + -3.9194*(worst concave points) + -1.6949*(worst symmetry) + -0.2300*(worst fractal dimension)

#### Feature Coefficients (Ranked from Positive to Negative):

Intercept: 9.7400

mean fractal dimension: 2.6421

compactness error: 1.5651

fractal dimension error: 1.3182

symmetry error: 0.8047

concavity error: 0.7751

texture error: 0.3583

mean smoothness: 0.3557

concave points error: 0.3495

smoothness error: 0.3479

mean symmetry: 0.3137

worst fractal dimension: -0.2300

mean compactness: -0.4573

worst compactness: -1.0459

mean radius: -1.2493

mean perimeter: -1.3578

worst smoothness: -1.6942

worst symmetry: -1.6949

area error: -1.7106

perimeter error: -1.8514

mean area: -1.8667

mean texture: -1.9910

worst concavity: -2.0364

radius error: -2.4517

worst perimeter: -2.7465

worst area: -2.7538

mean concavity: -2.9788

worst radius: -3.0011

worst texture: -3.1849

worst concave points: -3.9194

mean concave points: -4.1637
