TODO:

1. Try to compute pseudoinverses using `numpy`
1. Try to fit a multiple linear model using perfectly collinear data and see if it really uses the pseudoinverse.
1. Build a ridge/lasso model.

In [106]:
# import modules

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import sklearn.linear_model as skl_lm

#### California housing dataset using `sklearn`

In [107]:
# read in California housing dataset
from sklearn.datasets import fetch_california_housing
housing=fetch_california_housing()

In [108]:
from sklearn.model_selection import train_test_split

# create dataframe for feature and target
X=pd.DataFrame(housing.data, columns=housing.feature_names)[["AveRooms"]]
y=housing.target # Median house value in $100,000s

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2, random_state=42)
# we set random_state=42 kinda like setting a seed, so that the results are reproducible
# otherwise, everytime we run the code we would get a different result
# test_size=0.2 means that 0.2 of the data is for testing, 0.8 is for training

In [109]:
X.head()

Unnamed: 0,AveRooms
0,6.984127
1,6.238137
2,8.288136
3,5.817352
4,6.281853


In [110]:
from sklearn.preprocessing import StandardScaler
# (this is a data preprocessing tool to remove the mean 
# and scale features to unit variance to prevent certain
# features from dominating the model due to differences in scale)

# instantiate StandardScaler
scaler=StandardScaler()

# Fit and transform training data and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

The scaler is fitted on the training data using the `fit_transform()` method. The test data is then transformed separately using the `transform()` method to ensure it is scaled using the same factors as the training data, preventing data leakage.

In [111]:
from sklearn.linear_model import LinearRegression

# instantiate linear regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [112]:
# Make predictions on the testing data
y_pred = model.predict(X_test_scaled)

In [113]:
# Import metrics
from sklearn.metrics import mean_squared_error, r2_score

# Calculate and print R^2 score
r2=r2_score(y_test, y_pred)
print(f"R-squared: {r2:.4f}")

R-squared: 0.0138


In [114]:
# Calculate and print MSE
mse=mean_squared_error(y_test, y_pred)
print(f"Mean squared error: {mse:.4f}")

Mean squared error: 1.2923


In [115]:
# Calculate and print RMSE
rmse = mse ** 0.5
print(f"Root mean squared error: {rmse:.4f}")

Root mean squared error: 1.1368


### Multiple Linear Regression

Now instead of regressing median house value onto just AveRooms, we try using all of the available features instead in a multiple regression model.

In [116]:
X2=pd.DataFrame(housing.data, columns=housing.feature_names)
y=housing.target
X2_train, X2_test, y_train, y_test=train_test_split(X2,y,test_size=0.2, random_state=42)

# Scale the data
scaler=StandardScaler()

X2_train_scaled=scaler.fit_transform(X2_train)
X2_test_scaled=scaler.transform(X2_test)

# Create model and fit it to the training data
model2=LinearRegression()
model2.fit(X2_train_scaled, y_train)

# Make predictions
y_pred=model2.predict(X2_test_scaled)

# Calculate and print errors
r2 = r2_score(y_test, y_pred)
print(f"R squared:{r2:.4f}")

mse = mean_squared_error(y_test, y_pred)
print(f"Mean squared error:{mse:.4f}")

rmse = mse**0.5
print(f"Root mean squared error:{rmse:.4f}")


R squared:0.5758
Mean squared error:0.5559
Root mean squared error:0.7456


### Multicollinearity

In [117]:
# Compute correlation matrix
corr_matrix = X2.corr()
corr_matrix

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0


In [118]:
print(corr_matrix.iloc[1,2]) # quick revision
print(corr_matrix.loc['HouseAge','AveRooms'])
print(X2.columns)

-0.15327742256198923
-0.15327742256198923
Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')


On the diagonals we have the correlation of each feature with themselves which makes sense to be 1. We see that the average number of bedrooms seems to be very positively correlated with the average number of rooms, which makes sense. 

Identifying highly correlated pairs may not be something we want to do manually when the number of features is large. So

In [119]:
high_corr_features = [(col1, col2, corr_matrix.loc[col1,col2])
                      for col1 in corr_matrix.columns
                      for col2 in corr_matrix.columns
                      if col1!=col2 and abs(corr_matrix.loc[col1,col2])>0.8
                      ]

collinearity_df=pd.DataFrame(high_corr_features, columns=["Feature1", "Feature2", "Correlation"])
print("\nHighly Correlated Features:\n",collinearity_df)


Highly Correlated Features:
     Feature1   Feature2  Correlation
0   AveRooms  AveBedrms     0.847621
1  AveBedrms   AveRooms     0.847621
2   Latitude  Longitude    -0.924664
3  Longitude   Latitude    -0.924664


In [120]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Compute VIF
vif_data=pd.DataFrame()
vif_data["Feature"]=X2.columns
vif_data["VIF"]=[variance_inflation_factor(X2.values, i) for i in range(X2.shape[1])]

# Print VIF
print("\nVariance Inflation Factor (VIF) for each feature:\n", vif_data)


Variance Inflation Factor (VIF) for each feature:
       Feature         VIF
0      MedInc   11.511140
1    HouseAge    7.195917
2    AveRooms   45.993601
3   AveBedrms   43.590314
4  Population    2.935745
5    AveOccup    1.095243
6    Latitude  559.874071
7   Longitude  633.711654


Let's see what happens if we remove `AveBedrms` from the model


In [121]:
X3=X2.drop(columns=["AveBedrms"])

X3_train, X3_test, y_train, y_test=train_test_split(X3,y,train_size=0.8,random_state=42)

# scale data
X3_train_scaled=scaler.fit_transform(X3_train)
X3_test_scaled=scaler.transform(X3_test)

# Create a linear regression model and train it
model3=LinearRegression()
model3.fit(X3_train_scaled,y_train)

# Make predictions
y_pred=model3.predict(X3_test_scaled)

# Calculate performance metrics
R2=r2_score(y_test,y_pred)

In [122]:
X3_train_scaled

array([[-0.326196  ,  0.34849025, -0.17491646, ...,  0.05137609,
        -1.3728112 ,  1.27258656],
       [-0.03584338,  1.61811813, -0.40283542, ..., -0.11736222,
        -0.87669601,  0.70916212],
       [ 0.14470145, -1.95271028,  0.08821601, ..., -0.03227969,
        -0.46014647, -0.44760309],
       ...,
       [-0.49697313,  0.58654547, -0.60675918, ...,  0.02030568,
        -0.75500738,  0.59946887],
       [ 0.96545045, -1.07984112,  0.40217517, ...,  0.00707608,
         0.90651045, -1.18553953],
       [-0.68544764,  1.85617335, -0.85144571, ..., -0.08535429,
         0.99543676, -1.41489815]])

### Experiment time

In [148]:
import pandas as pd
import numpy as np

# Define the number of rows and columns
num_rows = 100
num_cols = 4

# Define column names (optional, but good practice)
column_names = [f'col_{i+1}' for i in range(num_cols)]

# Generate random data using NumPy
# For random integers: np.random.randint(low, high, size=(rows, cols))
# For random floats: np.random.rand(rows, cols) or np.random.uniform(low, high, size=(rows, cols))
random_data = np.random.randint(0, 100, size=(num_rows, num_cols)) # Example: random integers between 0 and 99

# Create the DataFrame
df = pd.DataFrame(random_data, columns=column_names)

for i in range(num_rows):
    df.iloc[i,1]=2*df.iloc[i,0]
    df.iloc[i,num_cols-1]=df.iloc[i,0]+2*df.iloc[i,1]+3*df.iloc[i,2]+np.random.randint(-10,10)

# Print the generated DataFrame
print(df)

    col_1  col_2  col_3  col_4
0      71    142     42    484
1      45     90     10    252
2      57    114     28    372
3      99    198     53    651
4      63    126     54    473
..    ...    ...    ...    ...
95     51    102     62    443
96     45     90     66    420
97     27     54     19    182
98     31     62     95    444
99     60    120     62    494

[100 rows x 4 columns]


In [153]:
X=df.drop(columns=['col_4'])
y=df['col_4']

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8,random_state=42)

# standardise
scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

# build model
model=LinearRegression()
model.fit(X_train_scaled,y_train)


In [154]:
print(X_train_scaled.shape)

(80, 3)


In [155]:
print(model.coef_, model.intercept_)

[76.5563963  76.5563963  82.86484358] 407.1875


In [152]:
pinv_arr=np.ones((80,4))
for i in range(X_train_scaled.shape[0]):
    for j in range(4):
        if j<3:
            pinv_arr[i,j]=X_train_scaled[i,j]

pinv=np.linalg.pinv(pinv_arr)
print(np.matmul(pinv,y_train))

[ 76.5563963   76.5563963   82.86484358 407.1875    ]


In [145]:
for i in range(5):
    print(i)

0
1
2
3
4


In [141]:
X_train_scaled[4]

array([ 1.57432664,  1.84283324, -1.01460192])