In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the feature matrix and the age targets from the working directory.
df = pd.read_csv(filepath_or_buffer='feature_matrix.csv')
ages = pd.read_csv(filepath_or_buffer='ages.csv')

In [3]:
# Work on copies so the frames loaded from disk stay untouched.
X = df.copy()
y = ages.copy()

# Display both shapes to confirm the row counts line up.
(X.shape, y.shape)

((22, 70), (22, 1))

In [4]:
# let's try a baseline linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Baseline: ordinary least squares with an intercept, fitted on the training split.
# `fit` returns the estimator itself, so it can be chained onto the constructor.
lr_model = LinearRegression(fit_intercept=True).fit(X_train, y_train)

# Predictions for the held-out rows, scored in a later cell.
y_pred = lr_model.predict(X_test)

In [7]:
# Shape of the fitted weights: one row per target column of y, one weight per input feature.
lr_model.coef_.shape # a knob for each feature

(1, 70)

In [8]:
# Fitted bias term; it is an array with one entry per target column because y is 2-D.
lr_model.intercept_

array([6.14771555])

In [9]:
# evaluation metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [10]:
# Score the baseline predictions against the held-out targets.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# One print call, one metric per line, each rounded to two decimals.
print(
    f"Mean Absolute Error: {mae:.2f}",
    f"Mean Squared Error: {mse:.2f}",
    f"R-squared: {r2:.2f}",
    sep="\n",
)

Mean Absolute Error: 15.13
Mean Squared Error: 504.75
R-squared: -0.58


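Terrible performance: a negative R² means the model does worse than simply predicting the mean age, which is unsurprising with 70 features and only 22 samples. Can we do better?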

In [11]:
# let's see if we can get better by combining EO and EC
prefixes = set()
for col in df.columns:
    prefixes.add(col[0 : -2])
len(prefixes)

35

In [12]:
# Average each feature's EC and EO columns into a single column.
#
# `prefixes` is a set, and the iteration order of a set of strings can change
# between kernel restarts (string hash randomisation). Sorting pins the column
# order so the frame is identical on every Restart & Run All.
#
# All columns are collected first and the frame is built once, rather than
# growing a DataFrame one column at a time.
averages_df = pd.DataFrame(
    {
        # prefix[:-1] drops the prefix's last character
        # (presumably a separator before EC/EO; verify against the CSV header).
        prefix[:-1]: df[[f"{prefix}EC", f"{prefix}EO"]].mean(axis=1)
        for prefix in sorted(prefixes)
    }
)

In [13]:
# Sanity check: expect the same rows as df and one averaged column per EC/EO pair.
averages_df.shape

(22, 35)

In [14]:
def evaluate_metrics(y_test, y_pred):
    """Print and return the regression metrics for a set of predictions.

    Parameters
    ----------
    y_test : true target values.
    y_pred : predicted target values, aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(mae, mse, r2)``: mean absolute error, mean squared error and
        R-squared, in that order.
    """
    scores = (
        mean_absolute_error(y_test, y_pred),
        mean_squared_error(y_test, y_pred),
        r2_score(y_test, y_pred),
    )
    mae, mse, r2 = scores

    # One metric per line, each rounded to two decimals.
    print(
        f"Mean Absolute Error: {mae:.2f}",
        f"Mean Squared Error: {mse:.2f}",
        f"R-squared: {r2:.2f}",
        sep="\n",
    )

    return scores

In [15]:
# Same 80/20 split and seed as the baseline, now on the EC/EO-averaged features.
X_train_avg, X_test_avg, y_train_avg, y_test_avg = train_test_split(
    averages_df,
    y,
    test_size=0.2,
    random_state=42,
)

# Refit the same linear model on the reduced feature set and score it.
lr_model_2 = LinearRegression(fit_intercept=True).fit(X_train_avg, y_train_avg)
y_pred_avg = lr_model_2.predict(X_test_avg)
evaluate_metrics(y_test_avg, y_pred_avg)

Mean Absolute Error: 35.26
Mean Squared Error: 1596.63
R-squared: -3.99


(35.26091780343794, 1596.6329338202072, -3.9915446672576245)

Interesting: this is even worse.