In [1]:
import numpy as np
from IPython.core.display import display
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from util import get_preprocessed_train_data

In [2]:
# Load the training data through the project helper in util; later cells
# read the 'G1', 'G2' and 'G3' columns from the returned object.
data = get_preprocessed_train_data()

In [3]:
# Features are the G1 and G2 columns; the regression target is G3.
# Selecting the two feature columns directly already leaves G3 out, so no
# separate drop() is needed beforehand.
X_data = data[['G1', 'G2']]
y_data = data['G3'].to_numpy()

In [4]:
# Hold out 10% of the rows as a dev set; the fixed random_state keeps the
# split identical across kernel restarts.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_data,
    y_data,
    test_size=0.1,
    random_state=42,
)

In [5]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; must broadcast against ``y_true``.

    Returns
    -------
    float
        The MAPE in percent (e.g. ``10.0`` means 10 %).

    Notes
    -----
    Both inputs are converted to float arrays, so plain lists work and
    pandas objects are compared by position rather than by index label.
    Denominators are clamped to machine epsilon (the same guard
    scikit-learn's own MAPE uses), so a zero target contributes 0 for an
    exact prediction and a very large finite value otherwise, instead of
    ``nan``/``inf``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Clamp |y_true| away from zero to avoid 0/0 -> nan and x/0 -> inf.
    denominator = np.maximum(np.abs(y_true), np.finfo(float).eps)
    return np.mean(np.abs(y_true - y_pred) / denominator) * 100

In [6]:
# Single-step pipeline: a LinearRegression estimator registered under the
# step name 'clf'.
pipeline = Pipeline(steps=[('clf', LinearRegression())])

In [7]:
# Fit the pipeline on the training split. As the last expression of the
# cell, the value returned by fit() is what the notebook displays below.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('clf', LinearRegression())])

In [8]:
# Predict the target for the held-out dev rows with the fitted pipeline.
y_dev_pred = pipeline.predict(X_dev)

In [9]:
# Metric label -> scoring function; every function is called as
# f(y_true, y_pred). Reused by later cells to score other predictions.
validations = dict(
    mae=mean_absolute_error,
    mse=mean_squared_error,
    mape=mean_absolute_percentage_error,
)

# Report each metric for the linear-regression predictions on the dev set.
for metric_name, metric_fn in validations.items():
    score = metric_fn(y_dev, y_dev_pred)
    print(f'{metric_name}\t', score)

mae	 0.35630042836297493
mse	 0.42406769746816275
mape	 22.192915201626544


Is this a good result?

In [10]:
# Baseline: predict G3 for each dev row as the plain mean of that row's
# G1 and G2 values (no model fitted).
g1_dev = X_dev['G1'].to_numpy()
g2_dev = X_dev['G2'].to_numpy()
y_dev_pred_base = (g1_dev + g2_dev) / 2

# Score the baseline with the same metrics used for the model.
for metric_name, metric_fn in validations.items():
    score = metric_fn(y_dev, y_dev_pred_base)
    print(f'{metric_name}\t', score)

mae	 0.3923611111111111
mse	 0.50390625
mape	 26.340438081440404


Predicting G3 from G1 and G2 is very easy because both are highly correlated with G3.

We will therefore try to predict G_avg = (G1 + G2 + G3) / 3 for the remainder of the lecture, as a more interesting problem.