In [278]:
from collections import defaultdict
from statistics import mean
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from util import get_preprocessed_train_data

In [279]:
# Load the preprocessed training data via the project-local `util` helper.
# `data` is read as a notebook-level global by `train_and_predict`.
data = get_preprocessed_train_data()
# Left as the cell's last expression so the notebook renders the summary table.
# NOTE(review): the rendered summary shows `Unnamed: 0`-style columns, a lower
# count for `famrel` (333 vs 355) and a minimum of -1 for `absences` -- presumably
# a saved index, missing values and a sentinel; confirm in the preprocessing step.
data.describe()

Unnamed: 0.1,Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,G_avg
count,355.0,355.0,355.0,355.0,355.0,355.0,355.0,333.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0
mean,177.0,16.698592,2.740845,2.540845,1.430986,2.028169,0.338028,3.93994,3.219718,3.138028,1.484507,2.332394,3.557746,5.667606,1.933627,1.918486,1.90088,1.917664
std,102.623909,1.2742,1.092033,1.079022,0.695139,0.829741,0.746814,0.903138,1.009646,1.127765,0.880846,1.29403,1.403754,8.126955,0.203979,0.233361,0.28347,0.228374
min,0.0,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,-1.0,1.5625,1.25,1.25,1.354167
25%,88.5,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,1.75,1.8125,1.8125,1.770833
50%,177.0,17.0,3.0,3.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,1.9375,1.9375,1.9375,1.916667
75%,265.5,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,2.0625,2.0625,2.0625,2.083333
max,354.0,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,2.4375,2.4375,2.5,2.458333


In [280]:
# Feature groupings kept at notebook level.
numeric_features = [
    'age',
    'traveltime',
    'absences',
]

# Full candidate list of categorical columns, kept for reference only:
# it is replaced by the assignment that follows it.
categorical_features = [
    'school', 'address', 'famsize',
    'Medu', 'Fedu',
    'studytime', 'failures',
    'schoolsup', 'famsup', 'higher', 'internet',
    'freetime', 'goout', 'Dalc', 'Walc', 'health',
]
# Active selection: only 'studytime' survives the override.
categorical_features = ['studytime']

In [281]:
def train_and_predict(numeric_features, categorical_features, verbose=0, frame=None, cv=5):
    """Fit a one-hot + linear-regression pipeline on ``G3`` and return its predictions.

    Parameters
    ----------
    numeric_features : sequence of str
        Columns passed through to the regressor unchanged.
    categorical_features : sequence of str
        Columns that are one-hot encoded before regression.
    verbose : int, default 0
        Verbosity level forwarded to ``cross_val_predict``.
    frame : DataFrame-like, optional
        Source table containing ``G3`` and every requested feature column.
        When omitted, the notebook-level ``data`` frame is used.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_val_predict`` as its ``cv`` argument.

    Returns
    -------
    tuple
        ``(y_data, y_pred_cv, y_pred_train)``: the ``G3`` values as a NumPy
        array, the out-of-fold cross-validation predictions, and the in-sample
        predictions of the pipeline fitted on all retained rows.

    Raises
    ------
    ValueError
        If ``G3`` is listed among the features (the target would leak).
    """
    target = 'G3'

    # Accept any sequence (tuple, Index, ...) and avoid aliasing caller lists.
    numeric_features = list(numeric_features)
    categorical_features = list(categorical_features)
    feature_columns = numeric_features + categorical_features

    if target in feature_columns:
        raise ValueError("'G3' is the prediction target and cannot be used as a feature")

    # Fall back to the notebook-level frame only when no explicit frame is given.
    source = data if frame is None else frame

    # Rows with a missing target or a missing selected feature are dropped, so
    # the number of rows used depends on which features were requested.
    df = source[[target] + feature_columns].dropna()

    X_data = df[feature_columns]
    y_data = df[target].to_numpy()

    pipeline = Pipeline([
        # One-hot encode the categorical columns; everything else passes through.
        ('pre', make_column_transformer(
            (OneHotEncoder(handle_unknown='ignore'), categorical_features),
            remainder='passthrough',
        )),
        ('clf', LinearRegression()),
    ])

    # In-sample predictions come from a fit on all retained rows.
    pipeline.fit(X_data, y_data)
    y_pred_train = pipeline.predict(X_data)

    # Out-of-fold predictions over the same rows.
    y_pred_cv = cross_val_predict(pipeline, X_data, y_data, cv=cv, verbose=verbose)

    return y_data, y_pred_cv, y_pred_train


def display_results(y_data, y_dev_pred):
    """Print the mean absolute error and mean squared error of the predictions.

    ``y_data`` holds the true target values and ``y_dev_pred`` the predictions
    to score against them. One line is printed per metric.
    """
    metrics = (
        ('mae', mean_absolute_error),
        ('mse', mean_squared_error),
    )
    for label, metric in metrics:
        score = metric(y_data, y_dev_pred)
        print(label + '\t', score)



In [282]:
# Baseline experiment: numeric columns only, no categorical features.
# Author's note: the results are the same as in the previous notebook.
y_data, y_pred_cv, y_pred_train = train_and_predict(
    numeric_features=['age', 'traveltime', 'absences'],
    categorical_features=[],
)

# Report in-sample scores first, then the cross-validated ones.
for heading, predictions in (
    ("On Train Set", y_pred_train),
    ("On Validation Set (Cross Validation)", y_pred_cv),
):
    print(heading)
    display_results(y_data, predictions)

On Train Set
mae	 0.2080961850297854
mse	 0.07722089167412391
On Validation Set (Cross Validation)
mae	 0.21099255200696587
mse	 0.07966561203969015


In [283]:
# Experiment: the numeric baseline plus a wide set of one-hot encoded columns.
y_data, y_pred_cv, y_pred_train = train_and_predict(
    numeric_features=['age', 'traveltime', 'absences'],
    categorical_features=[
        'school', 'sex', 'address', 'famsize', 'Pstatus',
        'Medu', 'Fedu', 'Mjob', 'Fjob', 'guardian',
        'studytime', 'failures', 'schoolsup', 'famsup', 'paid',
        'activities', 'nursery', 'higher', 'internet', 'romantic',
        'famrel',
    ],
)

# Report in-sample scores first, then the cross-validated ones.
for heading, predictions in (
    ("On Train Set", y_pred_train),
    ("On Validation Set (Cross Validation)", y_pred_cv),
):
    print(heading)
    display_results(y_data, predictions)

# Why is this getting worse? Compared with the numeric-only baseline the train
# error drops while the cross-validated error rises, which is the usual
# signature of overfitting: many one-hot columns for a few hundred rows.
# NOTE(review): `famrel` has fewer non-null rows than the other columns, and
# `train_and_predict` drops incomplete rows, so this run is probably scored on
# a smaller sample than the baseline -- confirm before comparing the numbers.

On Train Set
mae	 0.18367396255201776
mse	 0.05713150893101234
On Validation Set (Cross Validation)
mae	 0.22067593478126302
mse	 0.08319697123122163


In [284]:
# Experiment: a small hand-picked subset of numeric and categorical columns.
y_data, y_pred_cv, y_pred_train = train_and_predict(
    numeric_features=['age', 'absences'],
    categorical_features=['studytime', 'failures'],
)

# Report in-sample scores first, then the cross-validated ones.
for heading, predictions in (
    ("On Train Set", y_pred_train),
    ("On Validation Set (Cross Validation)", y_pred_cv),
):
    print(heading)
    display_results(y_data, predictions)

On Train Set
mae	 0.19621281873602164
mse	 0.06700763075366947
On Validation Set (Cross Validation)
mae	 0.20351003307002266
mse	 0.07265859860712855
