In [40]:
import pandas as pd
from collections import defaultdict
from statistics import mean
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from util import get_preprocessed_train_data

In [41]:
# Load the training frame through the project's util helper; train_and_predict reads this global.
data = get_preprocessed_train_data()
# Summary statistics of the numeric columns, shown as the cell output.
data.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,355.0,355.0,355.0,355.0,355.0,355.0,333.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0
mean,16.698592,2.740845,2.540845,1.430986,2.028169,0.501408,3.93994,3.492958,3.138028,1.484507,2.332394,3.557746,5.667606,3.734507,3.673944,3.603521
std,1.2742,1.092033,1.079022,0.695139,0.829741,2.481997,0.903138,3.904485,1.127765,0.880846,1.29403,1.403754,8.126955,0.815917,0.933444,1.133881
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,-1.0,2.25,1.0,1.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,3.0,3.25,3.25
50%,17.0,3.0,3.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,3.75,3.75,3.75
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,4.25,4.25,4.25
max,22.0,4.0,4.0,4.0,4.0,33.0,5.0,56.0,5.0,5.0,5.0,5.0,75.0,5.75,5.75,6.0


In [42]:
def train_and_predict(numeric_features, categorical_features, verbose=0):
    """Fit a one-hot + linear-regression pipeline on the global ``data`` and predict ``G3``.

    Rows with a missing value in ``G3`` or in any selected feature are dropped
    before fitting.

    Parameters
    ----------
    numeric_features : list of str
        Columns handed to the regressor unchanged (``remainder='passthrough'``).
    categorical_features : list of str
        Columns that are one-hot encoded; categories unseen at fit time are ignored.
    verbose : int, default 0
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    tuple
        ``(y_data, y_pred_cv, y_pred_train)``: the true ``G3`` values, the
        predictions from 5-fold cross-validation, and the predictions of the
        pipeline fitted on all retained rows.
    """
    feature_columns = numeric_features + categorical_features
    complete_rows = data[['G3'] + feature_columns].dropna()

    X_data = complete_rows.drop(columns=['G3'])[feature_columns]
    y_data = complete_rows['G3'].to_numpy()

    preprocessor = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'), categorical_features),
        remainder='passthrough',
    )
    pipeline = Pipeline([
        ('pre', preprocessor),
        ('clf', LinearRegression()),
    ])

    # In-sample predictions: fit on every retained row, then predict those same rows.
    pipeline.fit(X_data, y_data)
    y_pred_train = pipeline.predict(X_data)

    # Predictions obtained through 5-fold cross-validation.
    y_pred_cv = cross_val_predict(pipeline, X_data, y_data, cv=5)

    return y_data, y_pred_cv, y_pred_train


def display_results(y_data, y_dev_pred):
    """Print the MAE and MSE of ``y_dev_pred`` against ``y_data``, one metric per line.

    Each line is the metric name followed by a tab, then the score.
    """
    metrics = (
        ('mae', mean_absolute_error),
        ('mse', mean_squared_error),
    )
    for label, metric in metrics:
        score = metric(y_data, y_dev_pred)
        print(label + '\t', score)



In [43]:
# Numeric features only; no categorical columns are one-hot encoded.
y_data, y_pred_cv, y_pred_train = train_and_predict(
    numeric_features=['age', 'traveltime', 'absences'],
    categorical_features=[],
)

# Report the in-sample scores first, then the cross-validated ones.
for heading, predictions in (
    ("On Train Set", y_pred_train),
    ("On Validation Set (Cross Validation)", y_pred_cv),
):
    print(heading)
    display_results(y_data, predictions)

# Same as prev notebook

On Train Set
mae	 0.8323847401191417
mse	 1.2355342667859825
On Validation Set (Cross Validation)
mae	 0.8459688102633277
mse	 1.265531805192244


In [44]:
# Same numeric features, plus a large set of one-hot encoded categorical columns.
y_data, y_pred_cv, y_pred_train = train_and_predict(
    numeric_features=['age', 'traveltime', 'absences'],
    categorical_features=[
        'school', 'sex', 'address', 'famsize', 'Pstatus',
        'Medu', 'Fedu', 'Mjob', 'Fjob', 'guardian',
        'studytime', 'failures', 'schoolsup', 'famsup', 'paid',
        'activities', 'nursery', 'higher', 'internet', 'romantic',
        'famrel',
    ],
)

# Report the in-sample scores first, then the cross-validated ones.
for heading, predictions in (
    ("On Train Set", y_pred_train),
    ("On Validation Set (Cross Validation)", y_pred_cv),
):
    print(heading)
    display_results(y_data, predictions)

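# Why is this getting worse?
# Likely overfitting: one-hot encoding 21 categorical columns adds many coefficients
# for only a few hundred rows, so the training error drops (MSE 1.24 -> 0.91) while
# the cross-validated error rises (MSE 1.27 -> 1.38).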

On Train Set
mae	 0.734595655780073
mse	 0.9141002862842418
On Validation Set (Cross Validation)
mae	 0.907612728396189
mse	 1.376642004018053


In [45]:
# Reduced feature set: two numeric columns and two one-hot encoded columns.
y_data, y_pred_cv, y_pred_train = train_and_predict(
    numeric_features=['age', 'absences'],
    categorical_features=['studytime', 'failures'],
)

# Report the in-sample scores first, then the cross-validated ones.
for heading, predictions in (
    ("On Train Set", y_pred_train),
    ("On Validation Set (Cross Validation)", y_pred_cv),
):
    print(heading)
    display_results(y_data, predictions)

On Train Set
mae	 0.7812422155351836
mse	 1.0681549218397113
On Validation Set (Cross Validation)
mae	 0.8131014336916683
mse	 1.1471462763130837


## Final linear model

In [46]:
# Same feature set as the last experiment above.
numeric_features = ['age', 'absences']
categorical_features = ['studytime', 'failures']
feature_columns = numeric_features + categorical_features

# Keep only rows where the target and every selected feature are present.
df = data[['G3'] + feature_columns].dropna()

X_train = df.drop(columns=['G3'])
y_train = df['G3'].to_numpy()

# One-hot encode the categorical columns; the numeric ones pass through unchanged.
preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
    remainder='passthrough',
)
pipeline = Pipeline([
    ('pre', preprocessor),
    ('clf', LinearRegression()),
])

# Last expression of the cell, so the fitted pipeline's repr is displayed.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('pre',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['studytime', 'failures'])])),
                ('clf', LinearRegression())])

In [47]:
# Read the test set (first CSV column is the index) and keep the model's feature columns,
# in the same order used for training.
test_raw = pd.read_csv("test-data.csv", index_col=0)
X_test = test_raw[numeric_features + categorical_features]

In [48]:
# Summary statistics of the selected test features, shown as the cell output.
X_test.describe()

Unnamed: 0,age,absences,studytime,failures
count,40.0,40.0,40.0,40.0
mean,16.675,2.65,2.1,1.775
std,1.308503,6.735098,0.928191,7.043773
min,15.0,-1.0,1.0,0.0
25%,16.0,-1.0,1.0,0.0
50%,16.5,0.0,2.0,0.0
75%,18.0,3.0,3.0,0.0
max,20.0,38.0,4.0,33.0


In [49]:
# Predict with the pipeline fitted in the "Final linear model" cell. The previous
# name `final_model` is not defined anywhere in this notebook, so the cell raised
# NameError on a fresh kernel (or silently used a stale object left in the kernel).
y_test_pred = pipeline.predict(X_test)

In [50]:
# Build the submission frame: one predicted G3 per test row, keyed by the test index.
X_test_submission = pd.DataFrame({'G3': y_test_pred}, index=X_test.index)

# Write the predictions with the index column labelled 'id'.
X_test_submission.to_csv(
    'linear_submission.csv',
    header=True,
    index_label='id',
)