In [83]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelBinarizer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mutual_info_score, mean_absolute_percentage_error, mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import mutual_info_regression, SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDRegressor, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

In [5]:
data = pd.read_csv('data.csv', delimiter=';')
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,16,U,LE3,T,4,3,teacher,services,...,5,4,3,1,2,1,2,16,15,15
1,GP,M,18,U,LE3,T,1,1,other,other,...,2,3,5,2,5,4,0,6,5,0
2,GP,M,17,R,LE3,A,4,4,teacher,other,...,3,3,3,2,3,4,2,10,11,12
3,GP,F,15,U,LE3,T,3,2,services,other,...,4,4,4,1,1,5,10,7,6,6
4,GP,M,16,U,GT3,T,2,3,other,other,...,5,3,3,1,1,3,0,13,14,14


In [120]:
X = data.drop(['G1', 'G2', 'G3'], axis =1)
y = data['G3']


In [121]:
#manually binarize bc the column transformer doesn't like label binarizer 
lb = LabelBinarizer()
for x in  ['school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid','activities','nursery','internet','higher','romantic']:
    X[x] = lb.fit_transform(X[x])

In [122]:
#engineered features: 
X["Pedu"] = (X['Medu'] + X['Fedu'])/2 #parent education avg
X['Talc'] = (X['Dalc'] + X['Walc'])/2 #total alc consumption 
X['social'] = X['goout'] + X['romantic'] + X['famrel'] #sum of goout and romantic and famrel, max value of 11

In [130]:
#OHE, will happen after test/train/split
ohe = OneHotEncoder(drop ='first')
ord = OrdinalEncoder()

col_trans = ColumnTransformer([('onehot', ohe, ['Mjob', 'Fjob', 'reason', 'guardian']), 
                               ('scaler', StandardScaler(), ['age', 'absences'])],
                                    remainder = 'passthrough')
ord_trans = ColumnTransformer([('ord', ord, ['Mjob', 'Fjob', 'reason', 'guardian'])],
                                    remainder = 'passthrough')

In [132]:
#split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state= 1)

In [93]:
X_train = pd.DataFrame(ord_trans.fit_transform(X_train),columns = [s.split('__')[1] for s in ord_trans.get_feature_names_out()])
X_test = pd.DataFrame(ord_trans.fit_transform(X_test),columns = [s.split('__')[1] for s in ord_trans.get_feature_names_out()])


In [133]:
#OHC
X_train = pd.DataFrame(col_trans.fit_transform(X_train),columns = [s.split('__')[1] for s in col_trans.get_feature_names_out()])
X_test = pd.DataFrame(col_trans.fit_transform(X_test),columns = [s.split('__')[1] for s in col_trans.get_feature_names_out()])

Unnamed: 0,age,absences
0,0.986294,-0.810219
1,0.986294,-0.810219
2,-0.486483,0.633801
3,0.249906,2.655429
4,0.986294,-0.810219
...,...,...
216,1.722683,0.633801
217,0.986294,0.489399
218,0.986294,-0.088209
219,0.249906,-0.232611


In [32]:
# finding best feature selection for a given model
def test_features(model, selectors): 
    stats = pd.DataFrame(columns = ['selector', 'features', 'r2', 'mse', 'mae', 'relative error'])
    for selector in selectors:
        selector.fit(X_train, y_train)
        features = list(selector.get_feature_names_out())
        train_temp = X_train[features]
        test_temp = X_test[features]
        model.fit(train_temp, y_train)
        pred = model.predict(test_temp)
        r2 = r2_score(y_test, pred)
        mse = mean_squared_error(y_test, pred)
        #a proxy for percent error- need to fix
        mae = mean_absolute_error(y_test, pred)
        relative_e = mae/y_test.mean()
        stats.loc[len(stats.index)]= [str(selector), features, r2, mse, mae, relative_e]
    return(stats)


In [109]:
#get the best feature combination for each model type 
def compare_models(models, selectors, metric):
    stats = pd.DataFrame(columns = ['model', 'k features', 'features', 'r2', 'mse', 'mae', 'relative error'])
    for m in models:
        temp = test_features(m, selectors)
        if metric == 'r2': 
            best = [str(m)] + temp[temp[metric] == temp[metric].max()].values.tolist()[0]
        else: best = [str(m)] + temp[temp[metric] == temp[metric].min()].values.tolist()[0]
        best[1] = len(best[2])
        stats.loc[len(stats.index)] = best
    stats = stats.sort_values(by = metric).reset_index(drop=True)
    return stats 


In [34]:
#tune model
def tune_model(model, x, params):
    tuner = GridSearchCV(estimator=model, param_grid = params, scoring= 'r2')
    tuner.fit(x,y_train)
    return pd.DataFrame(tuner.cv_results_).sort_values(by='rank_test_score')[['params', 'mean_test_score']]

In [102]:
#models and selectors 
models = [SVR(), SGDRegressor(), RandomForestRegressor(), Ridge(), KNeighborsRegressor(), GradientBoostingRegressor()]
selectors = [SelectKBest(k = i, score_func = mutual_info_regression) for i in range (3, len(X_train.columns)+1)]

In [137]:
params = {"SVR()": {'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'], #SVR
          'gamma' : [10**x/10000 for x in range (3)],
          'C' : [10**x/100 for x in range (0,3)]},
          "SGDRegressor()": {'loss' : ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],   #SGD
           'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']   
          },
          "RandomForestRegressor()": {'n_estimators' : range(10, 160, 10),     #Random Forest
           'max_features' : ['sqrt', 'log2', None]
          },
          "Ridge()": {'alpha' : [10**x/100000 for x in range(0, 10,2)]  #Ridge
          },
          "KNeighborsRegressor()": {'n_neighbors' : range(2,21),     #KNeighbors
           'weights'  : ['uniform', 'distance']     
          },
          "GradientBoostingRegressor()": {
            'loss' : ['squared_error', 'absolute_error', 'huber', 'quantile'],  #Gradient Boosting
            'n_estimators' : range(10, 160, 10)
          }
}

In [134]:
best_features = compare_models(models, selectors, 'relative error')


In [135]:
best_features.head(6)

Unnamed: 0,model,k features,features,r2,mse,mae,relative error
0,RandomForestRegressor(),33,"[Mjob_other, Mjob_teacher, Fjob_health, Fjob_s...",0.136898,18.360877,2.995053,0.269697
1,KNeighborsRegressor(),33,"[Mjob_teacher, Fjob_services, reason_other, re...",0.164689,17.769684,3.067368,0.276209
2,GradientBoostingRegressor(),17,"[Mjob_services, Fjob_health, absences, address...",0.156221,17.949826,3.088898,0.278147
3,SVR(),14,"[Fjob_health, Fjob_services, age, absences, ad...",0.126624,18.579443,3.176537,0.286039
4,SGDRegressor(),4,"[Mjob_services, absences, failures, higher]",0.121593,18.686468,3.278194,0.295193
5,Ridge(),7,"[Fjob_health, Fjob_services, absences, address...",0.100078,19.144167,3.339251,0.300691


In [138]:
#tune hyperparameters for top 3 models
stats = {}
for i in (best_features.index[:3]): 
    model = eval(best_features.iloc[i]['model'])
    features = best_features.iloc[i]['features']
    hps = params[best_features.iloc[i]['model']]
    results = tune_model(model, X_train[features], hps)
    print(f"doing {model}")
    stats[(str(model), str(features))] = results

doing RandomForestRegressor()
doing KNeighborsRegressor()
doing GradientBoostingRegressor()


In [139]:
#apply tuned hyperparameters 
tuned_stats = pd.DataFrame(columns = ['model', 'params','n','features', 'r2', 'mse', 'mae', 'relative error'])
for k,v in stats.items():
    params = v['params'][0]
    features = eval(k[1])
    model = eval(k[0][:-2])(**params)
    model.fit(X_train[features], y_train)
    pred = model.predict(X_test[features])
    r2 = r2_score(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    mae = mean_absolute_error(y_test, pred)
    relative_e = mae/y_test.mean()
    tuned_stats.loc[len(tuned_stats.index)] = [k[0], params, len(features), features, r2, mse, mae, relative_e]

In [141]:
tuned_stats.head(6)

Unnamed: 0,model,params,n,features,r2,mse,mae,relative error
0,RandomForestRegressor(),"{'max_features': 'sqrt', 'n_estimators': 10}",33,"[Mjob_other, Mjob_teacher, Fjob_health, Fjob_s...",-0.014361,21.578632,3.438947,0.309668
1,KNeighborsRegressor(),"{'n_neighbors': 2, 'weights': 'uniform'}",33,"[Mjob_teacher, Fjob_services, reason_other, re...",-0.0192,21.681579,3.426316,0.308531
2,GradientBoostingRegressor(),"{'loss': 'squared_error', 'n_estimators': 10}",17,"[Mjob_services, Fjob_health, absences, address...",0.073446,19.710697,3.269748,0.294432
