# Weighted Group Lasso Experiments
CSC 2515 Fall 2022

In [1]:
import numpy as np
from matplotlib import pyplot as plt
from groupyr import LogisticSGL
from groupyr.logistic import WeightedLogisticSGL
import pandas as pd
from pca import (pca, logistic_pca)
import linearcorex as lc
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import ConfusionMatrixDisplay
import sklearn.metrics as metrics
from sklearn.multiclass import OneVsRestClassifier

Install CUDA and cudamat (for python) to enable GPU speedups.


In [2]:
def mark_as_categorical(dataframe: pd.DataFrame, category: str):
    """Convert one column of ``dataframe`` to the pandas ``category`` dtype.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame holding the column to convert.
    category : str
        Label of the column to convert.
    """
    column = dataframe[category]
    dataframe[category] = column.astype('category')

def get_categories(dataframe: pd.DataFrame):
    """Return the labels of every ``category``-dtype column as a list."""
    categorical_only = dataframe.select_dtypes(include="category")
    return list(categorical_only.columns)

In [3]:
def create_groups_from_1hot(data_frame: pd.DataFrame):
    """One-hot encode ``data_frame`` and record which output columns belong to which input column.

    Parameters
    ----------
    data_frame : pd.DataFrame
        Frame to encode with ``pd.get_dummies``.

    Returns
    -------
    expanded : pd.DataFrame
        The result of ``pd.get_dummies(data_frame)``.
    groups : dict
        Maps every column label of ``data_frame`` to a NumPy array holding the
        positional indices of the columns in ``expanded`` derived from it.

    Raises
    ------
    KeyError
        If an expanded column cannot be attributed to any source column.
    """
    expanded = pd.get_dummies(data_frame)
    source_columns = list(data_frame.columns)
    groups = {col: [] for col in source_columns}

    for idx, col in enumerate(expanded.columns):
        if col in groups:
            # Column passed through get_dummies untouched.
            owner = col
        else:
            # Dummy columns are named "<source>_<value>". Splitting on the
            # first "_" breaks as soon as the source name itself contains an
            # underscore, so match on the source-name prefix instead and take
            # the longest match to disambiguate names such as "a" and "a_b".
            candidates = [
                src for src in source_columns if str(col).startswith(f"{src}_")
            ]
            if not candidates:
                raise KeyError(col)
            owner = max(candidates, key=lambda src: len(str(src)))
        groups[owner].append(idx)

    groups = {k: np.array(v) for k, v in groups.items()}
    return expanded, groups

In [4]:
def expand_data(data):
    """Prepare a raw student frame for modelling.

    Steps: mark the coded columns as categorical, encode ``Target`` as a float
    class id, divide every remaining float/int column by its maximum, and
    one-hot encode the features.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the categorical columns listed below and a
        ``Target`` column with the labels 'Dropout', 'Graduate', 'Enrolled'.

    Returns
    -------
    expanded_X : pd.DataFrame
        One-hot encoded features (``Target`` removed).
    group_idxs : dict
        Source column -> indices of its columns in ``expanded_X``.
    target : pd.Series
        ``Target`` encoded as 0.0 (Dropout), 1.0 (Graduate), 2.0 (Enrolled).
    """
    categorical_columns = (
        "MaritalStatus",
        "ApplicationMode",
        "ApplicationOrder",
        "TimeOfDay",
        "PreviousQualification",
        "Nationality",
        "MotherQualification",
        "FatherQualification",
        "MotherOccupation",
        "FatherOccupation",
        "Course",
    )

    # Work on a copy so the caller's frame is not re-typed or re-scaled as a
    # hidden side effect of calling this function.
    data = data.copy()
    for column in categorical_columns:
        mark_as_categorical(data, column)

    # Known labels become class ids; anything else is passed through so that
    # the float conversion still fails loudly on an unexpected label.
    label_codes = {'Dropout': 0, 'Graduate': 1, 'Enrolled': 2}
    target = data["Target"].map(lambda label: label_codes.get(label, label)).astype(float)

    #Scaling
    for col in data.select_dtypes(include=["float", 'int'], exclude="category"):
        peak = data[col].max()
        if peak == 0:
            # Dividing by a zero maximum would fill the column with NaN/inf.
            continue
        data[col] = data[col] / peak

    #Hot 1 and Grouping
    expanded_X, group_idxs = create_groups_from_1hot(data.drop(columns="Target"))

    return expanded_X, group_idxs, target


def stage_data(data, stage):
    """Build a stratified train/test split using only the features known at ``stage``.

    Parameters
    ----------
    data : pd.DataFrame
        Raw student frame, including the ``Target`` column.
    stage : {'registration', 'sem1', 'sem2'}
        'sem2' keeps every column, 'sem1' drops the 2nd-semester curricular
        columns, and 'registration' drops both the 1st- and 2nd-semester
        curricular columns.

    Returns
    -------
    X_train, X_test, y_train, y_test : np.ndarray
        80/20 split, stratified on the target, ``random_state=1``.
    group_idxs : dict
        Source column -> indices of its one-hot columns in ``X``.

    Raises
    ------
    ValueError
        If ``stage`` is not one of the supported values.
    """
    def _curricular_columns(semester):
        # Column labels follow "Curricular units <semester> sem (<field>)".
        fields = ('credited', 'enrolled', 'evaluations', 'approved', 'grade',
                  'without evaluations')
        return [f'Curricular units {semester} sem ({field})' for field in fields]

    dropped_by_stage = {
        'sem2': [],
        'sem1': _curricular_columns('2nd'),
        'registration': _curricular_columns('1st') + _curricular_columns('2nd'),
    }
    if stage not in dropped_by_stage:
        # Previously an unknown stage fell through every branch and crashed
        # later with an unbound local variable.
        raise ValueError(
            f"Unknown stage {stage!r}; expected one of {sorted(dropped_by_stage)}"
        )

    # Imported here so the function does not depend on a later notebook cell
    # having already imported train_test_split.
    from sklearn.model_selection import train_test_split

    # staged data feed
    to_drop = dropped_by_stage[stage]
    features = data.drop(columns=to_drop) if to_drop else data

    expanded_X, group_idxs, target = expand_data(features)
    X = expanded_X.to_numpy(np.float64)
    y = target.to_numpy(np.float64)
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)

    return X_train,X_test,y_train,y_test, group_idxs

In [5]:
def standard_metrics(model, X_train, X_test, y_test, y_train, label=""):
    """Print a short evaluation report for a fitted ``model``.

    Printed, in order: a header containing ``label``, the test confusion
    matrix, the per-class F1 scores, the macro-averaged F1 score, and
    ``model.score`` on the training and test data.

    Note the positional order of the arguments: ``y_test`` comes before
    ``y_train``.
    """
    predictions = model.predict(X_test)

    print("Stats for", label)
    print(metrics.confusion_matrix(y_test, predictions))

    f1_variants = (("F1 score", None), ("Avg F1 score", 'macro'))
    for caption, averaging in f1_variants:
        print(caption, metrics.f1_score(y_test, predictions, average=averaging))

    splits = (("Training Score", X_train, y_train), ("Testing Score", X_test, y_test))
    for caption, features, targets in splits:
        print(caption, model.score(features, targets))

In [6]:
# The raw table is semicolon-separated rather than comma-separated.
kid_data = pd.read_csv("data.csv", sep=";")

In [7]:
# Columns holding coded categories; they are re-typed in place so that they
# are one-hot encoded below instead of being scaled as numbers.
for categorical_column in (
    "MaritalStatus",
    "ApplicationMode",
    "ApplicationOrder",
    "TimeOfDay",
    "PreviousQualification",
    "Nationality",
    "MotherQualification",
    "FatherQualification",
    "MotherOccupation",
    "FatherOccupation",
    "Course",
):
    mark_as_categorical(kid_data, categorical_column)

# Divide every remaining float/int column by its own maximum (in place).
numeric_view = kid_data.select_dtypes(include=["float", "int"], exclude="category")
for col in numeric_view:
    kid_data[col] /= kid_data[col].max()

# One-hot encode the features and remember which output columns came from
# which source column.
features_only = kid_data.drop(columns="Target")
expanded_X, group_idxs = create_groups_from_1hot(features_only)

In [8]:
# Class balance check: number of rows per distinct value of the Target column.
kid_data.get("Target").value_counts()

Graduate    2209
Dropout     1421
Enrolled     794
Name: Target, dtype: int64

In [9]:
from groupyr import LogisticSGLCV
from sklearn.model_selection import train_test_split


# Full-information split: the "sem2" stage keeps every feature column.
X_train, X_test, y_train, y_test, group_idxs = stage_data(kid_data, "sem2")

# Sanity check on the training design matrix and label vector.
for split in (X_train, y_train):
    print(split.shape)

(3539, 255)
(3539,)


Study of One-vs-Rest (OvR) classification

In [12]:
# Unpenalised logistic regression, fitted two ways on the same split:
# wrapped in OneVsRestClassifier, and directly as a single multiclass model.
model = LogisticRegression(penalty='none', max_iter=10000)
one_v_all_log_r = OneVsRestClassifier(model).fit(X_train, y_train)
model = model.fit(X_train, y_train)

for fitted, name in ((one_v_all_log_r, "OneVsRest"), (model, "Softmax")):
    standard_metrics(fitted, X_train, X_test, y_test, y_train, name)

Stats for OneVsRest
[[207  48  29]
 [ 18 405  19]
 [ 35  73  51]]
F1 score [0.76102941 0.83677686 0.39534884]
Avg F1 score 0.6643850361593802
Training Score 0.8095507205425262
Testing Score 0.7491525423728813
Stats for Softmax
[[205  48  31]
 [ 20 404  18]
 [ 34  69  56]]
F1 score [0.75506446 0.83904465 0.42424242]
Avg F1 score 0.672783844364368
Training Score 0.8157671658660639
Testing Score 0.751412429378531


In [11]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting, fitted directly as a multiclass model and wrapped in
# OneVsRestClassifier. The seed is fixed so that the reported scores are
# reproducible on a fresh kernel; it matches the seed used for the
# train/test split.
model_gb = GradientBoostingClassifier(random_state=1)
ovr_gb = OneVsRestClassifier(model_gb).fit(X_train, y_train)
model_gb.fit(X_train, y_train)

standard_metrics(model_gb, X_train, X_test, y_test, y_train, "GB Softmax")
standard_metrics(ovr_gb, X_train, X_test, y_test, y_train, "GB OvR")

Stats for GB Softmax
[[207  43  34]
 [ 15 414  13]
 [ 33  70  56]]
F1 score [0.76808905 0.85448916 0.42748092]
Avg F1 score 0.683353044640187
Training Score 0.842045775642837
Testing Score 0.7649717514124293
Stats for GB OvR
[[211  42  31]
 [ 18 413  11]
 [ 41  74  44]]
F1 score [0.76173285 0.85066941 0.35918367]
Avg F1 score 0.6571953128104202
Training Score 0.834416501836677
Testing Score 0.7548022598870057


In [18]:
from sklearn.model_selection import GridSearchCV
from groupyr.logistic import WeightedLogisticSGL


# Search grids. The "estimator__" prefix routes each parameter through the
# OneVsRestClassifier wrapper to the wrapped WeightedLogisticSGL.
reg_hyper_params = dict(
    estimator__stretching=np.linspace(1, 2, 5),
    estimator__alpha=np.logspace(-3, -1, 15),
)

unreg_hyper_params = dict(
    estimator__stretching=np.linspace(1, 2, 5),
)


def _one_vs_rest_sgl(**sgl_kwargs):
    """Wrap a WeightedLogisticSGL (max_iter=10000) in a one-vs-rest classifier."""
    return OneVsRestClassifier(WeightedLogisticSGL(max_iter=10000, **sgl_kwargs))


def _macro_f1_search(estimator, param_grid):
    """Run a 5-fold, macro-F1 grid search on the current training split."""
    search = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=5, scoring='f1_macro')
    return search.fit(X_train, y_train)


for stage in ['registration', 'sem1', 'sem2']:
    X_train, X_test, y_train, y_test, group_idxs = stage_data(kid_data, stage)

    # Four penalty configurations compared at every stage.
    unreg_model = _one_vs_rest_sgl(alpha=0.0)
    l2_model = _one_vs_rest_sgl(groups=None, l1_ratio=0)
    l1_model = _one_vs_rest_sgl(groups=None, l1_ratio=1)
    group_model = _one_vs_rest_sgl(groups=list(group_idxs.values()), l1_ratio=0)

    clf_un = _macro_f1_search(unreg_model, unreg_hyper_params)
    clf_l2 = _macro_f1_search(l2_model, reg_hyper_params)
    clf_l1 = _macro_f1_search(l1_model, reg_hyper_params)
    clf_group = _macro_f1_search(group_model, reg_hyper_params)

    searches = (
        (clf_un, "Unregularized"),
        (clf_l2, "Ridge"),
        (clf_l1, "Lasso"),
        (clf_group, "Group"),
    )
    for clf, name in searches:
        standard_metrics(clf, X_train, X_test, y_test, y_train, label=f"{name} {stage}")


Stats for Unregularized registration
[[190  47  47]
 [ 59 308  75]
 [ 40  63  56]]
F1 score [0.66317627 0.71627907 0.33234421]
Avg F1 score 0.5705998495625998
Training Score 0.6226435188288603
Testing Score 0.5705998495625998
Stats for Ridge registration
[[173  44  67]
 [ 51 299  92]
 [ 30  51  78]]
F1 score [0.64312268 0.715311   0.39393939]
Avg F1 score 0.5841243584346696
Training Score 0.6013581315332565
Testing Score 0.5841243584346696
Stats for Lasso registration
[[173  44  67]
 [ 51 299  92]
 [ 30  51  78]]
F1 score [0.64312268 0.715311   0.39393939]
Avg F1 score 0.5841243584346696
Training Score 0.6013581315332565
Testing Score 0.5841243584346696
Stats for Group registration
[[173  44  67]
 [ 51 299  92]
 [ 30  51  78]]
F1 score [0.64312268 0.715311   0.39393939]
Avg F1 score 0.5841243584346696
Training Score 0.6013581315332565
Testing Score 0.5841243584346696
Stats for Unregularized sem1
[[210  27  47]
 [ 30 360  52]
 [ 42  53  64]]
F1 score [0.74204947 0.81632653 0.39751553]
A

In [None]:
from itertools import cycle

from groupyr import sgl_path

# Regularization path over a fixed grid of alphas, using the feature groups
# and training split left in scope by the previous cells.
groups = list(group_idxs.values())
alphas = np.logspace(-4, 0, 200)
# NOTE(review): unpacking three values assumes sgl_path returns the iteration
# counts without return_n_iter=True, and y_train is passed as-is (multiclass
# labels) -- confirm both against the groupyr version in use.
path_coefs, path_alphas, path_iters = sgl_path(
    X_train, y_train, l1_ratio=0, groups=groups, alphas=alphas, eps=0.00001, n_alphas=200, max_iter=1000, tol=1e-3
)

# L2 norm of each group's coefficient block at every alpha.
group_means = np.array([np.linalg.norm(path_coefs[grp], axis=0) for grp in groups])

# Placeholder: every group's marker is drawn at path index 0. Kept 1-D so
# that path_alphas[z_idx] is a scalar when handed to axvline.
# Disabled computation of the real index:
#   [np.max(np.where(group_means[i] == 0)[0]) for i in range(len(groups))]
zero_idx = np.zeros(len(groups), dtype=int)

fig, ax = plt.subplots(2, 1, figsize=(8, 10), sharex=True)

cmap = plt.get_cmap("tab20")

# The palette is finite; cycle through it so that zip() does not silently
# drop every group beyond the number of available colours.
for grp, color, z_idx in zip(groups, cycle(cmap.colors), zero_idx):
    _ = ax[0].semilogx(
        path_alphas, np.abs(path_coefs[grp][:-1].transpose()), color=color
    )

    _ = ax[0].axvline(path_alphas[z_idx], ls=":", color=color)
    _ = ax[1].axvline(path_alphas[z_idx], ls=":", color=color)

_ = ax[1].semilogx(path_alphas, group_means.transpose())

_ = ax[1].set_xlabel(r"$\log(\alpha)$", fontsize=16)
_ = ax[0].set_ylabel(r"$\left| \hat{\beta} \right|$", fontsize=16)
_ = ax[1].set_ylabel(
    r"$\left| \left| \hat{\beta}^{(\ell)} \right| \right|_2$", fontsize=16
)
_ = ax[0].set_title(r"SGL regularization path", fontsize=16)
plt.show()