In [1]:
import numpy as np
from matplotlib import pyplot as plt
from groupyr import LogisticSGL
import pandas as pd


In [2]:
def mark_as_categorical(dataframe: pd.DataFrame, category: str):
    """Convert one column of ``dataframe`` to the pandas ``category`` dtype.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame that owns the column; it is mutated.
    category : str
        Label of the column to convert.
    """
    as_category = dataframe[category].astype("category")
    dataframe[category] = as_category

def get_categories(dataframe: pd.DataFrame):
    """Return the labels of every ``category``-dtype column, in frame order."""
    categorical_only = dataframe.select_dtypes(include="category")
    return list(categorical_only.columns)

In [3]:
def create_groups_from_1hot(data_frame: pd.DataFrame):
    """One-hot encode ``data_frame`` and record which encoded columns belong to each source column.

    Parameters
    ----------
    data_frame : pd.DataFrame
        Feature table to encode with ``pd.get_dummies``.

    Returns
    -------
    expanded : pd.DataFrame
        The result of ``pd.get_dummies(data_frame)``.
    groups : dict
        Maps every column label of ``data_frame`` to a NumPy integer array
        holding the positions of its columns in ``expanded``.

    Raises
    ------
    KeyError
        If an encoded column cannot be attributed to any source column.
    """
    expanded = pd.get_dummies(data_frame)
    groups = {col: [] for col in data_frame}
    # Try longer source names first so that "a_b" wins over "a" when
    # attributing the dummy column "a_b_x".
    sources_by_length = sorted(groups, key=lambda name: len(str(name)), reverse=True)

    for idx, col in enumerate(expanded):
        if col in groups:
            # Column passed through get_dummies unchanged.
            owner = col
        else:
            # Dummy columns are named "<source>_<level>". Match on the full
            # source name instead of splitting at the first underscore, which
            # fails for source names that themselves contain "_".
            owner = next(
                (src for src in sources_by_length if str(col).startswith(f"{src}_")),
                None,
            )
            if owner is None:
                raise KeyError(col)
        groups[owner].append(idx)

    # dtype=int keeps empty groups usable as index arrays too.
    groups = {k: np.array(v, dtype=int) for k, v in groups.items()}
    return expanded, groups

In [4]:
# Load the records from data.csv; fields in this file are separated by semicolons.
kid_data = pd.read_csv("data.csv", sep=";")

In [5]:
# Columns to treat as categorical features; converting them to the category
# dtype makes pd.get_dummies (inside create_groups_from_1hot) one-hot encode them.
for column_name in (
    "MaritalStatus",
    "ApplicationMode",
    "ApplicationOrder",
    "TimeOfDay",
    "PreviousQualification",
    "Nationality",
    "MotherQualification",
    "FatherQualification",
    "MotherOccupation",
    "FatherOccupation",
):
    mark_as_categorical(kid_data, column_name)

# Encode every feature column (the label column "Target" is excluded).
expanded_X, group_idxs = create_groups_from_1hot(kid_data.drop(columns="Target"))

In [6]:
# Integer-encode the three outcome labels. The statement must not end with a
# comma: a trailing comma would bind `target` to a one-element tuple instead
# of the Series itself.
target = kid_data["Target"].replace(['Dropout', 'Graduate', 'Enrolled'], [0, 1, 2])
# Class balance of the raw labels.
kid_data["Target"].value_counts()

Graduate    2209
Dropout     1421
Enrolled     794
Name: Target, dtype: int64

In [7]:
# Show which columns of kid_data currently carry the category dtype.
get_categories(kid_data)

['MaritalStatus',
 'ApplicationMode',
 'ApplicationOrder',
 'TimeOfDay',
 'PreviousQualification',
 'Nationality',
 'MotherQualification',
 'FatherQualification',
 'MotherOccupation',
 'FatherOccupation']

In [42]:
from groupyr import LogisticSGLCV

# Only the first 50 encoded columns are used as features in this fit.
X = expanded_X.to_numpy(np.float64)[:, :50]
# Binary target: Dropout -> 0, Graduate and Enrolled -> 1.
y = kid_data['Target'].replace(['Dropout', 'Graduate', 'Enrolled'], [0, 1, 1]).to_numpy(np.float64)

print(X.shape)
print(y.shape)

# The group penalty needs to know which columns form a group; when `groups`
# is omitted the estimator puts every feature into one single group.
# group_idxs indexes the full encoded frame, so clip each group to the columns
# kept in X and drop groups that lie entirely outside of it.
n_features = X.shape[1]
feature_groups = [idx[idx < n_features] for idx in group_idxs.values()]
feature_groups = [idx for idx in feature_groups if idx.size > 0]

model = LogisticSGL(l1_ratio=0.5, alpha=10, groups=feature_groups,
    verbose=2, suppress_solver_warnings=False, max_iter=10000
).fit(X, y)



(4424, 50)
(4424,)


In [43]:
# Score of the fitted LogisticSGL model on the same X, y it was trained on
# (there is no held-out split here).
model.score(X, y)

0.6787974683544303

In [44]:
# Share of rows labelled 1, i.e. the accuracy obtained by always predicting
# class 1. A model score equal to this value suggests the model predicts
# class 1 for every row.
np.mean(y)

0.6787974683544303

In [45]:
from sklearn.linear_model import LogisticRegression
# Unregularised logistic regression as a reference fit on the same X, y.
# C=np.inf switches the default L2 penalty off; the string penalty='none' is
# rejected by current scikit-learn releases.
# NOTE: this rebinds `model`, replacing the LogisticSGL estimator fitted above.
model = LogisticRegression(C=np.inf, max_iter=10000).fit(X, y)
model.score(X,y)

0.8790687160940326