In [1]:
import numpy as np
from matplotlib import pyplot as plt
from groupyr import LogisticSGL
import pandas as pd


In [2]:
def mark_as_categorical(dataframe: pd.DataFrame, category: str):
    """Convert one column of ``dataframe`` to the pandas ``category`` dtype.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame that owns the column.
    category : str
        Label of the column to convert.
    """
    converted = dataframe[category].astype('category')
    dataframe[category] = converted

def get_categories(dataframe: pd.DataFrame):
    """Return the labels of all columns whose dtype is ``category``.

    The labels are returned as a list, in the order they appear in
    ``dataframe``.
    """
    categorical_part = dataframe.select_dtypes(include="category")
    return list(categorical_part.columns)

In [3]:
def create_groups_from_1hot(data_frame: pd.DataFrame):
    """One-hot encode ``data_frame`` and record which output columns came from which input column.

    Parameters
    ----------
    data_frame : pd.DataFrame
        Frame to expand with ``pd.get_dummies``.

    Returns
    -------
    expanded : pd.DataFrame
        The result of ``pd.get_dummies(data_frame)``.
    groups : dict
        Maps every column label of ``data_frame`` (in its original order) to
        an integer array holding the positional indices of the columns of
        ``expanded`` that were derived from it.

    Raises
    ------
    KeyError
        If an expanded column cannot be attributed to any source column.
    """
    expanded = pd.get_dummies(data_frame)
    groups = {col: [] for col in data_frame}
    for idx, col in enumerate(expanded):
        if col in groups:
            # Column passed through get_dummies unchanged.
            owner = col
        else:
            # Dummy columns are named "<source>_<value>". Source names may
            # themselves contain underscores, so splitting on the first "_"
            # is not reliable; match against the known source names instead
            # and prefer the longest one when several are prefixes.
            col_name = str(col)
            candidates = [src for src in groups if col_name.startswith(f"{src}_")]
            if not candidates:
                raise KeyError(
                    f"cannot attribute expanded column {col!r} to a source column"
                )
            owner = max(candidates, key=lambda src: len(str(src)))
        groups[owner].append(idx)

    # Explicit integer dtype so that an empty group is still an index array.
    groups = {k: np.array(v, dtype=int) for k, v in groups.items()}
    return expanded, groups

In [19]:
# The file uses ';' as its field separator.
kid_data = pd.read_csv("data.csv", sep=";")

In [43]:
# Columns that are cast to the category dtype so that the one-hot expansion
# below treats them as categorical rather than numeric.
CATEGORICAL_COLUMNS = [
    "MaritalStatus",
    "ApplicationMode",
    "ApplicationOrder",
    "TimeOfDay",
    "PreviousQualification",
    "Nationality",
    "MotherQualification",
    "FatherQualification",
    "MotherOccupation",
    "FatherOccupation",
]
for column_name in CATEGORICAL_COLUMNS:
    mark_as_categorical(kid_data, column_name)

# Binary label: 0 for 'Dropout', 1 for 'Graduate' or 'Enrolled'.
# Note there is deliberately no trailing comma after the expression: a
# trailing comma would make `target` a one-element tuple, not a Series.
target = (
    kid_data["Target"]
    .replace(['Dropout', 'Graduate', 'Enrolled'], [0, 1, 1])
    .astype(float)
)

# Scale every float64 column by its maximum. Columns whose maximum is 0 are
# left untouched, since dividing by zero would fill them with NaN/inf.
for col in kid_data.select_dtypes(include="float64"):
    col_max = kid_data[col].max()
    if col_max != 0:
        kid_data[col] /= col_max

expanded_X, group_idxs = create_groups_from_1hot(kid_data.drop(columns="Target"))
list(group_idxs.values())

[array([26, 27, 28, 29, 30, 31]),
 array([32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
        49]),
 array([50, 51, 52, 53, 54, 55, 56, 57]),
 array([0]),
 array([58, 59]),
 array([60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]),
 array([1]),
 array([77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93,
        94, 95, 96, 97]),
 array([ 98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
        111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
        124, 125, 126]),
 array([127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,
        153, 154, 155, 156, 157, 158, 159, 160]),
 array([161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173,
        174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
        187, 188, 189, 190, 191, 192]),
 array([193, 194, 195, 196, 197, 198, 199, 200, 201, 202,

In [6]:
# Count how many rows fall under each Target label.
kid_data.get("Target").value_counts()

Graduate    2209
Dropout     1421
Enrolled     794
Name: Target, dtype: int64

In [7]:
# List the columns of kid_data that currently have the category dtype.
get_categories(kid_data)

['MaritalStatus',
 'ApplicationMode',
 'ApplicationOrder',
 'TimeOfDay',
 'PreviousQualification',
 'Nationality',
 'MotherQualification',
 'FatherQualification',
 'MotherOccupation',
 'FatherOccupation']

In [47]:
from groupyr import LogisticSGLCV

# Dense float64 design matrix built from the one-hot expanded frame.
X = expanded_X.to_numpy(np.float64)

# Binary label vector: 0 for 'Dropout', 1 for 'Graduate' or 'Enrolled'.
y = (
    kid_data['Target']
    .replace(['Dropout', 'Graduate', 'Enrolled'], [0, 1, 1])
    .to_numpy(np.float64)
)

print(X.shape, y.shape, sep="\n")

# Estimator settings gathered in one place so they are easy to tweak.
# NOTE(review): l1_ratio=1 presumably selects the pure-lasso end of the
# sparse group lasso penalty, in which case `groups` would not affect the
# fit — confirm against the groupyr documentation.
sgl_settings = dict(
    groups=list(group_idxs.values()),
    l1_ratio=1,
    alpha=0.0001,
    verbose=2,
    suppress_solver_warnings=False,
    max_iter=10000,
)
model = LogisticSGL(**sgl_settings).fit(X, y)



(4424, 239)
(4424,)


In [48]:
# Score the fitted model on the same X, y it was trained on, so this is an
# in-sample figure rather than a held-out estimate.
model.score(X, y)

0.8899186256781193

In [33]:
# Share of samples labelled 1 in y.
y.mean()

0.6787974683544303

In [46]:
from sklearn.linear_model import LogisticRegression
# Unpenalised logistic regression on the same X, y, scored in-sample.
# NOTE(review): newer scikit-learn releases may expect penalty=None instead
# of the string 'none' — confirm against the installed version.
model = LogisticRegression(penalty='none', max_iter=10000)
model.fit(X, y)
model.score(X, y)

0.8928571428571429