# Prepare data

In [1]:
# Directory settings for this notebook.
# NOTE(review): input_dir is not referenced by any cell visible here — confirm it is still needed.
input_dir = "./data"
# Base directory against which the raw student CSV path is resolved when the data is loaded.
raw_input_dir = "."

In [2]:
import pandas as pd
import numpy as np
import os

# Read the raw student table and discard the G1 and G2 columns in one step,
# so that only G3 of the three G* columns remains in df_raw.
students_csv_path = os.path.join(raw_input_dir, "student-performance-data/student_data.csv")
df_raw = pd.read_csv(students_csv_path).drop(columns=["G1", "G2"])

# Raise the column-display limit just for this preview so no column is elided.
with pd.option_context("display.max_columns", 400):
    display(df_raw.head())

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,10


In [3]:
from fca_utils import common_prepare

# Column roles handed to common_prepare. G3 is the prediction target.
students_target_variable = "G3"
students_int_cols = [
    "age", "Medu", "Fedu", "traveltime", "studytime", "failures", "famrel",
    "freetime", "goout", "Dalc", "Walc", "health", "absences",
]

# String columns that are treated as booleans instead of categoricals
# (each has a 1/0 mapping in the substitution table below).
move_cols_to_bool = ["sex", "address", "school", "famsize", "Pstatus"]

# yes/no columns first, followed by the columns moved over from the categorical set.
students_bool_cols = [
    "schoolsup", "famsup", "paid", "activities",
    "nursery", "higher", "internet", "romantic",
] + move_cols_to_bool

# Whatever is left of the categorical candidates once the moved columns are taken out.
students_cat_cols = [
    name
    for name in ["school", "sex", "address", "famsize", "Pstatus", "Mjob", "Fjob", "reason", "guardian"]
    if name not in move_cols_to_bool
]

# Raw string value -> 1/0 code for every boolean column.
students_bool_subs = {
    'F': 1, 'M': 0,
    'yes': 1, 'no': 0,
    'GT3': 1, 'LE3': 0,
    'GP': 1, 'MS': 0,
    'U': 1, 'R': 0,
    'A': 1, 'T': 0,
}

# NOTE(review): the exact transformations are defined in fca_utils.common_prepare,
# which is not visible here — confirm how the G3 target is encoded.
df = common_prepare(
    df_raw,
    bool_columns=students_bool_cols,
    categorical_cols=students_cat_cols,
    ordinal_columns=students_int_cols,
    y_target=students_target_variable,
    bool_subs=students_bool_subs,
)

df.head()

Unnamed: 0,G3,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,...,activities,nursery,higher,internet,romantic,sex,address,school,famsize,Pstatus
0,0,18,4,4,2,2,0,4,3,4,...,0,1,1,0,0,1,1,1,1,1
1,0,17,1,1,1,2,0,5,3,3,...,0,0,1,1,0,1,1,1,1,0
2,0,15,1,1,1,2,3,4,3,2,...,0,1,1,1,0,1,1,1,0,0
3,1,15,4,2,1,3,0,3,2,2,...,1,1,1,1,1,1,1,1,1,0
4,0,16,3,3,1,2,0,4,3,2,...,0,1,1,0,0,1,1,1,1,0


In [4]:
# Print the dtypes and non-null counts of the prepared frame as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 31 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   G3          395 non-null    int64 
 1   age         395 non-null    int64 
 2   Medu        395 non-null    int64 
 3   Fedu        395 non-null    int64 
 4   traveltime  395 non-null    int64 
 5   studytime   395 non-null    int64 
 6   failures    395 non-null    int64 
 7   famrel      395 non-null    int64 
 8   freetime    395 non-null    int64 
 9   goout       395 non-null    int64 
 10  Dalc        395 non-null    int64 
 11  Walc        395 non-null    int64 
 12  health      395 non-null    int64 
 13  absences    395 non-null    int64 
 14  Mjob        395 non-null    object
 15  Fjob        395 non-null    object
 16  reason      395 non-null    object
 17  guardian    395 non-null    object
 18  schoolsup   395 non-null    int64 
 19  famsup      395 non-null    int64 
 20  paid      

# Binary FCA

In [5]:
from itertools import product
import fcalc

In [6]:
from fca_utils import build_configs

# Search space for the binarized classifier: the categorical columns are always
# one-hot encoded, nothing is factorized, and the integer columns are either left
# as they are ({}) or quantile-cut into 2, 4 or 10 bins.
binary_qcut_options = [{}] + [
    dict.fromkeys(students_int_cols, n_bins) for n_bins in (2, 4, 10)
]

param_grid = dict(
    onehot_cols=[students_cat_cols],
    factorize_cols=[[]],
    qcut_cols=binary_qcut_options,
    binarize_all=[True],
)

configs = build_configs(param_grid)

# Classification methods evaluated for each config.
methods = ['standard', 'standard-support', 'ratio-support']

In [7]:
from fca_utils import grid_search

# Run the grid search for the binarized classifier over the configs and methods
# built above (presumably scored with 5-fold cross-validation — confirm in fca_utils).
binary_search_settings = {
    "y_target": students_target_variable,
    "n_folds": 5,
    "methods": methods,
    "configs": configs,
}

best_binary_config, best_binary_intersecs, best_binary_score = grid_search(
    df,
    fcalc.classifier.BinarizedBinaryClassifier,
    **binary_search_settings,
)

Fitting 12 configurations
1
classifier=<class 'fcalc.classifier.BinarizedBinaryClassifier'>
config={'onehot_cols': ['Mjob', 'Fjob', 'reason', 'guardian'], 'factorize_cols': [], 'qcut_cols': {}, 'binarize_all': True}
method='standard'
f1_macro (mean)=0.558628587781754
f1_cls_1=[0.4827586206896552, 0.36363636363636365, 0.3529411764705882, 0.3939393939393939, 0.5]
f1_cls_0=[0.7, 0.7543859649122807, 0.6915887850467289, 0.5652173913043479, 0.7818181818181819]
f1_macro=[0.5913793103448276, 0.5590111642743222, 0.5222649807586586, 0.4795783926218709, 0.6409090909090909]

2
classifier=<class 'fcalc.classifier.BinarizedBinaryClassifier'>
config={'onehot_cols': ['Mjob', 'Fjob', 'reason', 'guardian'], 'factorize_cols': [], 'qcut_cols': {'age': 2, 'Medu': 2, 'Fedu': 2, 'traveltime': 2, 'studytime': 2, 'failures': 2, 'famrel': 2, 'freetime': 2, 'goout': 2, 'Dalc': 2, 'Walc': 2, 'health': 2, 'absences': 2}, 'binarize_all': True}
method='standard'
f1_macro (mean)=0.5173390354715953
f1_cls_1=[0.4666666

In [8]:
from collections import Counter

# Report the winning binary configuration, the ten most frequent intersections
# from each of the two counters returned by the search, and the best score.
print("Best config for binary")
print(best_binary_config)
print()

binary_sections = (
    ("Most important intersections (positive):", 0),
    ("Most important intersections (negative):", 1),
)
for heading, counter_index in binary_sections:
    print(heading)
    # One entry per line; print() applies str() to each element.
    print(*best_binary_intersecs[counter_index].most_common(10), sep="\n")
    print()

print("Best f1_macro score for binary")
print(best_binary_score)

Best config for binary
('standard-support', {'onehot_cols': ['Mjob', 'Fjob', 'reason', 'guardian'], 'factorize_cols': [], 'qcut_cols': {'age': 2, 'Medu': 2, 'Fedu': 2, 'traveltime': 2, 'studytime': 2, 'failures': 2, 'famrel': 2, 'freetime': 2, 'goout': 2, 'Dalc': 2, 'Walc': 2, 'health': 2, 'absences': 2}, 'binarize_all': True})

Most important intersections (positive):
(('nursery', 'higher', 'internet', 'address', 'school', 'famsize', 'Fedu'), 63)
(('nursery', 'higher', 'internet', 'address', 'school', 'famsize', 'Mjob_other', 'Fjob_other'), 42)
(('nursery', 'higher', 'internet', 'address', 'school', 'famsize'), 37)
(('nursery', 'higher', 'internet', 'address', 'school', 'famsize', 'Fedu', 'Fjob_other'), 34)
(('nursery', 'higher', 'internet', 'address', 'school', 'guardian_mother'), 25)
(('nursery', 'higher', 'internet', 'address', 'school'), 24)
(('nursery', 'higher', 'internet', 'address', 'school', 'Fedu'), 22)
(('nursery', 'higher', 'internet', 'sex', 'address', 'school', 'famsize'

# Pattern FCA

In [9]:
# Search space for the pattern classifier: categorical columns are tried both
# untouched ([]) and one-hot encoded, nothing is factorized, and the integer
# columns are either left as they are ({}) or quantile-cut into 2 or 4 bins.
# binarize_all is off for this classifier.
pattern_qcut_options = [{}] + [
    dict.fromkeys(students_int_cols, n_bins) for n_bins in (2, 4)
]

param_grid = dict(
    onehot_cols=[[], students_cat_cols],
    factorize_cols=[[]],
    qcut_cols=pattern_qcut_options,
    binarize_all=[False],
)

configs = build_configs(param_grid)

# Same three classification methods as in the binary search.
methods = ['standard', 'standard-support', 'ratio-support']

In [10]:
# Run the grid search for the pattern classifier; the remaining categorical
# columns are additionally passed through the `categorical` argument.
# NOTE(review): `best_pattern_intersectinos` is misspelled, but the name is kept
# because the reporting cell below reads it — rename both together.
pattern_search_settings = {
    "y_target": students_target_variable,
    "n_folds": 5,
    "methods": methods,
    "configs": configs,
    "categorical": students_cat_cols,
}

best_pattern_config, best_pattern_intersectinos, best_pattern_score = grid_search(
    df,
    fcalc.classifier.PatternBinaryClassifier,
    **pattern_search_settings,
)

Fitting 18 configurations
1
classifier=<class 'fcalc.classifier.PatternBinaryClassifier'>
config={'onehot_cols': [], 'factorize_cols': [], 'qcut_cols': {}, 'binarize_all': False}
method='standard'
f1_macro (mean)=0.5682156302363995
f1_cls_1=[0.4761904761904762, 0.456140350877193, 0.5, 0.40625000000000006, 0.4528301886792452]
f1_cls_0=[0.6526315789473683, 0.693069306930693, 0.7254901960784315, 0.5957446808510639, 0.7238095238095239]
f1_macro=[0.5644110275689223, 0.574604828903943, 0.6127450980392157, 0.500997340425532, 0.5883198562443845]

2
classifier=<class 'fcalc.classifier.PatternBinaryClassifier'>
config={'onehot_cols': [], 'factorize_cols': [], 'qcut_cols': {'age': 2, 'Medu': 2, 'Fedu': 2, 'traveltime': 2, 'studytime': 2, 'failures': 2, 'famrel': 2, 'freetime': 2, 'goout': 2, 'Dalc': 2, 'Walc': 2, 'health': 2, 'absences': 2}, 'binarize_all': False}
method='standard'
f1_macro (mean)=0.6242951347367678
f1_cls_1=[0.5625000000000001, 0.5172413793103449, 0.47058823529411764, 0.55384615

In [11]:
# Report the winning pattern-classifier configuration, the ten most frequent
# intersections from each of the two counters returned by the search, and the
# best score. The labels say "pattern" (they previously said "binary", a
# copy-paste leftover from the binary report that mislabelled these results).
# `best_pattern_intersectinos` (sic) is the name bound by the grid-search cell above.
print("Best config for pattern")
print(best_pattern_config)
print()
print("Most important intersections (positive):")
print("\n".join(map(str, best_pattern_intersectinos[0].most_common(10))))
print()
print("Most important intersections (negative):")
print("\n".join(map(str, best_pattern_intersectinos[1].most_common(10))))
print()
print("Best f1_macro score for pattern")
print(best_pattern_score)

Best config for binary
('standard', {'onehot_cols': [], 'factorize_cols': [], 'qcut_cols': {'age': 2, 'Medu': 2, 'Fedu': 2, 'traveltime': 2, 'studytime': 2, 'failures': 2, 'famrel': 2, 'freetime': 2, 'goout': 2, 'Dalc': 2, 'Walc': 2, 'health': 2, 'absences': 2}, 'binarize_all': False})

Most important intersections (positive):
(('schoolsup', 'nursery', 'higher', 'internet', 'romantic', 'sex', 'address', 'school', 'Pstatus', 'age', 'Fedu', 'studytime', 'famrel', 'goout', 'Walc', 'Fjob'), 4)
(('schoolsup', 'famsup', 'paid', 'nursery', 'higher', 'internet', 'sex', 'address', 'school', 'famsize', 'age', 'famrel', 'goout', 'Walc', 'absences'), 4)
(('schoolsup', 'famsup', 'paid', 'higher', 'internet', 'sex', 'address', 'school', 'Pstatus', 'age', 'Medu', 'Fedu', 'studytime', 'goout', 'health'), 4)
(('schoolsup', 'paid', 'nursery', 'higher', 'internet', 'romantic', 'sex', 'address', 'school', 'famsize', 'Pstatus', 'age', 'Fedu', 'studytime', 'freetime', 'goout', 'Walc', 'absences'), 4)
(('sch