In [1]:
import pandas as pd
import os
from src.pre_processing.macros import column_groups, DATA_PREPROC_PATH, DATA_SPLIT_PATH, ORIGINAL_DATASET_NAME
import numpy as np
from typing import List
from src.pre_processing.utils import aggregate_features, features_to_drop_after_aggregation

## Utils

In [2]:
def check_corr(df, col, thr=0.6):
    corr = df[df.columns[1:]].corr()[col][:-1]
    idxs = np.where((corr.values.flatten() >= thr) | (corr.values.flatten() < -thr))[0]
    if len(idxs) > 0:
        if bool(set(column_groups).intersection(set(idxs.flatten()))): 
            print(f"Column {col} exhibits non ignorable correlation with at least one of the scores (thr = {thr}). Don't drop it!")
        else: 
            print("Ok")
    else: 
            print("Ok")

In [3]:
def print_statistics(df, columns):
    print("MISSING VALUES")
    # print(df[columns].isna().sum())
    # print()
    # print("Percentage")
    print(df[columns].isna().sum() / df.shape[0] * 100)
    # print()
    # print("DISTINCT VALUES")
    # print(df[columns].value_counts())
    # print()

    corr = df[
        columns
        + [
            "score_MAT",
            "level_MAT",
            "score_LEN",
            "level_LEN",
            "score_ING",
            "level_ING",
        ]
    ].corr()
    
    return corr

def sorted_correlations(corr, end):
    res = list(corr[(corr != 1.0) & (corr >= 0.55)].loc[:end, :end].fillna(0).to_numpy().flatten())
    res = [x for x in res if x != 0]
    return sorted(list(set(res)), reverse=True)

def compare_cols(corr, col1, col2):
    res = (corr.loc[col1, "score_MAT":] - corr.loc[col2, "score_MAT":]).sum()
    if res >= 0:
        print(f"Drop {col2}")
    else:
        print(f"Drop {col1}")

## Load Data

In [4]:
df_teacher_questionnaire = pd.read_csv(os.path.join(DATA_SPLIT_PATH, "teacher_questionnaire.csv"), low_memory=False)
df_teacher_questionnaire = df_teacher_questionnaire.set_index("id_student")
df_teacher_questionnaire

Unnamed: 0_level_0,p2,p2n,p3n,p4n,p5,p6n,p7an,p7bn,p7cn,p7dn,...,p331a,p331b,p331c,p331d,p331e,p331f,p331g,p331j,pfc,rep
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,2.0,56.0,29.0,18.0,2.0,26.0,,,1.0,,...,2.0,2.0,3.0,2.0,3.0,3.0,,,,
5,2.0,44.0,20.0,17.0,2.0,26.0,0.0,0.0,0.0,0.0,...,4.0,,4.0,4.0,4.0,3.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,2.0,34.0,1.0,1.0,,,3.0,2.0,1.0,1.0,...,4.0,4.0,4.0,4.0,4.0,,5.0,3.0,10.0,2.0
83854,1.0,54.0,30.0,30.0,,,0.0,0.0,0.0,0.0,...,3.0,4.0,4.0,4.0,4.0,,5.0,4.0,3.0,2.0
83855,,,,,,,,,,,...,,,,,,,,,,
83856,2.0,56.0,22.0,4.0,,,1.0,1.0,1.0,0.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,8.0,1.0


In [5]:
df_student_scores = pd.read_csv(os.path.join(DATA_SPLIT_PATH, "student_scores.csv"), low_memory=False)
df_student_scores = df_student_scores.set_index("id_student")
df_student_scores

Unnamed: 0_level_0,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,564.8700,3.0,535.1500,3.0,,
2,388.3400,1.0,293.7000,1.0,,
3,386.5900,1.0,514.8100,3.0,,
4,487.7600,2.0,449.2500,2.0,,
5,709.7900,4.0,598.7200,3.0,,
...,...,...,...,...,...,...
83853,400.8625,2.0,446.6522,2.0,294.7474,1.0
83854,597.0243,3.0,632.6043,4.0,633.2966,4.0
83855,707.9254,4.0,400.2761,2.0,477.5056,2.0
83856,522.8511,3.0,656.1601,4.0,540.1122,3.0


In [6]:
df = pd.concat([df_teacher_questionnaire, df_student_scores], axis=1)
df

Unnamed: 0_level_0,p2,p2n,p3n,p4n,p5,p6n,p7an,p7bn,p7cn,p7dn,...,p331g,p331j,pfc,rep,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,564.8700,3.0,535.1500,3.0,,
2,,,,,,,,,,,...,,,,,388.3400,1.0,293.7000,1.0,,
3,,,,,,,,,,,...,,,,,386.5900,1.0,514.8100,3.0,,
4,2.0,56.0,29.0,18.0,2.0,26.0,,,1.0,,...,,,,,487.7600,2.0,449.2500,2.0,,
5,2.0,44.0,20.0,17.0,2.0,26.0,0.0,0.0,0.0,0.0,...,,,,,709.7900,4.0,598.7200,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,2.0,34.0,1.0,1.0,,,3.0,2.0,1.0,1.0,...,5.0,3.0,10.0,2.0,400.8625,2.0,446.6522,2.0,294.7474,1.0
83854,1.0,54.0,30.0,30.0,,,0.0,0.0,0.0,0.0,...,5.0,4.0,3.0,2.0,597.0243,3.0,632.6043,4.0,633.2966,4.0
83855,,,,,,,,,,,...,,,,,707.9254,4.0,400.2761,2.0,477.5056,2.0
83856,2.0,56.0,22.0,4.0,,,1.0,1.0,1.0,0.0,...,3.0,3.0,8.0,1.0,522.8511,3.0,656.1601,4.0,540.1122,3.0


## Correlations with missingness patterns

In [3]:
m = df_teacher_questionnaire.isna()
m.shape

(83857, 158)

In [4]:
m_cols = [col + ".1" for col in m.columns]
m_cols

['p2.1',
 'p2n.1',
 'p3n.1',
 'p4n.1',
 'p5.1',
 'p6n.1',
 'p7an.1',
 'p7bn.1',
 'p7cn.1',
 'p7dn.1',
 'p7en.1',
 'p7fn.1',
 'p7gn.1',
 'p8an.1',
 'p8bn.1',
 'p9a.1',
 'p9b.1',
 'p9c.1',
 'p9d.1',
 'p9e.1',
 'p9f.1',
 'p10n.1',
 'p11.1',
 'p12a.1',
 'p12b.1',
 'p12c.1',
 'p12d.1',
 'p13.1',
 'p13b.1',
 'p13c.1',
 'p15a.1',
 'p15b.1',
 'p15c.1',
 'p15d.1',
 'p15e.1',
 'p15f.1',
 'p15g.1',
 'p15h.1',
 'p15i.1',
 'p16a.1',
 'p16b.1',
 'p16c.1',
 'p16d.1',
 'p16e.1',
 'p16f.1',
 'p16g.1',
 'p16h.1',
 'p18a.1',
 'p18b.1',
 'p18c.1',
 'p18d.1',
 'p18e.1',
 'p18f.1',
 'p18g.1',
 'p18h.1',
 'p18i.1',
 'p19.1',
 'p20.1',
 'p21a.1',
 'p21b.1',
 'p21c.1',
 'p21d.1',
 'p21e.1',
 'p21f.1',
 'p22a.1',
 'p22b.1',
 'p22c.1',
 'p22d.1',
 'p22e.1',
 'p22f.1',
 'p22g.1',
 'p23a.1',
 'p23b.1',
 'p23c.1',
 'p23d.1',
 'p23e.1',
 'p23f.1',
 'p23g.1',
 'p23h.1',
 'p23i.1',
 'p24a.1',
 'p24b.1',
 'p24c.1',
 'p24d.1',
 'p24e.1',
 'p24f.1',
 'p24g.1',
 'p24h.1',
 'p24i.1',
 'p24j.1',
 'p24k.1',
 'p25.1',
 'p26.1

In [5]:
new_cols_mapper = {}
for col, new_col in zip(m.columns, m_cols):
    new_cols_mapper[col] = new_col
new_cols_mapper

{'p2': 'p2.1',
 'p2n': 'p2n.1',
 'p3n': 'p3n.1',
 'p4n': 'p4n.1',
 'p5': 'p5.1',
 'p6n': 'p6n.1',
 'p7an': 'p7an.1',
 'p7bn': 'p7bn.1',
 'p7cn': 'p7cn.1',
 'p7dn': 'p7dn.1',
 'p7en': 'p7en.1',
 'p7fn': 'p7fn.1',
 'p7gn': 'p7gn.1',
 'p8an': 'p8an.1',
 'p8bn': 'p8bn.1',
 'p9a': 'p9a.1',
 'p9b': 'p9b.1',
 'p9c': 'p9c.1',
 'p9d': 'p9d.1',
 'p9e': 'p9e.1',
 'p9f': 'p9f.1',
 'p10n': 'p10n.1',
 'p11': 'p11.1',
 'p12a': 'p12a.1',
 'p12b': 'p12b.1',
 'p12c': 'p12c.1',
 'p12d': 'p12d.1',
 'p13': 'p13.1',
 'p13b': 'p13b.1',
 'p13c': 'p13c.1',
 'p15a': 'p15a.1',
 'p15b': 'p15b.1',
 'p15c': 'p15c.1',
 'p15d': 'p15d.1',
 'p15e': 'p15e.1',
 'p15f': 'p15f.1',
 'p15g': 'p15g.1',
 'p15h': 'p15h.1',
 'p15i': 'p15i.1',
 'p16a': 'p16a.1',
 'p16b': 'p16b.1',
 'p16c': 'p16c.1',
 'p16d': 'p16d.1',
 'p16e': 'p16e.1',
 'p16f': 'p16f.1',
 'p16g': 'p16g.1',
 'p16h': 'p16h.1',
 'p18a': 'p18a.1',
 'p18b': 'p18b.1',
 'p18c': 'p18c.1',
 'p18d': 'p18d.1',
 'p18e': 'p18e.1',
 'p18f': 'p18f.1',
 'p18g': 'p18g.1',
 'p18h

In [6]:
m.rename(new_cols_mapper, axis=1)

Unnamed: 0_level_0,p2.1,p2n.1,p3n.1,p4n.1,p5.1,p6n.1,p7an.1,p7bn.1,p7cn.1,p7dn.1,...,p331a.1,p331b.1,p331c.1,p331d.1,p331e.1,p331f.1,p331g.1,p331j.1,pfc.1,rep.1
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,False,False,False,False,False,False,True,True,False,True,...,False,False,False,False,False,False,True,True,True,True
5,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
83854,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
83855,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
83856,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
combined_df = pd.concat([df_teacher_questionnaire, m.rename(new_cols_mapper)], axis=1)
combined_df.shape

(83857, 316)

In [16]:
combined_df.columns

Index(['p2', 'p2n', 'p3n', 'p4n', 'p5', 'p6n', 'p7an', 'p7bn', 'p7cn', 'p7dn',
       ...
       'p331a', 'p331b', 'p331c', 'p331d', 'p331e', 'p331f', 'p331g', 'p331j',
       'pfc', 'rep'],
      dtype='object', length=316)

In [17]:
combined_df["p2.1"]

KeyError: 'p2.1'

In [9]:
corr_matrix = combined_df.corr()

In [11]:
corr_matrix.columns

Index(['p2', 'p2n', 'p3n', 'p4n', 'p5', 'p6n', 'p7an', 'p7bn', 'p7cn', 'p7dn',
       ...
       'p331a', 'p331b', 'p331c', 'p331d', 'p331e', 'p331f', 'p331g', 'p331j',
       'pfc', 'rep'],
      dtype='object', length=316)

In [12]:
corr_matrix

Unnamed: 0,p2,p2n,p3n,p4n,p5,p6n,p7an,p7bn,p7cn,p7dn,...,p331a,p331b,p331c,p331d,p331e,p331f,p331g,p331j,pfc,rep
p2,1.000000,0.080206,0.099729,0.044715,-0.012962,-0.031842,0.049481,0.068592,0.029590,0.105826,...,-0.015190,0.024595,0.010756,-0.008424,-0.009150,0.029861,0.054787,0.055823,0.056290,0.054915
p2n,0.080206,1.000000,0.815024,0.417862,-0.134363,-0.086278,0.094769,0.034282,0.021752,0.036072,...,0.015678,0.047339,0.015023,0.004472,0.016851,-0.043066,-0.007001,0.001116,0.002104,0.001780
p3n,0.099729,0.815024,1.000000,0.622125,-0.165391,-0.009889,0.036737,0.041245,0.004570,0.012421,...,0.007595,0.058812,0.009750,0.008944,0.010597,-0.034399,0.059582,0.070256,0.069909,0.069781
p4n,0.044715,0.417862,0.622125,1.000000,-0.186445,0.165511,-0.108491,0.026666,-0.043208,-0.074481,...,-0.021543,-0.023479,-0.013877,0.008674,-0.008126,0.047295,-0.003227,-0.019077,-0.019297,-0.018628
p5,-0.012962,-0.134363,-0.165391,-0.186445,1.000000,-0.041279,-0.029061,-0.044885,-0.055919,-0.072837,...,-0.000463,-0.010201,0.008930,-0.012395,0.009255,0.024638,-0.018832,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
p331f,0.029861,-0.043066,-0.034399,0.047295,0.024638,0.006527,-0.074389,-0.049738,-0.018033,-0.083541,...,0.799686,0.611871,0.772125,0.789863,0.811725,1.000000,0.283106,0.243458,0.242648,0.236270
p331g,0.054787,-0.007001,0.059582,-0.003227,-0.018832,-0.048618,-0.004612,0.134852,0.014881,0.167259,...,0.377822,0.297191,0.375761,0.378185,0.403497,0.283106,1.000000,0.840905,0.839842,0.833642
p331j,0.055823,0.001116,0.070256,-0.019077,,,-0.034078,0.106929,-0.042965,0.106975,...,0.455924,0.270250,0.440640,0.446665,0.455181,0.243458,0.840905,1.000000,0.998837,0.992644
pfc,0.056290,0.002104,0.069909,-0.019297,,,-0.034502,0.107535,-0.042584,0.106555,...,0.455146,0.269241,0.439692,0.445718,0.454403,0.242648,0.839842,0.998837,1.000000,0.992425


In [10]:
corr_matrix.loc[:, m_cols]

KeyError: "None of [Index(['p2.1', 'p2n.1', 'p3n.1', 'p4n.1', 'p5.1', 'p6n.1', 'p7an.1', 'p7bn.1',\n       'p7cn.1', 'p7dn.1',\n       ...\n       'p331a.1', 'p331b.1', 'p331c.1', 'p331d.1', 'p331e.1', 'p331f.1',\n       'p331g.1', 'p331j.1', 'pfc.1', 'rep.1'],\n      dtype='object', length=158)] are in the [columns]"

## Drop columns with too many missing values

In [74]:
cols_to_check = ["p7fn", "p27a", "p16h", "p19", "p23i", "p32e", "p41d", "p41e", "p41f", "p41j", "p299d", "p331j"]
for col in cols_to_check:
    check_corr(df, col)


Ok
Ok
Ok
Ok
Ok
Ok
Ok
Ok
Ok
Ok
Ok
Ok


In [72]:
class_problems = ["p26a",
                      "p26b",
                      "p26c",
                      "p26d"]

pfc_topics = ["p15a",
              "p15b",
              "p15c",
              "p15d",
              "p15e",
              "p15f",
              "p15g",
              "p15h",
              "p15i"]

cols_to_drop = ["p27a", "p16h", "p19", "p23i", "p32e", "p41d", "p41e", "p41f", "p41j", "p299d", "p331j"]
cols_to_drop = [x for x in cols_to_drop if x not in pfc_topics and x not in class_problems]
cols_to_drop

['p27a',
 'p16h',
 'p19',
 'p23i',
 'p32e',
 'p41d',
 'p41e',
 'p41f',
 'p41j',
 'p299d',
 'p331j']

## Drop redundant features with too many missing values

In [20]:
# drop redundant features that have the most NaNs
to_drop = "p5" if df["p5"].isna().sum() > df["rep"].isna().sum() else "rep"
df = df.drop(to_drop, axis=1, inplace=False)

In [21]:
pfc_topics = ["p15a",
              "p15b",
              "p15c",
              "p15d",
              "p15e",
              "p15f",
              "p15g",
              "p15h",
              "p15i"]

df = df.drop(pfc_topics, axis=1, inplace=False)

In [37]:
behaviour_problems = ["p26a",
                      "p26b",
                      "p26c",
                      "p26d"]
df[behaviour_problems].isna().sum()

p26a    56699
p26b    56699
p26c    56535
p26d    56522
dtype: int64

## Functional dependencies

### Special attention students (mean)

In [45]:
column_number = 7
column_letters = ["a", "b", "c", "d", "e", "f", "g"]
current_columns = [f"p{column_number}{letter}n" for letter in column_letters]

In [46]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p7an    50.712523
p7bn    53.728371
p7cn    60.964499
p7dn    58.525824
p7en    60.169097
p7fn    83.960790
p7gn    71.428742
dtype: float64


Unnamed: 0,p7an,p7bn,p7cn,p7dn,p7en,p7fn,p7gn,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p7an,1.0,0.265371,0.233965,0.285499,0.092062,0.323359,0.044606,-0.061899,-0.062408,-0.050746,-0.046957,-0.098805,-0.093426
p7bn,0.265371,1.0,0.139539,0.124721,0.109343,0.286822,0.000812,-0.018263,-0.017476,-0.028055,-0.025097,-0.01681,-0.019146
p7cn,0.233965,0.139539,1.0,0.216822,0.066518,0.348394,0.030122,-0.044203,-0.042038,-0.026071,-0.023384,-0.044493,-0.042863
p7dn,0.285499,0.124721,0.216822,1.0,0.034825,0.472999,0.046741,-0.059372,-0.058593,-0.046685,-0.047472,-0.105293,-0.107111
p7en,0.092062,0.109343,0.066518,0.034825,1.0,0.123361,0.041487,0.033366,0.031415,0.028859,0.028307,0.094865,0.086725
p7fn,0.323359,0.286822,0.348394,0.472999,0.123361,1.0,,-0.153473,-0.142265,-0.129055,-0.121591,-0.108457,-0.107698
p7gn,0.044606,0.000812,0.030122,0.046741,0.041487,,1.0,-0.006651,-0.005076,-0.008939,-0.004884,-0.007685,-0.006367
score_MAT,-0.061899,-0.018263,-0.044203,-0.059372,0.033366,-0.153473,-0.006651,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,-0.062408,-0.017476,-0.042038,-0.058593,0.031415,-0.142265,-0.005076,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,-0.050746,-0.028055,-0.026071,-0.046685,0.028859,-0.129055,-0.008939,0.478365,0.456791,1.0,0.94985,0.503339,0.482328


### Class behaviour (mean)

In [47]:
column_number = 12
column_letters = ["a", "b", "c", "d"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]

In [48]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p12a    35.710793
p12b    36.508580
p12c    35.840777
p12d    35.887284
dtype: float64


Unnamed: 0,p12a,p12b,p12c,p12d,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p12a,1.0,0.574407,0.587709,0.722659,0.121749,0.114547,0.13,0.12381,0.149163,0.14061
p12b,0.574407,1.0,0.480083,0.545989,0.11697,0.112115,0.122638,0.115889,0.137239,0.130149
p12c,0.587709,0.480083,1.0,0.649771,0.109657,0.103623,0.119646,0.112097,0.14995,0.139375
p12d,0.722659,0.545989,0.649771,1.0,0.128902,0.121353,0.133888,0.126598,0.145097,0.13572
score_MAT,0.121749,0.11697,0.109657,0.128902,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.114547,0.112115,0.103623,0.121353,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.13,0.122638,0.119646,0.133888,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.12381,0.115889,0.112097,0.126598,0.458407,0.439989,0.94985,1.0,0.481747,0.46406
score_ING,0.149163,0.137239,0.14995,0.145097,0.457797,0.435458,0.503339,0.481747,1.0,0.955457
level_ING,0.14061,0.130149,0.139375,0.13572,0.435502,0.416475,0.482328,0.46406,0.955457,1.0


In [49]:
compare_cols(corr, "p12a", "p12d")
# drop p12a

Drop p12a


In [50]:
compare_cols(corr, "p12d", "p12c")
# drop p12c

Drop p12c


### Results satisfaction (good/bad mix)

In [51]:
current_columns = ["p13", "p13b", "p13c"]
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p13     35.795461
p13b    67.423113
p13c    67.474391
dtype: float64


Unnamed: 0,p13,p13b,p13c,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p13,1.0,0.680966,-0.14258,0.114637,0.109428,0.116338,0.109805,0.145915,0.132385
p13b,0.680966,1.0,-0.108426,0.108629,0.105915,0.076596,0.075413,0.134815,0.124172
p13c,-0.14258,-0.108426,1.0,-0.025326,-0.02399,-0.028698,-0.026093,-0.054648,-0.047941
score_MAT,0.114637,0.108629,-0.025326,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.109428,0.105915,-0.02399,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.116338,0.076596,-0.028698,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.109805,0.075413,-0.026093,0.458407,0.439989,0.94985,1.0,0.481747,0.46406
score_ING,0.145915,0.134815,-0.054648,0.457797,0.435458,0.503339,0.481747,1.0,0.955457
level_ING,0.132385,0.124172,-0.047941,0.435502,0.416475,0.482328,0.46406,0.955457,1.0


In [52]:
compare_cols(corr, "p13", "p13b")
# drop p13b

Drop p13b


### PFC incidence (mean)

In [53]:
column_number = 16
column_letters = ["a", "b", "c", "d", "e", "f", "g"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]

In [54]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p16a    43.939087
p16b    43.271283
p16c    42.937381
p16d    43.829376
p16e    43.633805
p16f    45.242496
p16g    43.521710
dtype: float64


Unnamed: 0,p16a,p16b,p16c,p16d,p16e,p16f,p16g,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p16a,1.0,0.641926,0.526808,0.590257,0.61095,0.646553,0.459679,0.036751,0.033417,0.043766,0.041851,0.040938,0.036755
p16b,0.641926,1.0,0.646565,0.648707,0.580111,0.60293,0.470827,0.048276,0.044783,0.055489,0.050681,0.066062,0.059436
p16c,0.526808,0.646565,1.0,0.638822,0.570884,0.545822,0.486422,0.062073,0.057004,0.060081,0.057387,0.078651,0.074092
p16d,0.590257,0.648707,0.638822,1.0,0.569889,0.617926,0.523247,0.037659,0.036464,0.051579,0.050468,0.066878,0.062883
p16e,0.61095,0.580111,0.570884,0.569889,1.0,0.648371,0.397561,0.011334,0.00966,0.016565,0.01585,0.002964,-0.00135
p16f,0.646553,0.60293,0.545822,0.617926,0.648371,1.0,0.51473,0.02838,0.025089,0.037495,0.03548,0.052331,0.04406
p16g,0.459679,0.470827,0.486422,0.523247,0.397561,0.51473,1.0,0.065463,0.05858,0.058413,0.05551,0.101707,0.093251
score_MAT,0.036751,0.048276,0.062073,0.037659,0.011334,0.02838,0.065463,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.033417,0.044783,0.057004,0.036464,0.00966,0.025089,0.05858,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.043766,0.055489,0.060081,0.051579,0.016565,0.037495,0.058413,0.478365,0.456791,1.0,0.94985,0.503339,0.482328


In [55]:
sorted_correlations(corr, end="p16g")

[0.6487073360662796,
 0.6483713691432736,
 0.6465652301849645,
 0.646553080728308,
 0.6419261875803667,
 0.6388220575107758,
 0.617925646693003,
 0.6109499792182331,
 0.6029301613583078,
 0.5902574128449529,
 0.5801112115640091,
 0.5708839561630694,
 0.5698894988206875]

In [56]:
compare_cols(corr, "p16b", "p16d")
# drop p16d

Drop p16d


In [57]:
compare_cols(corr, "p16e", "p16f")
# drop p16e

Drop p16e


In [58]:
compare_cols(corr, "p16b", "p16c")
# drop p16b

Drop p16b


In [59]:
compare_cols(corr, "p16a", "p16f")
# drop p16f

Drop p16f


### Individual training topics (sum)

In [60]:
column_number = 18
column_letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]


In [61]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p18a    58.251547
p18b    72.413752
p18c    82.482083
p18d    57.375055
p18e    80.652778
p18f    71.815114
p18g    74.885818
p18h    76.598257
p18i    84.376975
dtype: float64


Unnamed: 0,p18a,p18b,p18c,p18d,p18e,p18f,p18g,p18h,p18i,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p18a,1.0,0.519397,0.46589,0.53684,0.411692,0.53068,0.337793,0.557967,0.554583,-0.003784,-0.004987,0.002555,0.001925,0.004274,0.011898
p18b,0.519397,1.0,0.674601,0.622965,0.583008,0.657881,0.509709,0.504827,0.588447,-0.031126,-0.030442,-0.02086,-0.019823,-0.026951,-0.01438
p18c,0.46589,0.674601,1.0,0.483296,0.61839,0.667496,0.443932,0.451068,0.530575,-0.025567,-0.022057,-0.018793,-0.016496,-0.039784,-0.024705
p18d,0.53684,0.622965,0.483296,1.0,0.456528,0.537042,0.445718,0.504057,0.456174,-0.017006,-0.013132,-0.009947,-0.008854,-0.01939,-0.010731
p18e,0.411692,0.583008,0.61839,0.456528,1.0,0.524238,0.379123,0.448061,0.421472,-0.014915,-0.016378,-0.001181,-0.001473,-0.039578,-0.029438
p18f,0.53068,0.657881,0.667496,0.537042,0.524238,1.0,0.463972,0.496881,0.449061,0.010577,0.014798,0.021948,0.020177,0.01494,0.024448
p18g,0.337793,0.509709,0.443932,0.445718,0.379123,0.463972,1.0,0.333206,0.407985,-0.043938,-0.040506,-0.024616,-0.023089,-0.052232,-0.043644
p18h,0.557967,0.504827,0.451068,0.504057,0.448061,0.496881,0.333206,1.0,0.583969,-0.018336,-0.01603,0.006365,0.006802,-0.031573,-0.024095
p18i,0.554583,0.588447,0.530575,0.456174,0.421472,0.449061,0.407985,0.583969,1.0,-0.047276,-0.041184,0.008042,0.012857,-0.060127,-0.050633
score_MAT,-0.003784,-0.031126,-0.025567,-0.017006,-0.014915,0.010577,-0.043938,-0.018336,-0.047276,1.0,0.949125,0.478365,0.458407,0.457797,0.435502


In [62]:
sorted_correlations(corr, "p18i")

[0.674600671904287,
 0.6674955066290563,
 0.657881156614492,
 0.6229647811105893,
 0.6183901992737807,
 0.5884472364630129,
 0.5839690067573998,
 0.58300765153288,
 0.5579667339736581,
 0.5545826192917778]

In [63]:
compare_cols(corr, "p18b", "p18c")
# drop p18c

Drop p18c


In [64]:
compare_cols(corr, "p18b", "p18f")
# drop p18b

Drop p18b


### Student involvement during class (mean)

In [65]:
column_number = 21
column_letters = ["a", "b", "c", "d", "e", "f"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]

In [66]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p21a    35.812156
p21b    36.035155
p21c    35.985070
p21d    35.864627
p21e    35.830044
p21f    35.847932
dtype: float64


Unnamed: 0,p21a,p21b,p21c,p21d,p21e,p21f,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p21a,1.0,0.322068,0.233585,0.192853,0.259087,0.096672,0.054844,0.051783,0.07081,0.066134,0.10285,0.098782
p21b,0.322068,1.0,0.289629,0.282104,0.247853,0.200089,0.056104,0.054123,0.06889,0.064955,0.113833,0.104622
p21c,0.233585,0.289629,1.0,0.554608,0.42069,0.230559,0.094278,0.091258,0.101741,0.095035,0.129954,0.121467
p21d,0.192853,0.282104,0.554608,1.0,0.474644,0.266237,0.10555,0.101689,0.115855,0.110082,0.12106,0.11338
p21e,0.259087,0.247853,0.42069,0.474644,1.0,0.184599,0.078909,0.076224,0.101613,0.095655,0.125495,0.11962
p21f,0.096672,0.200089,0.230559,0.266237,0.184599,1.0,0.011057,0.011334,0.026396,0.024932,0.038408,0.035042
score_MAT,0.054844,0.056104,0.094278,0.10555,0.078909,0.011057,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.051783,0.054123,0.091258,0.101689,0.076224,0.011334,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.07081,0.06889,0.101741,0.115855,0.101613,0.026396,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.066134,0.064955,0.095035,0.110082,0.095655,0.024932,0.458407,0.439989,0.94985,1.0,0.481747,0.46406


### Teaching methods variety (mean)

In [67]:
column_number = 22
column_letters = ["a", "b", "c", "d", "e", "f", "g"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]

In [68]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p22a    35.738221
p22b    35.709601
p22c    35.923059
p22d    35.770419
p22e    35.719141
p22f    35.686943
p22g    35.727488
dtype: float64


Unnamed: 0,p22a,p22b,p22c,p22d,p22e,p22f,p22g,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p22a,1.0,0.586668,0.581164,0.415501,0.204661,0.224407,0.145519,0.044867,0.041116,0.047434,0.043839,0.070452,0.069227
p22b,0.586668,1.0,0.585757,0.42097,0.220158,0.196354,0.141063,0.020987,0.019444,0.045239,0.042024,0.040859,0.039449
p22c,0.581164,0.585757,1.0,0.475349,0.208633,0.194156,0.134138,0.033409,0.031721,0.04643,0.04377,0.07128,0.069342
p22d,0.415501,0.42097,0.475349,1.0,0.318873,0.219965,0.169884,0.026007,0.02203,0.039816,0.03806,0.043845,0.0468
p22e,0.204661,0.220158,0.208633,0.318873,1.0,0.493399,0.351631,0.043576,0.044307,0.05343,0.053169,0.048809,0.049527
p22f,0.224407,0.196354,0.194156,0.219965,0.493399,1.0,0.452815,0.050427,0.050583,0.054732,0.050773,0.067804,0.067179
p22g,0.145519,0.141063,0.134138,0.169884,0.351631,0.452815,1.0,0.036005,0.036248,0.048252,0.043895,0.055026,0.052786
score_MAT,0.044867,0.020987,0.033409,0.026007,0.043576,0.050427,0.036005,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.041116,0.019444,0.031721,0.02203,0.044307,0.050583,0.036248,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.047434,0.045239,0.04643,0.039816,0.05343,0.054732,0.048252,0.478365,0.456791,1.0,0.94985,0.503339,0.482328


In [69]:
compare_cols(corr, "p22a", "p22b")
# drop p22b

Drop p22b


In [70]:
compare_cols(corr, "p22a", "p22c")
# drop p22c

Drop p22c


### Resource variety (mean)

In [71]:
column_number = 23
column_letters = ["a", "b", "c", "d", "e", "f", "g", "h"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]

In [72]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p23a    35.976722
p23b    35.920674
p23c    36.000572
p23d    36.227149
p23e    35.913519
p23f    36.001765
p23g    35.813349
p23h    42.758506
dtype: float64


Unnamed: 0,p23a,p23b,p23c,p23d,p23e,p23f,p23g,p23h,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p23a,1.0,0.247718,-0.003209,-0.124241,-0.117272,0.151091,-0.177083,-0.11692,0.001305,0.002453,2.4e-05,-0.000823,-0.003149,-0.002918
p23b,0.247718,1.0,0.280199,0.103603,0.129567,0.145749,0.116542,0.112494,0.026461,0.023508,0.021681,0.018079,-0.003516,-0.00545
p23c,-0.003209,0.280199,1.0,0.150899,0.161019,0.09641,0.163252,0.140005,0.012998,0.011963,0.010543,0.007654,-0.002561,-0.006249
p23d,-0.124241,0.103603,0.150899,1.0,0.455371,0.005033,0.192593,0.34892,0.03194,0.028058,0.045011,0.044229,0.084702,0.080242
p23e,-0.117272,0.129567,0.161019,0.455371,1.0,0.085058,0.302993,0.453891,0.046895,0.043229,0.043412,0.04246,0.078255,0.07717
p23f,0.151091,0.145749,0.09641,0.005033,0.085058,1.0,0.137117,0.125142,0.052982,0.050492,0.060979,0.056767,0.058766,0.056893
p23g,-0.177083,0.116542,0.163252,0.192593,0.302993,0.137117,1.0,0.163585,0.043429,0.038424,0.038359,0.03431,0.063501,0.05886
p23h,-0.11692,0.112494,0.140005,0.34892,0.453891,0.125142,0.163585,1.0,-0.004089,-0.002871,0.018767,0.016636,-0.008777,-0.008006
score_MAT,0.001305,0.026461,0.012998,0.03194,0.046895,0.052982,0.043429,-0.004089,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.002453,0.023508,0.011963,0.028058,0.043229,0.050492,0.038424,-0.002871,0.949125,1.0,0.456791,0.439989,0.435458,0.416475


### Evaluation variety (mean)

In [73]:
column_number = 24
column_letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]

In [74]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p24a    35.874167
p24b    35.850317
p24c    36.076893
p24d    36.079278
p24e    35.969567
p24f    35.878937
p24g    36.000572
p24h    35.840777
p24i    35.895632
p24j    36.454917
p24k    36.006535
dtype: float64


Unnamed: 0,p24a,p24b,p24c,p24d,p24e,p24f,p24g,p24h,p24i,p24j,p24k,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p24a,1.0,0.124984,0.086256,0.213372,0.252932,0.188322,0.201854,0.241288,0.024716,0.234144,0.222312,0.024581,0.024328,0.041478,0.038467,0.035938,0.035874
p24b,0.124984,1.0,0.098175,0.234412,0.164864,0.064111,-0.066055,0.14969,0.120014,0.013118,0.013356,0.019938,0.020887,0.023437,0.021789,-0.009007,-0.007027
p24c,0.086256,0.098175,1.0,0.10137,0.064551,0.255851,0.143964,0.053222,0.155782,0.153279,0.067793,0.011341,0.011241,0.015989,0.017844,0.0023,0.000109
p24d,0.213372,0.234412,0.10137,1.0,0.539471,0.123165,0.079487,0.325113,0.102865,0.138333,0.109832,0.034015,0.035432,0.046318,0.045425,0.034202,0.031056
p24e,0.252932,0.164864,0.064551,0.539471,1.0,0.165243,0.125387,0.333515,0.071508,0.164681,0.189058,0.045932,0.04581,0.044457,0.045271,0.029958,0.02937
p24f,0.188322,0.064111,0.255851,0.123165,0.165243,1.0,0.346331,0.114049,0.082245,0.260804,0.183463,0.019903,0.018921,0.033434,0.033198,0.061801,0.055553
p24g,0.201854,-0.066055,0.143964,0.079487,0.125387,0.346331,1.0,0.171787,0.10456,0.506276,0.188415,0.029361,0.027368,0.015726,0.017129,0.028671,0.027804
p24h,0.241288,0.14969,0.053222,0.325113,0.333515,0.114049,0.171787,1.0,0.18967,0.234276,0.109018,0.041401,0.039983,0.045152,0.041946,0.039677,0.034398
p24i,0.024716,0.120014,0.155782,0.102865,0.071508,0.082245,0.10456,0.18967,1.0,0.248763,-0.022171,0.009126,0.007187,0.018217,0.01907,0.03694,0.030145
p24j,0.234144,0.013118,0.153279,0.138333,0.164681,0.260804,0.506276,0.234276,0.248763,1.0,0.286376,0.007042,0.005207,0.014993,0.018207,0.012983,0.013938


### Work hampered (mean)

In [75]:
column_number = 27
column_letters = ["b", "c", "d", "e", "f", "g", "h"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]

In [76]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p27b    36.076893
p27c    36.266501
p27d    36.287966
p27e    36.128171
p27f    36.240266
p27g    36.171101
p27h    36.057813
dtype: float64


Unnamed: 0,p27b,p27c,p27d,p27e,p27f,p27g,p27h,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p27b,1.0,0.375993,0.485043,0.477154,0.303532,0.28553,0.205499,-0.044963,-0.042604,-0.040034,-0.036751,-0.065742,-0.058447
p27c,0.375993,1.0,0.499361,0.368679,0.279731,0.321394,0.263047,-0.07746,-0.073085,-0.06645,-0.062762,-0.079153,-0.074182
p27d,0.485043,0.499361,1.0,0.503796,0.35672,0.315571,0.232733,-0.067755,-0.063024,-0.065115,-0.062594,-0.072688,-0.067664
p27e,0.477154,0.368679,0.503796,1.0,0.392941,0.260924,0.164276,-0.054793,-0.051654,-0.062039,-0.058259,-0.048736,-0.04289
p27f,0.303532,0.279731,0.35672,0.392941,1.0,0.3452,0.238072,-0.043036,-0.039725,-0.062845,-0.058952,-0.048794,-0.045955
p27g,0.28553,0.321394,0.315571,0.260924,0.3452,1.0,0.428044,-0.04942,-0.044037,-0.067602,-0.066016,-0.069984,-0.06441
p27h,0.205499,0.263047,0.232733,0.164276,0.238072,0.428044,1.0,-0.026672,-0.025323,-0.045379,-0.046124,-0.046567,-0.042407
score_MAT,-0.044963,-0.07746,-0.067755,-0.054793,-0.043036,-0.04942,-0.026672,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,-0.042604,-0.073085,-0.063024,-0.051654,-0.039725,-0.044037,-0.025323,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,-0.040034,-0.06645,-0.065115,-0.062039,-0.062845,-0.067602,-0.045379,0.478365,0.456791,1.0,0.94985,0.503339,0.482328


### Family interest (mean)

In [77]:
column_number = 29
column_letters = ["a", "b", "c", "d", "e"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]

In [78]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p29a    35.828851
p29b    35.888477
p29c    35.958835
p29d    36.013690
p29e    35.872974
dtype: float64


Unnamed: 0,p29a,p29b,p29c,p29d,p29e,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p29a,1.0,0.165985,0.188669,0.156716,-0.005032,-0.012562,-0.010722,0.003231,0.001809,-0.002851,-0.001961
p29b,0.165985,1.0,0.529003,0.259516,0.456059,0.0741,0.070069,0.081666,0.076077,0.133642,0.128467
p29c,0.188669,0.529003,1.0,0.494885,0.375404,0.036198,0.033354,0.040696,0.038807,0.066635,0.064081
p29d,0.156716,0.259516,0.494885,1.0,0.343698,-0.001578,-0.003575,0.002138,0.003001,0.036737,0.039246
p29e,-0.005032,0.456059,0.375404,0.343698,1.0,0.079551,0.076747,0.077321,0.073214,0.123834,0.115731
score_MAT,-0.012562,0.0741,0.036198,-0.001578,0.079551,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,-0.010722,0.070069,0.033354,-0.003575,0.076747,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.003231,0.081666,0.040696,0.002138,0.077321,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.001809,0.076077,0.038807,0.003001,0.073214,0.458407,0.439989,0.94985,1.0,0.481747,0.46406
score_ING,-0.002851,0.133642,0.066635,0.036737,0.123834,0.457797,0.435458,0.503339,0.481747,1.0,0.955457


### Family support (mean)

In [79]:
current_columns = ["p30a", "p30b", "p30c"]
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p30a    35.952872
p30b    35.948102
p30c    35.985070
dtype: float64


Unnamed: 0,p30a,p30b,p30c,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p30a,1.0,0.363324,0.466402,0.081049,0.079585,0.094686,0.089426,0.105075,0.094377
p30b,0.363324,1.0,0.432777,0.087228,0.082496,0.104601,0.100452,0.124477,0.112375
p30c,0.466402,0.432777,1.0,0.093444,0.091825,0.109301,0.103039,0.119307,0.110932
score_MAT,0.081049,0.087228,0.093444,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.079585,0.082496,0.091825,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.094686,0.104601,0.109301,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.089426,0.100452,0.103039,0.458407,0.439989,0.94985,1.0,0.481747,0.46406
score_ING,0.105075,0.124477,0.119307,0.457797,0.435458,0.503339,0.481747,1.0,0.955457
level_ING,0.094377,0.112375,0.110932,0.435502,0.416475,0.482328,0.46406,0.955457,1.0


### Opinion on school (mean)

In [80]:
column_number = 32
column_letters = ["a", "b", "c", "d"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]

In [81]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p32a    36.177063
p32b    36.874680
p32c    36.240266
p32d    61.648998
dtype: float64


Unnamed: 0,p32a,p32b,p32c,p32d,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p32a,1.0,0.659026,0.711937,0.576139,0.086966,0.083534,0.077452,0.073795,0.071441,0.067717
p32b,0.659026,1.0,0.70542,0.548038,0.121106,0.115387,0.108177,0.102041,0.136816,0.128857
p32c,0.711937,0.70542,1.0,0.748772,0.119597,0.113397,0.099127,0.09501,0.116115,0.108678
p32d,0.576139,0.548038,0.748772,1.0,0.113154,0.106597,0.091171,0.084068,0.084974,0.080972
score_MAT,0.086966,0.121106,0.119597,0.113154,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.083534,0.115387,0.113397,0.106597,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.077452,0.108177,0.099127,0.091171,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.073795,0.102041,0.09501,0.084068,0.458407,0.439989,0.94985,1.0,0.481747,0.46406
score_ING,0.071441,0.136816,0.116115,0.084974,0.457797,0.435458,0.503339,0.481747,1.0,0.955457
level_ING,0.067717,0.128857,0.108678,0.080972,0.435502,0.416475,0.482328,0.46406,0.955457,1.0


In [82]:
sorted_correlations(corr, "p32d")

[0.7487720824859526,
 0.7119374836512194,
 0.7054196055204635,
 0.6590258663010227,
 0.5761386834739312]

In [83]:
compare_cols(corr, "p32c", "p32d")
# drop p32d

Drop p32d


In [84]:
compare_cols(corr, "p32a", "p32c")
# drop p32a

Drop p32a


In [85]:
compare_cols(corr, "p32b", "p32c")
# drop p32c

Drop p32c


### Work facilitated by management (mean)



In [86]:
column_number = 34
column_letters = ["a", "b", "c", "d", "e", "f", "g"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]

In [87]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p34a    36.107898
p34b    36.126978
p34c    36.160368
p34d    36.229534
p34e    36.273656
p34f    36.295121
p34g    36.270079
dtype: float64


Unnamed: 0,p34a,p34b,p34c,p34d,p34e,p34f,p34g,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p34a,1.0,0.746903,0.718101,0.747193,0.743784,0.735578,0.646058,0.041276,0.038913,0.046306,0.045233,0.045995,0.042183
p34b,0.746903,1.0,0.721109,0.706088,0.711808,0.747157,0.625452,0.016196,0.016203,0.03081,0.030411,0.020504,0.019333
p34c,0.718101,0.721109,1.0,0.76323,0.777524,0.78626,0.715825,0.033736,0.031417,0.041338,0.039174,0.041001,0.03951
p34d,0.747193,0.706088,0.76323,1.0,0.801266,0.784469,0.689255,0.045411,0.043659,0.05752,0.054585,0.0655,0.061436
p34e,0.743784,0.711808,0.777524,0.801266,1.0,0.833051,0.718903,0.046712,0.043729,0.058016,0.055417,0.062472,0.058228
p34f,0.735578,0.747157,0.78626,0.784469,0.833051,1.0,0.718989,0.039058,0.037053,0.057841,0.055666,0.057154,0.05398
p34g,0.646058,0.625452,0.715825,0.689255,0.718903,0.718989,1.0,0.029895,0.026512,0.039219,0.035942,0.040989,0.038506
score_MAT,0.041276,0.016196,0.033736,0.045411,0.046712,0.039058,0.029895,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.038913,0.016203,0.031417,0.043659,0.043729,0.037053,0.026512,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.046306,0.03081,0.041338,0.05752,0.058016,0.057841,0.039219,0.478365,0.456791,1.0,0.94985,0.503339,0.482328


In [88]:
sorted_correlations(corr, "p34g")

[0.8330509066327076,
 0.8012655248917175,
 0.7862600523599624,
 0.7844694109280528,
 0.7775236300788035,
 0.7632302691061539,
 0.7471931110938814,
 0.7471568002319641,
 0.7469028206708052,
 0.7437840498917029,
 0.735578149198494,
 0.721108668868582,
 0.718988534260177,
 0.7189034599380715,
 0.7181007822449554,
 0.715824800057299,
 0.7118083718232544,
 0.7060876342456007,
 0.6892551251453595,
 0.6460579836257908,
 0.6254518485058429]

In [89]:
compare_cols(corr, "p34e", "p34f")

Drop p34f


In [90]:
compare_cols(corr, "p34d", "p34e")

Drop p34e


In [91]:
compare_cols(corr, "p34c", "p34d")

Drop p34c


In [92]:
compare_cols(corr, "p34a", "p34d")

Drop p34a


In [93]:
compare_cols(corr, "p34d", "p34g")

Drop p34g


### Satisfaction with job and school (good/bad mix)

In [94]:
column_number = 41
column_letters = ["a", "b", "c", "g", "h", "i"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]

In [95]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p41a    67.482738
p41b    67.482738
p41c    67.483931
p41g    67.483931
p41h    67.507781
p41i    67.483931
dtype: float64


Unnamed: 0,p41a,p41b,p41c,p41g,p41h,p41i,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p41a,1.0,0.441842,-0.106335,0.263463,0.254913,0.22924,0.037244,0.032949,0.058214,0.057089,0.092047,0.084464
p41b,0.441842,1.0,-0.117842,0.229478,0.179265,0.305597,0.043905,0.039786,0.041285,0.037654,0.041379,0.034868
p41c,-0.106335,-0.117842,1.0,-0.481026,-0.03059,-0.183738,-0.079625,-0.079582,-0.052279,-0.04894,-0.082093,-0.078886
p41g,0.263463,0.229478,-0.481026,1.0,0.097406,0.299368,0.067091,0.061056,0.060995,0.053846,0.090969,0.084222
p41h,0.254913,0.179265,-0.03059,0.097406,1.0,0.075941,-0.010508,-0.006433,-0.002005,-0.001109,-0.001004,-0.005028
p41i,0.22924,0.305597,-0.183738,0.299368,0.075941,1.0,0.084533,0.075439,0.057517,0.056115,0.076392,0.0718
score_MAT,0.037244,0.043905,-0.079625,0.067091,-0.010508,0.084533,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.032949,0.039786,-0.079582,0.061056,-0.006433,0.075439,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.058214,0.041285,-0.052279,0.060995,-0.002005,0.057517,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.057089,0.037654,-0.04894,0.053846,-0.001109,0.056115,0.458407,0.439989,0.94985,1.0,0.481747,0.46406


### Positive relationships (mean)

In [96]:
column_number = 311
column_letters = ["a", "b", "c", "e", "f", "g", "h"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]

In [97]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p311a    35.844354
p311b    35.927830
p311c    36.010112
p311e    36.122208
p311f    36.016075
p311g    35.988647
p311h    36.183026
dtype: float64


Unnamed: 0,p311a,p311b,p311c,p311e,p311f,p311g,p311h,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p311a,1.0,0.323287,0.52205,0.39607,0.284156,0.391579,0.272065,0.114963,0.105676,0.104601,0.097874,0.122619,0.114038
p311b,0.323287,1.0,0.495043,0.496855,0.659547,0.542448,0.570172,0.040607,0.035212,0.056536,0.054497,0.04117,0.038981
p311c,0.52205,0.495043,1.0,0.558743,0.492359,0.60366,0.469348,0.081007,0.074299,0.084827,0.078382,0.100687,0.096193
p311e,0.39607,0.496855,0.558743,1.0,0.619058,0.710906,0.553191,0.065241,0.057951,0.064196,0.059102,0.082113,0.073876
p311f,0.284156,0.659547,0.492359,0.619058,1.0,0.729856,0.731455,0.051124,0.046904,0.04836,0.045755,0.039517,0.036125
p311g,0.391579,0.542448,0.60366,0.710906,0.729856,1.0,0.658615,0.053411,0.048627,0.055306,0.051398,0.063524,0.059229
p311h,0.272065,0.570172,0.469348,0.553191,0.731455,0.658615,1.0,0.031352,0.027215,0.025723,0.022886,0.02716,0.026338
score_MAT,0.114963,0.040607,0.081007,0.065241,0.051124,0.053411,0.031352,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.105676,0.035212,0.074299,0.057951,0.046904,0.048627,0.027215,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.104601,0.056536,0.084827,0.064196,0.04836,0.055306,0.025723,0.478365,0.456791,1.0,0.94985,0.503339,0.482328


In [98]:
sorted_correlations(corr, end="p311h")

[0.7314545719238009,
 0.7298562918172815,
 0.7109061952786386,
 0.6595474389810184,
 0.6586146919536507,
 0.6190582606192488,
 0.6036596973942983,
 0.5701717725109681,
 0.5587430060796954,
 0.5531908518556586]

In [99]:
compare_cols(corr, "p311f", "p311h")
# drop p311f, p311g

Drop p311h


In [100]:
compare_cols(corr, "p311f", "p311g")


Drop p311f


In [101]:
compare_cols(corr, "p311e", "p311g")


Drop p311g


### Work by non teachers (mean)

In [103]:
column_number = 331
column_letters = ["a", "b", "c", "d", "e", "f", "g"]
current_columns = [f"p{column_number}{letter}" for letter in column_letters]

In [104]:
corr = print_statistics(df=df, columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
p331a    40.668042
p331b    55.227351
p331c    41.990532
p331d    41.897516
p331e    40.207735
p331f    46.052208
p331g    67.574561
dtype: float64


Unnamed: 0,p331a,p331b,p331c,p331d,p331e,p331f,p331g,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
p331a,1.0,0.73242,0.753803,0.728387,0.486771,0.490293,0.218262,0.046684,0.043545,0.030683,0.028411,0.022062,0.018685
p331b,0.73242,1.0,0.673117,0.617616,0.429233,0.446545,0.315666,0.030308,0.027564,0.019178,0.020759,0.029542,0.024329
p331c,0.753803,0.673117,1.0,0.708336,0.487831,0.444843,0.186693,0.039873,0.036772,0.029278,0.02581,0.023573,0.020523
p331d,0.728387,0.617616,0.708336,1.0,0.52606,0.465672,0.193908,0.039512,0.037733,0.028727,0.025769,0.035163,0.032999
p331e,0.486771,0.429233,0.487831,0.52606,1.0,0.432716,0.219307,0.02219,0.022857,0.020993,0.018327,0.059327,0.053626
p331f,0.490293,0.446545,0.444843,0.465672,0.432716,1.0,0.547957,0.005897,0.005285,0.014863,0.012972,0.017695,0.009914
p331g,0.218262,0.315666,0.186693,0.193908,0.219307,0.547957,1.0,0.014163,0.013329,0.035028,0.031023,0.078862,0.071087
score_MAT,0.046684,0.030308,0.039873,0.039512,0.02219,0.005897,0.014163,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.043545,0.027564,0.036772,0.037733,0.022857,0.005285,0.013329,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.030683,0.019178,0.029278,0.028727,0.020993,0.014863,0.035028,0.478365,0.456791,1.0,0.94985,0.503339,0.482328


In [105]:
sorted_correlations(corr, "p331g")
# drop p331c, p331b, p331a

[0.7538034027997785,
 0.7324203783980622,
 0.7283874966568232,
 0.708335510520073,
 0.6731165450058448,
 0.6176160370343391]

In [106]:
compare_cols(corr, "p331c", "p331a")

Drop p331c


In [107]:
compare_cols(corr, "p331a", "p331b")

Drop p331b


In [108]:
compare_cols(corr, "p331d", "p331a")

Drop p331a


## Aggregate features - semanticly meaningful sub groups

Non binary features are aggregated through an average and missing values are replaced with zeros.

In [23]:
non_binary_feature_intervals = {
    "pfc_incidence": ["p16a", "p16b", "p16c", "p16d", "p16e", "p16f", "p16g"],
    "teacher_work_hampered": ["p27a", "p27b", "p27c", "p27c", "p27d", "p27e", "p27f", "p27g", "p27h"],
    "family_interest": ["p29a", "p29b", "p29c", "p29d", "p29e", "p30a", "p30b", "p30c"],
    "teacher_work_facilitated": ["p34a", "p34b", "p34c", "p34d", "p34e", "p34f", "p34g"],
    "satisfaction_with_teaching": ["p41a", "p41b", "p41h", "p41j"],
    "satisfaction_with_work_and_school": ["p41e", "p41g", "p41i"],
    "work_realtionships_quality": ["p311a", "p311b", "p311c", "p311e", "p311f", "p311g", "p311h"],
    "disadvantaged_students_num": ["p7an",  "p7bn", "p7cn", "p7dn", "p7en", "p7fn", "p7gn"],
    "class_participation": ["p21a", "p21b", "p21c", "p21d", "p21e", "p21f"],
    "variety_of_evaluation_methods": ["p24a", "p24b", "p24c", "p24d", "p24e", "p24f", "p24g", "p24h", "p24i", "p24j", "p24k"],
    "overall_opinion_on_school": ["p32a", "p32b", "p32c", "p32d", "p32e"],
    "class_behaviour_during_lessons": ["p12a", "p12b", "p12c", "p12d"],
    "variety_of_teaching_methodologies": ["p22a", "p22b", "p22c", "p22d", "p22e", "p22f", "p22g"],
    "variety_of_resourced_in_class": ["p23a", "p23b", "p23c", "p23d", "p23e", "p23f", "p23g", "p23h"]
    }

In [24]:
new_features = aggregate_features(df, non_binary_feature_intervals, "mean", "zeros")
new_features_df = pd.concat([new_features[k] for k in new_features.keys()], axis=1)
new_features_df.columns = new_features.keys()
new_features_df

Unnamed: 0_level_0,pfc_incidence,teacher_work_hampered,family_interest,teacher_work_facilitated,satisfaction_with_teaching,satisfaction_with_work_and_school,work_realtionships_quality,disadvantaged_students_num,class_participation,variety_of_evaluation_methods,overall_opinion_on_school,class_behaviour_during_lessons,variety_of_teaching_methodologies,variety_of_resourced_in_class
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.000000,0.000,0.000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000
2,0.000000,0.000,0.000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000
3,0.000000,0.000,0.000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000
4,2.714286,2.125,2.250,3.000000,0.0,0.0,3.000000,0.428571,2.666667,3.181818,2.8,2.75,3.000000,2.375
5,3.285714,1.250,2.750,3.857143,0.0,0.0,3.857143,0.285714,3.000000,3.272727,3.2,3.50,2.857143,2.875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,0.000000,1.500,2.375,3.857143,2.1,2.4,3.428571,1.000000,2.666667,2.909091,3.0,2.50,3.142857,2.250
83854,3.857143,2.375,3.625,3.285714,1.8,2.0,3.285714,0.000000,3.000000,2.545455,3.0,3.00,3.142857,2.500
83855,0.000000,0.000,0.000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000
83856,3.000000,1.625,3.375,3.000000,1.6,1.6,3.000000,0.428571,3.000000,3.272727,2.4,3.00,3.000000,2.375


In [25]:
df = pd.concat([df, new_features_df], axis=1)

In [26]:
df

Unnamed: 0_level_0,p2,p2n,p3n,p4n,p5,p6n,p7an,p7bn,p7cn,p7dn,...,satisfaction_with_teaching,satisfaction_with_work_and_school,work_realtionships_quality,disadvantaged_students_num,class_participation,variety_of_evaluation_methods,overall_opinion_on_school,class_behaviour_during_lessons,variety_of_teaching_methodologies,variety_of_resourced_in_class
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000
2,,,,,,,,,,,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000
3,,,,,,,,,,,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000
4,2.0,56.0,29.0,18.0,2.0,26.0,,,1.0,,...,0.0,0.0,3.000000,0.428571,2.666667,3.181818,2.8,2.75,3.000000,2.375
5,2.0,44.0,20.0,17.0,2.0,26.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.857143,0.285714,3.000000,3.272727,3.2,3.50,2.857143,2.875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,2.0,34.0,1.0,1.0,,,3.0,2.0,1.0,1.0,...,2.1,2.4,3.428571,1.000000,2.666667,2.909091,3.0,2.50,3.142857,2.250
83854,1.0,54.0,30.0,30.0,,,0.0,0.0,0.0,0.0,...,1.8,2.0,3.285714,0.000000,3.000000,2.545455,3.0,3.00,3.142857,2.500
83855,,,,,,,,,,,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000
83856,2.0,56.0,22.0,4.0,,,1.0,1.0,1.0,0.0,...,1.6,1.6,3.000000,0.428571,3.000000,3.272727,2.4,3.00,3.000000,2.375


In [27]:
features_to_drop = features_to_drop_after_aggregation(df, non_binary_feature_intervals)
df = df.drop(df.columns[features_to_drop], axis=1, inplace=False)

In [28]:
len(features_to_drop)

93

In [29]:
df

Unnamed: 0_level_0,p2,p2n,p3n,p4n,p5,p6n,p8an,p8bn,p9a,p9b,...,satisfaction_with_teaching,satisfaction_with_work_and_school,work_realtionships_quality,disadvantaged_students_num,class_participation,variety_of_evaluation_methods,overall_opinion_on_school,class_behaviour_during_lessons,variety_of_teaching_methodologies,variety_of_resourced_in_class
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000
2,,,,,,,,,,,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000
3,,,,,,,,,,,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000
4,2.0,56.0,29.0,18.0,2.0,26.0,0.0,26.0,1.0,1.0,...,0.0,0.0,3.000000,0.428571,2.666667,3.181818,2.8,2.75,3.000000,2.375
5,2.0,44.0,20.0,17.0,2.0,26.0,0.0,0.0,1.0,1.0,...,0.0,0.0,3.857143,0.285714,3.000000,3.272727,3.2,3.50,2.857143,2.875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,2.0,34.0,1.0,1.0,,,0.0,0.0,1.0,1.0,...,2.1,2.4,3.428571,1.000000,2.666667,2.909091,3.0,2.50,3.142857,2.250
83854,1.0,54.0,30.0,30.0,,,0.0,0.0,1.0,,...,1.8,2.0,3.285714,0.000000,3.000000,2.545455,3.0,3.00,3.142857,2.500
83855,,,,,,,,,,,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000
83856,2.0,56.0,22.0,4.0,,,0.0,0.0,1.0,,...,1.6,1.6,3.000000,0.428571,3.000000,3.272727,2.4,3.00,3.000000,2.375


## Binary features
Missing values are replaced with 0s but features are aggregated through a sum and not an average.

In [30]:
binary_feature_intervals = {
    "subjects_taught": ('p9a', 'p9e'),
    "pfc_main_topics": ('p15a', 'p15f'),
    "individual_training_topics": ('p18a', 'p18i'),
}

In [31]:
new_features = aggregate_features(df, binary_feature_intervals, "sum", "zeros")
new_features_df = pd.concat([new_features[k] for k in new_features.keys()], axis=1)
new_features_df.columns = new_features.keys()
new_features_df

Unnamed: 0_level_0,subjects_taught,individual_training_topics
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,2.0,0.0
5,3.0,1.0
...,...,...
83853,4.0,1.0
83854,3.0,1.0
83855,0.0,0.0
83856,2.0,3.0


In [32]:
df = pd.concat([df, new_features_df], axis=1)
df

Unnamed: 0_level_0,p2,p2n,p3n,p4n,p5,p6n,p8an,p8bn,p9a,p9b,...,work_realtionships_quality,disadvantaged_students_num,class_participation,variety_of_evaluation_methods,overall_opinion_on_school,class_behaviour_during_lessons,variety_of_teaching_methodologies,variety_of_resourced_in_class,subjects_taught,individual_training_topics
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000,0.0,0.0
2,,,,,,,,,,,...,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000,0.0,0.0
3,,,,,,,,,,,...,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000,0.0,0.0
4,2.0,56.0,29.0,18.0,2.0,26.0,0.0,26.0,1.0,1.0,...,3.000000,0.428571,2.666667,3.181818,2.8,2.75,3.000000,2.375,2.0,0.0
5,2.0,44.0,20.0,17.0,2.0,26.0,0.0,0.0,1.0,1.0,...,3.857143,0.285714,3.000000,3.272727,3.2,3.50,2.857143,2.875,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,2.0,34.0,1.0,1.0,,,0.0,0.0,1.0,1.0,...,3.428571,1.000000,2.666667,2.909091,3.0,2.50,3.142857,2.250,4.0,1.0
83854,1.0,54.0,30.0,30.0,,,0.0,0.0,1.0,,...,3.285714,0.000000,3.000000,2.545455,3.0,3.00,3.142857,2.500,3.0,1.0
83855,,,,,,,,,,,...,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000,0.0,0.0
83856,2.0,56.0,22.0,4.0,,,0.0,0.0,1.0,,...,3.000000,0.428571,3.000000,3.272727,2.4,3.00,3.000000,2.375,2.0,3.0


In [33]:
features_to_drop = features_to_drop_after_aggregation(df, binary_feature_intervals)
df = df.drop(df.columns[features_to_drop], axis=1, inplace=False)

In [34]:
len(features_to_drop)

4

In [35]:
df

Unnamed: 0_level_0,p2,p2n,p3n,p4n,p5,p6n,p8an,p8bn,p9b,p9c,...,work_realtionships_quality,disadvantaged_students_num,class_participation,variety_of_evaluation_methods,overall_opinion_on_school,class_behaviour_during_lessons,variety_of_teaching_methodologies,variety_of_resourced_in_class,subjects_taught,individual_training_topics
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000,0.0,0.0
2,,,,,,,,,,,...,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000,0.0,0.0
3,,,,,,,,,,,...,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000,0.0,0.0
4,2.0,56.0,29.0,18.0,2.0,26.0,0.0,26.0,1.0,,...,3.000000,0.428571,2.666667,3.181818,2.8,2.75,3.000000,2.375,2.0,0.0
5,2.0,44.0,20.0,17.0,2.0,26.0,0.0,0.0,1.0,1.0,...,3.857143,0.285714,3.000000,3.272727,3.2,3.50,2.857143,2.875,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,2.0,34.0,1.0,1.0,,,0.0,0.0,1.0,1.0,...,3.428571,1.000000,2.666667,2.909091,3.0,2.50,3.142857,2.250,4.0,1.0
83854,1.0,54.0,30.0,30.0,,,0.0,0.0,,1.0,...,3.285714,0.000000,3.000000,2.545455,3.0,3.00,3.142857,2.500,3.0,1.0
83855,,,,,,,,,,,...,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.000,0.0,0.0
83856,2.0,56.0,22.0,4.0,,,0.0,0.0,,,...,3.000000,0.428571,3.000000,3.272727,2.4,3.00,3.000000,2.375,2.0,3.0



# Postprocessing

In [7]:
from src.pre_processing.macros import agg_mean
from src.pre_processing.utils import custom_mean

In [8]:
df.loc[:, ["p331g","p331j"]].describe()

Unnamed: 0,p331g,p331j
count,27191.0,21342.0
mean,3.712478,3.682645
std,1.221123,0.646566
min,1.0,1.0
25%,3.0,3.0
50%,4.0,4.0
75%,5.0,4.0
max,5.0,5.0


In [9]:
def normalize_bad_column(m, r_min, r_max, t_min, t_max):
    if m not in range(1, 5):
        num = m - r_min
        den = r_max - r_min
        res = num/den * (t_max - t_min) + t_min
        return res
    else:
        return m

In [10]:
df["p331g"].isna().sum()


56666

In [11]:
df["p331g"].value_counts()

p331g
5.0    9029
4.0    7551
3.0    6597
1.0    2222
2.0    1792
Name: count, dtype: int64

In [12]:
df["p331j"].isna().sum()

62515

In [13]:
df["p331j"].value_counts()

p331j
4.0    13477
3.0     5892
5.0     1051
2.0      834
1.0       88
Name: count, dtype: int64

In [14]:
df["p331g"] = df["p331g"].apply(lambda x: normalize_bad_column(x, r_min=1, r_max=5, t_min=1, t_max=4))

In [15]:
df["p331g"].isna().sum()

56666

In [16]:
df["p331g"].value_counts()

p331g
4.0    16580
3.0     6597
1.0     2222
2.0     1792
Name: count, dtype: int64

In [8]:
new_features = aggregate_features(df, agg_mean, aggregation_func=custom_mean)

{'extent_of_evaluation_variety': [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90], 'extent_of_pfc_incidence': [39, 40, 41, 42, 43, 44, 45, 46], 'extent_of_work_hampered': [97, 98, 99, 99, 100, 101, 102, 103, 104], 'extent_of_family_interest': [106, 107, 108, 109, 110, 140], 'agreement_of_family_support': [111, 112, 113], 'agreement_of_work_facilitated_by_management': [120, 121, 122, 123, 124, 125, 126], 'extent_of_positive_relationships': [114, 141, 142, 143, 144, 145, 146, 147], 'number_of_special_attention_students': [6, 7, 8, 9, 10, 12], 'extent_of_student_involvement_during_class': [58, 59, 60, 61, 62, 63], 'extent_of_teaching_methods_variety': [64, 65, 66, 67, 68, 69, 70], 'agreement_of_opinion_on_school': [115, 116, 117, 118, 119], 'agreement_of_class_behaviour': [23, 24, 25, 26], 'extent_of_resource_variety': [71, 72, 73, 74, 75, 76, 77, 78, 79], 'extent_of_good_work_by_non_teachers': [148, 149, 150, 151, 152, 153, 154, 155]}


In [9]:
new_features_df = pd.concat([new_features[k] for k in new_features.keys()], axis=1)
new_features_df.columns = new_features.keys()

In [10]:
new_features_df.describe()

Unnamed: 0,extent_of_evaluation_variety,extent_of_pfc_incidence,extent_of_work_hampered,extent_of_family_interest,agreement_of_family_support,agreement_of_work_facilitated_by_management,extent_of_positive_relationships,number_of_special_attention_students,extent_of_student_involvement_during_class,extent_of_teaching_methods_variety,agreement_of_opinion_on_school,agreement_of_class_behaviour,extent_of_resource_variety,extent_of_good_work_by_non_teachers
count,53921.0,48575.0,53794.0,53851.0,53764.0,53669.0,53799.0,48578.0,53867.0,53979.0,53565.0,53943.0,53957.0,53575.0
mean,3.036033,2.930589,2.113055,2.757243,3.142778,3.359484,3.333437,0.760426,2.927991,3.147311,3.419122,3.029554,2.698459,3.526103
std,0.377532,0.631604,0.638573,0.549377,0.542375,0.581691,0.42934,0.817294,0.431063,0.430917,0.545669,0.494717,0.38535,0.599685
min,1.636364,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.857143,1.0,1.0,1.25,1.0
25%,2.727273,2.571429,1.666667,2.4,2.666667,3.0,3.0,0.166667,2.666667,2.857143,3.0,2.75,2.375,3.0
50%,3.0,3.0,2.111111,2.666667,3.0,3.428571,3.25,0.5,2.833333,3.142857,3.5,3.0,2.625,3.666667
75%,3.272727,3.285714,2.555556,3.0,3.666667,4.0,3.75,1.0,3.166667,3.428571,4.0,3.25,3.0,4.0
max,4.0,4.0,4.0,4.0,4.0,4.0,4.0,22.0,4.0,4.0,4.0,4.0,4.0,5.0
