# Imports

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import tree

from src.pre_processing.macros import column_groups, DATA_PATH, DATA_SPLIT_PATH, ORIGINAL_DATASET_NAME

# Loading

In [2]:
df = pd.read_csv(
    os.path.join(DATA_SPLIT_PATH, "family_questionnaire.csv"), low_memory=False
)
df = df.set_index("id_student")
df

Unnamed: 0_level_0,f0,f1n,f2an,f2bn,f3a,f3b,mother_education,father_education,f4a,f4b,...,f33c,f33d,f33e,f33f,f33g,f33h,f34,household_income_q,nhousehold,ESCS
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,3.0,43.0,47.0,7.0,3.0,4.0,2.0,1.0,3.0,...,,,,,,,,,,0.235340
2,,,,,,,,,,,...,,,,,,,,,,
3,1.0,4.0,45.0,41.0,3.0,5.0,2.0,4.0,3.0,1.0,...,,,,,,,,,,0.261451
4,1.0,4.0,39.0,41.0,5.0,5.0,4.0,4.0,1.0,1.0,...,,,,,,,,,,0.787122
5,2.0,4.0,39.0,41.0,9.0,7.0,4.0,4.0,1.0,1.0,...,,,,,,,,,,3.151773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,,,,,,,,,,,...,,,,,,,,,,
83854,1.0,3.0,45.0,44.0,7.0,7.0,4.0,4.0,1.0,1.0,...,,,,,,,9.0,4.0,,0.761954
83855,2.0,4.0,40.0,42.0,5.0,5.0,4.0,4.0,1.0,1.0,...,,,,,,,5.0,2.0,,0.633218
83856,1.0,5.0,40.0,,2.0,,1.0,,1.0,,...,,,,,,1.0,10.0,,,-1.506239


In [3]:
# Load identifiers and change float columns to int
ids = pd.read_csv(os.path.join(DATA_SPLIT_PATH, "identifiers.csv"), low_memory=False)
ids = ids.set_index("id_student")
int_identifiers = [col for col in ids.columns if col not in ["id_class_group"]]
ids[int_identifiers] = ids[int_identifiers].astype("Int64")
ids

Unnamed: 0_level_0,id_student_original,id_year,id_grade,id_class_group,id_school,id_student_16_19,id_school_16_19
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,33613,2016,3,A,2415,14374,569
2,19294,2016,3,A,1842,8001,273
3,19587,2016,3,,1432,8142,82
4,29985,2016,3,A,2280,12800,505
5,6982,2016,3,A,2040,2606,390
...,...,...,...,...,...,...,...
83853,2500,2019,6,B,1278,,271
83854,5784,2019,6,A,1079,,78
83855,7708,2019,6,X,1486,3036,459
83856,18965,2019,6,B,1036,9913,36


In [4]:
# Load identifiers and change float columns to int
scores = pd.read_csv(
    os.path.join(DATA_SPLIT_PATH, "student_scores.csv"), low_memory=False
)
scores = scores.set_index("id_student")
# int_identifiers = [col for col in ids.columns if col not in ["id_class_group"]]
# ids[int_identifiers] = ids[int_identifiers].astype('Int64')
scores

Unnamed: 0_level_0,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,564.8700,3.0,535.1500,3.0,,
2,388.3400,1.0,293.7000,1.0,,
3,386.5900,1.0,514.8100,3.0,,
4,487.7600,2.0,449.2500,2.0,,
5,709.7900,4.0,598.7200,3.0,,
...,...,...,...,...,...,...
83853,400.8625,2.0,446.6522,2.0,294.7474,1.0
83854,597.0243,3.0,632.6043,4.0,633.2966,4.0
83855,707.9254,4.0,400.2761,2.0,477.5056,2.0
83856,522.8511,3.0,656.1601,4.0,540.1122,3.0


In [5]:
ids = pd.merge(ids, scores, left_index=True, right_index=True)
df = pd.merge(ids, df, left_index=True, right_index=True)
df

Unnamed: 0_level_0,id_student_original,id_year,id_grade,id_class_group,id_school,id_student_16_19,id_school_16_19,score_MAT,level_MAT,score_LEN,...,f33c,f33d,f33e,f33f,f33g,f33h,f34,household_income_q,nhousehold,ESCS
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,33613,2016,3,A,2415,14374,569,564.8700,3.0,535.1500,...,,,,,,,,,,0.235340
2,19294,2016,3,A,1842,8001,273,388.3400,1.0,293.7000,...,,,,,,,,,,
3,19587,2016,3,,1432,8142,82,386.5900,1.0,514.8100,...,,,,,,,,,,0.261451
4,29985,2016,3,A,2280,12800,505,487.7600,2.0,449.2500,...,,,,,,,,,,0.787122
5,6982,2016,3,A,2040,2606,390,709.7900,4.0,598.7200,...,,,,,,,,,,3.151773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,2500,2019,6,B,1278,,271,400.8625,2.0,446.6522,...,,,,,,,,,,
83854,5784,2019,6,A,1079,,78,597.0243,3.0,632.6043,...,,,,,,,9.0,4.0,,0.761954
83855,7708,2019,6,X,1486,3036,459,707.9254,4.0,400.2761,...,,,,,,,5.0,2.0,,0.633218
83856,18965,2019,6,B,1036,9913,36,522.8511,3.0,656.1601,...,,,,,,1.0,10.0,,,-1.506239


# Analysis

## Utils

In [6]:
def print_statistics(columns):
    print("MISSING VALUES")
    # print(df[columns].isna().sum())
    # print()
    # print("Percentage")
    print(df[columns].isna().sum() / df.shape[0] * 100)
    # print()
    # print("DISTINCT VALUES")
    # print(df[columns].value_counts())
    # print()

    corr = df[
        columns
        + [
            "score_MAT",
            "level_MAT",
            "score_LEN",
            "level_LEN",
            "score_ING",
            "level_ING",
        ]
    ].corr()
    
    return corr

## f0

In [8]:
col = "f0"
print("OLD")
print(df[col].value_counts())
print(f"""Missing: {df[col].isna().sum()}""")
print()
print("NEW")
new_df = df[col].apply(
    lambda x: (
        "MOTHER"
        if x == 1
        else ("FATHER" if x == 2 else ("OTHER" if x == 3 else np.nan))
    )
)
print(new_df.value_counts())
print(f"Missing: {new_df.isna().sum()}")
new_df

OLD
f0
1.0    38950
2.0     9731
3.0      429
Name: count, dtype: int64
Missing: 34747

NEW
f0
MOTHER    38950
FATHER     9731
OTHER       429
Name: count, dtype: int64
Missing: 34747


id_student
1        MOTHER
2           NaN
3        MOTHER
4        MOTHER
5        FATHER
          ...  
83853       NaN
83854    MOTHER
83855    FATHER
83856    MOTHER
83857    MOTHER
Name: f0, Length: 83857, dtype: object

## f5a, f5b

In [9]:
for col in ["f5a", "f5b"]:
    print("OLD")
    print(df[col].value_counts())
    print(f"""Missing: {df[col].isna().sum()}""")
    print()
    print("NEW")
    new_df = df[col].apply(
        lambda x: (
                "CANARY_ISLANDS"
                if x == 1
                else (
                    "SPAIN_NO_CANARY_ISLANDS"
                    if x == 2
                    else (
                        "ANOTHER_EU"
                        if x == 3
                        else ("ANOTHER_NON_EU" if x == 4 else np.nan)
                    )
                )
            )
    )
    print(new_df.value_counts())
    print(f"Missing: {new_df.isna().sum()}")
    new_df

OLD
f5a
1.0    39836
2.0     6966
4.0     6827
3.0     2324
Name: count, dtype: int64
Missing: 27904

NEW
f5a
CANARY_ISLANDS             39836
SPAIN_NO_CANARY_ISLANDS     6966
ANOTHER_NON_EU              6827
ANOTHER_EU                  2324
Name: count, dtype: int64
Missing: 27904
OLD
f5b
1.0    37672
2.0     7462
4.0     5922
3.0     2062
Name: count, dtype: int64
Missing: 30739

NEW
f5b
CANARY_ISLANDS             37672
SPAIN_NO_CANARY_ISLANDS     7462
ANOTHER_NON_EU              5922
ANOTHER_EU                  2062
Name: count, dtype: int64
Missing: 30739


## extent_of_years_in_spanish_education_system

In [7]:
current_columns = ["f6"]
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
f6    81.535233
dtype: float64


Unnamed: 0,f6,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
f6,1.0,0.044848,0.0424,0.094156,0.080431,0.03405,0.02266
score_MAT,0.044848,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.0424,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.094156,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.080431,0.458407,0.439989,0.94985,1.0,0.481747,0.46406
score_ING,0.03405,0.457797,0.435458,0.503339,0.481747,1.0,0.955457
level_ING,0.02266,0.435502,0.416475,0.482328,0.46406,0.955457,1.0


## f7

In [9]:
print(df["f7"].apply(
        lambda x: "SPANISH" if x == 1 else ("OTHER" if x == 2 else np.nan)
    ).value_counts())
print(df["f7"].value_counts())

f7
SPANISH    55626
OTHER       2971
Name: count, dtype: int64
f7
1.0    55626
2.0     2971
Name: count, dtype: int64


## f9a, ..., f9f

In [45]:
column_number = 9
column_letters = ["a", "b", "c", "d", "e", "f"]
current_columns = [f"f{column_number}{letter}" for letter in column_letters]
print(current_columns)
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

['f9a', 'f9b', 'f9c', 'f9d', 'f9e', 'f9f']
MISSING VALUES
f9a    66.198409
f9b    30.772625
f9c    32.129697
f9d    30.776202
f9e    31.910276
f9f    38.769572
dtype: float64


Unnamed: 0,f9a,f9b,f9c,f9d,f9e,f9f,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
f9a,1.0,0.224584,0.17228,0.078229,0.057522,-0.009109,0.060636,0.057634,0.088513,0.082646,0.086127,0.086052
f9b,0.224584,1.0,0.417885,0.081244,-0.040169,-0.169611,0.119932,0.116039,0.168586,0.163195,0.188831,0.17824
f9c,0.17228,0.417885,1.0,0.176215,0.087307,-0.076012,0.112707,0.107411,0.131916,0.128545,0.137773,0.131297
f9d,0.078229,0.081244,0.176215,1.0,0.616367,0.192052,0.118317,0.114317,0.122758,0.117164,0.16189,0.157553
f9e,0.057522,-0.040169,0.087307,0.616367,1.0,0.442496,0.078951,0.07683,0.084583,0.079961,0.104691,0.104104
f9f,-0.009109,-0.169611,-0.076012,0.192052,0.442496,1.0,-0.04124,-0.038901,-0.027828,-0.027328,-0.042515,-0.041518
score_MAT,0.060636,0.119932,0.112707,0.118317,0.078951,-0.04124,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.057634,0.116039,0.107411,0.114317,0.07683,-0.038901,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.088513,0.168586,0.131916,0.122758,0.084583,-0.027828,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.082646,0.163195,0.128545,0.117164,0.079961,-0.027328,0.458407,0.439989,0.94985,1.0,0.481747,0.46406


## f12a, f12b

In [46]:
column_number = 12
column_letters = ["a", "b"]
current_columns = [f"f{column_number}{letter}" for letter in column_letters]
print(current_columns)
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

['f12a', 'f12b']
MISSING VALUES
f12a    40.699047
f12b    41.455096
dtype: float64


Unnamed: 0,f12a,f12b,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
f12a,1.0,0.301366,0.106867,0.104147,0.12749,0.124033,0.156342,0.150911
f12b,0.301366,1.0,0.021588,0.022147,0.036274,0.034992,0.036556,0.031312
score_MAT,0.106867,0.021588,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.104147,0.022147,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.12749,0.036274,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.124033,0.034992,0.458407,0.439989,0.94985,1.0,0.481747,0.46406
score_ING,0.156342,0.036556,0.457797,0.435458,0.503339,0.481747,1.0,0.955457
level_ING,0.150911,0.031312,0.435502,0.416475,0.482328,0.46406,0.955457,1.0


## f14a, f14b

In [20]:
cols = ["f14a", "f14b"]
for col in cols:
    df[col] = df[col].apply(
                lambda x: (
                    "NEVER"
                    if x == 1
                    else (
                        "SOMETIMES"
                        if x == 2
                        else (
                            "ONCE_PER_MONTH"
                            if x == 3
                            else ("ONCE_PER_WEEK" if x == 4 else "DONT_KNOW")
                        )
                    )
                )
            )
df[cols].value_counts()

f14a            f14b          
DONT_KNOW       DONT_KNOW         34321
ONCE_PER_WEEK   ONCE_PER_WEEK     12616
ONCE_PER_MONTH  ONCE_PER_MONTH     6174
SOMETIMES       SOMETIMES          5214
ONCE_PER_MONTH  NEVER              3568
ONCE_PER_WEEK   NEVER              3200
SOMETIMES       NEVER              3179
ONCE_PER_WEEK   DONT_KNOW          2819
ONCE_PER_MONTH  SOMETIMES          2591
ONCE_PER_WEEK   ONCE_PER_MONTH     2330
ONCE_PER_MONTH  DONT_KNOW          1997
ONCE_PER_WEEK   SOMETIMES          1819
SOMETIMES       DONT_KNOW          1170
NEVER           NEVER               591
SOMETIMES       ONCE_PER_MONTH      390
ONCE_PER_MONTH  ONCE_PER_WEEK       307
DONT_KNOW       ONCE_PER_WEEK       254
NEVER           SOMETIMES           232
DONT_KNOW       ONCE_PER_MONTH      205
SOMETIMES       ONCE_PER_WEEK       200
DONT_KNOW       SOMETIMES           172
NEVER           ONCE_PER_WEEK       156
                ONCE_PER_MONTH      148
                DONT_KNOW           134
DONT_KNOW

In [10]:
current_columns = ["f14a"]
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
f14a    41.612507
dtype: float64


Unnamed: 0,f14a,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
f14a,1.0,-0.054215,-0.051304,-0.030321,-0.025304,-0.06371,-0.058539
score_MAT,-0.054215,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,-0.051304,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,-0.030321,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,-0.025304,0.458407,0.439989,0.94985,1.0,0.481747,0.46406
score_ING,-0.06371,0.457797,0.435458,0.503339,0.481747,1.0,0.955457
level_ING,-0.058539,0.435502,0.416475,0.482328,0.46406,0.955457,1.0


In [213]:
print("DISTINCT VALUES")
df[current_columns].value_counts()

DISTINCT VALUES


a4   repeater  a41  a42
1.0  1.0       1.0  1.0    2133
2.0  2.0       1.0  2.0     417
               2.0  1.0     153
                    2.0     127
               1.0  3.0      88
               2.0  3.0      13
Name: count, dtype: int64

## f15a, ..., f15f

In [None]:
column_number = 15
column_letters = ["a", "b", "c", "d", "e", "f"]
current_columns = [f"f{column_number}{letter}" for letter in column_letters]
print(current_columns)
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

['f15a', 'f15b', 'f15c', 'f15d', 'f15e', 'f15f']
MISSING VALUES
f15a    41.217787
f15b    41.204670
f15c    41.393086
f15d    41.623240
f15e    41.576732
f15f    78.878329
dtype: float64


Unnamed: 0,f15a,f15b,f15c,f15d,f15e,f15f,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
f15a,1.0,0.392326,0.365168,0.337197,0.315463,0.321591,-0.101592,-0.097641,-0.071323,-0.07007,-0.106314,-0.101949
f15b,0.392326,1.0,0.639867,0.524934,0.547083,0.553162,-0.076801,-0.073485,-0.053412,-0.050089,-0.081558,-0.078358
f15c,0.365168,0.639867,1.0,0.664599,0.695255,0.469405,-0.033613,-0.035216,-0.020543,-0.017165,-0.042458,-0.039575
f15d,0.337197,0.524934,0.664599,1.0,0.797387,0.490397,-0.033517,-0.035076,-0.019769,-0.018049,-0.03236,-0.030545
f15e,0.315463,0.547083,0.695255,0.797387,1.0,0.510494,-0.016185,-0.016793,-0.00389,-0.003104,-0.013908,-0.01273
f15f,0.321591,0.553162,0.469405,0.490397,0.510494,1.0,-0.087052,-0.085447,-0.050181,-0.047208,-0.079188,-0.078124
score_MAT,-0.101592,-0.076801,-0.033613,-0.033517,-0.016185,-0.087052,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,-0.097641,-0.073485,-0.035216,-0.035076,-0.016793,-0.085447,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,-0.071323,-0.053412,-0.020543,-0.019769,-0.00389,-0.050181,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,-0.07007,-0.050089,-0.017165,-0.018049,-0.003104,-0.047208,0.458407,0.439989,0.94985,1.0,0.481747,0.46406


## f16a, ..., f16f

In [49]:
column_number = 16
column_letters = ["a", "b", "c", "d", "e", "f"]
current_columns = [f"f{column_number}{letter}" for letter in column_letters]
print(current_columns)
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

['f16a', 'f16b', 'f16c', 'f16d', 'f16e', 'f16f']
MISSING VALUES
f16a    40.800410
f16b    41.168895
f16c    41.054414
f16d    66.395173
f16e    40.888656
f16f    66.405905
dtype: float64


Unnamed: 0,f16a,f16b,f16c,f16d,f16e,f16f,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
f16a,1.0,0.344646,0.226296,0.18947,0.287656,0.186055,0.013391,0.010503,0.057739,0.050187,0.030335,0.02984
f16b,0.344646,1.0,0.392867,0.237596,0.407355,0.184047,-0.037136,-0.036686,-0.01097,-0.012085,-0.042916,-0.04144
f16c,0.226296,0.392867,1.0,0.241995,0.415162,0.154567,-0.136282,-0.130834,-0.098874,-0.09636,-0.158357,-0.153342
f16d,0.18947,0.237596,0.241995,1.0,0.219186,0.235952,-0.032137,-0.031606,-0.012195,-0.013569,-0.051415,-0.051753
f16e,0.287656,0.407355,0.415162,0.219186,1.0,0.277278,-0.054289,-0.051287,-0.015531,-0.01525,-0.064421,-0.062798
f16f,0.186055,0.184047,0.154567,0.235952,0.277278,1.0,-0.016373,-0.015644,-0.010444,-0.011689,-0.042308,-0.043132
score_MAT,0.013391,-0.037136,-0.136282,-0.032137,-0.054289,-0.016373,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.010503,-0.036686,-0.130834,-0.031606,-0.051287,-0.015644,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.057739,-0.01097,-0.098874,-0.012195,-0.015531,-0.010444,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.050187,-0.012085,-0.09636,-0.013569,-0.01525,-0.011689,0.458407,0.439989,0.94985,1.0,0.481747,0.46406


## f17a, ..., f17d

In [50]:
column_number = 17
column_letters = ["a", "b", "c", "d"]
current_columns = [f"f{column_number}{letter}" for letter in column_letters]
print(current_columns)
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

['f17a', 'f17b', 'f17c', 'f17d']
MISSING VALUES
f17a    41.134312
f17b    41.363273
f17c    41.481331
f17d    41.550497
dtype: float64


Unnamed: 0,f17a,f17b,f17c,f17d,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
f17a,1.0,0.482643,0.375685,0.391527,-0.006048,-0.005978,0.017794,0.018427,0.006417,0.007499
f17b,0.482643,1.0,0.474586,0.44508,-0.019745,-0.019682,0.004555,0.002993,-0.007129,-0.004066
f17c,0.375685,0.474586,1.0,0.500102,-0.006837,-0.004967,0.017159,0.015014,-0.009723,-0.007645
f17d,0.391527,0.44508,0.500102,1.0,-0.01874,-0.017976,0.006054,0.004178,-0.006492,-0.006024
score_MAT,-0.006048,-0.019745,-0.006837,-0.01874,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,-0.005978,-0.019682,-0.004967,-0.017976,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.017794,0.004555,0.017159,0.006054,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.018427,0.002993,0.015014,0.004178,0.458407,0.439989,0.94985,1.0,0.481747,0.46406
score_ING,0.006417,-0.007129,-0.009723,-0.006492,0.457797,0.435458,0.503339,0.481747,1.0,0.955457
level_ING,0.007499,-0.004066,-0.007645,-0.006024,0.435502,0.416475,0.482328,0.46406,0.955457,1.0


## f18a, ..., f18i

In [51]:
column_number = 18
column_letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
current_columns = [f"f{column_number}{letter}" for letter in column_letters]
print(current_columns)
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

['f18a', 'f18b', 'f18c', 'f18d', 'f18e', 'f18f', 'f18g', 'f18h', 'f18i']
MISSING VALUES
f18a    40.697855
f18b    40.906543
f18c    41.091382
f18d    40.928008
f18e    40.888656
f18f    41.301263
f18g    40.981671
f18h    41.047259
f18i    40.886271
dtype: float64


Unnamed: 0,f18a,f18b,f18c,f18d,f18e,f18f,f18g,f18h,f18i,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
f18a,1.0,0.581947,0.70235,0.667738,0.378557,0.556777,0.641585,0.553501,0.645933,0.038063,0.035093,0.049634,0.044284,0.041621,0.037455
f18b,0.581947,1.0,0.50971,0.40166,0.305791,0.524758,0.497305,0.518909,0.709179,-0.010314,-0.011757,-0.006543,-0.006765,-0.010643,-0.013349
f18c,0.70235,0.50971,1.0,0.63445,0.42729,0.592914,0.694336,0.556613,0.602155,0.038187,0.035525,0.056943,0.050531,0.036553,0.035708
f18d,0.667738,0.40166,0.63445,1.0,0.343728,0.473,0.536639,0.450819,0.493105,0.036212,0.033734,0.04879,0.04363,0.042427,0.039354
f18e,0.378557,0.305791,0.42729,0.343728,1.0,0.467346,0.429159,0.608373,0.43875,0.047589,0.044115,0.05296,0.049083,0.048597,0.047152
f18f,0.556777,0.524758,0.592914,0.473,0.467346,1.0,0.634501,0.583425,0.620752,0.010237,0.009109,0.018416,0.01541,0.014689,0.012168
f18g,0.641585,0.497305,0.694336,0.536639,0.429159,0.634501,1.0,0.616503,0.64635,0.099091,0.094815,0.113323,0.104137,0.107404,0.101329
f18h,0.553501,0.518909,0.556613,0.450819,0.608373,0.583425,0.616503,1.0,0.714082,0.048642,0.046838,0.058318,0.054663,0.058253,0.054897
f18i,0.645933,0.709179,0.602155,0.493105,0.43875,0.620752,0.64635,0.714082,1.0,0.027558,0.025327,0.032085,0.028716,0.02716,0.02279
score_MAT,0.038063,-0.010314,0.038187,0.036212,0.047589,0.010237,0.099091,0.048642,0.027558,1.0,0.949125,0.478365,0.458407,0.457797,0.435502


## f19a, ... f19e

In [52]:
column_number = 19
column_letters = ["a", "b", "c", "d", "e"]
current_columns = [f"f{column_number}{letter}" for letter in column_letters]
print(current_columns)
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

['f19a', 'f19b', 'f19c', 'f19d', 'f19e']
MISSING VALUES
f19a    40.891041
f19b    41.069917
f19c    41.111654
f19d    41.083034
f19e    41.129542
dtype: float64


Unnamed: 0,f19a,f19b,f19c,f19d,f19e,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
f19a,1.0,0.951855,0.956005,0.946593,0.943447,0.008468,0.006586,-0.006183,-0.009424,0.008854,0.011589
f19b,0.951855,1.0,0.951455,0.95928,0.948562,0.009225,0.007365,-0.005756,-0.009004,0.009016,0.011789
f19c,0.956005,0.951455,1.0,0.955925,0.941321,0.010021,0.007912,-0.004744,-0.008098,0.010885,0.014223
f19d,0.946593,0.95928,0.955925,1.0,0.951235,0.009945,0.007373,-0.006606,-0.010306,0.00858,0.011825
f19e,0.943447,0.948562,0.941321,0.951235,1.0,0.009909,0.007265,-0.005247,-0.007375,0.008821,0.012101
score_MAT,0.008468,0.009225,0.010021,0.009945,0.009909,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.006586,0.007365,0.007912,0.007373,0.007265,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,-0.006183,-0.005756,-0.004744,-0.006606,-0.005247,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,-0.009424,-0.009004,-0.008098,-0.010306,-0.007375,0.458407,0.439989,0.94985,1.0,0.481747,0.46406
score_ING,0.008854,0.009016,0.010885,0.00858,0.008821,0.457797,0.435458,0.503339,0.481747,1.0,0.955457


## f20

In [23]:
df["f20"].apply(lambda x: 0 if x == 2 else x).astype(bool).value_counts()

f20
True     80726
False     3131
Name: count, dtype: int64

## f23

In [33]:
df["f23"].apply(
        lambda x: (
            "4_ESO"
            if x == 1
            else (
                "INT_FP"
                if x == 2
                else (
                    "BACH_ATO"
                    if x == 3
                    else (
                        "UP_FP"
                        if x == 4
                        else (
                            "BACH_DEG"
                            if x == 5
                            else ("DONT_KNOW" if x == 9 else np.nan)
                        )
                    )
                )
            )
        )
    ).unique()

array(['BACH_DEG', nan, 'BACH_ATO', 'UP_FP', '4_ESO', 'INT_FP',
       'DONT_KNOW'], dtype=object)

In [221]:
current_columns = ["a6nm", "a7", "a61", "a71",]
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

MISSING VALUES
a6nm    62.861777
a7      19.784872
a61     50.394123
a71     74.969293
dtype: float64


Unnamed: 0,a6nm,a7,a61,a71,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
a6nm,1.0,0.070558,-0.038283,,0.045842,0.040847,0.034946,0.036581,0.008578,0.009152
a7,0.070558,1.0,0.095849,0.197868,0.069256,0.066814,0.076241,0.072528,0.087214,0.083958
a61,-0.038283,0.095849,1.0,,0.038819,0.038117,0.055529,0.043835,0.113357,0.109543
a71,,0.197868,,1.0,0.038828,0.0343,0.04343,0.038468,0.026931,0.028033
score_MAT,0.045842,0.069256,0.038819,0.038828,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.040847,0.066814,0.038117,0.0343,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.034946,0.076241,0.055529,0.04343,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.036581,0.072528,0.043835,0.038468,0.458407,0.439989,0.94985,1.0,0.481747,0.46406
score_ING,0.008578,0.087214,0.113357,0.026931,0.457797,0.435458,0.503339,0.481747,1.0,0.955457
level_ING,0.009152,0.083958,0.109543,0.028033,0.435502,0.416475,0.482328,0.46406,0.955457,1.0


In [225]:
df[["a6nm", "a7", "a61", "a71"]]
df[["a6nm", "a7"]].value_counts()

a6nm  a7 
30.0  4.0    6476
      3.0    4789
      2.0    2455
0.0   4.0    1472
      3.0    1045
             ... 
14.0  0.0       1
75.0  1.0       1
      2.0       1
      3.0       1
3.0   0.0       1
Name: count, Length: 308, dtype: int64

## f31

In [36]:
df["f31"].apply(
        lambda x: (
            "MOTHER_FATHER_CHILDREN"
            if x == 1
            else (
                "MOTHER_PARTNER_CHILDREN"
                if x == 2
                else (
                    "FATHER_PARTNER_CHILDREN"
                    if x == 3
                    else (
                        "MOTHER_CHILDREN"
                        if x == 4
                        else (
                            "FATHER_CHILDREN"
                            if x == 5
                            else (
                                "RELATIVES_CHILDREN"
                                if x == 6
                                else ("OTHERS" if x == 7 else np.nan)
                            )
                        )
                    )
                )
            )
        )
    ).unique()

array([nan, 'MOTHER_FATHER_CHILDREN', 'MOTHER_PARTNER_CHILDREN',
       'MOTHER_CHILDREN', 'FATHER_PARTNER_CHILDREN', 'RELATIVES_CHILDREN',
       'FATHER_CHILDREN', 'OTHERS'], dtype=object)

In [7]:
df["f34"].apply(
        lambda x: "NO_ANSWER" if x == 10 else x
    )

id_student
1              NaN
2              NaN
3              NaN
4              NaN
5              NaN
           ...    
83853          NaN
83854          9.0
83855          5.0
83856    NO_ANSWER
83857          3.0
Name: f34, Length: 83857, dtype: object