# Imports

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import tree

from src.pre_processing.macros import column_groups, DATA_PATH, DATA_SPLIT_PATH, ORIGINAL_DATASET_NAME

# Loading

In [2]:
# Read the principal-questionnaire split and key its rows by student id.
questionnaire_path = os.path.join(DATA_SPLIT_PATH, "principal_questionnaire.csv")
df = pd.read_csv(questionnaire_path, low_memory=False).set_index("id_student")
df

Unnamed: 0_level_0,d1,d2n,d3n,d4n,d5n,d6n,d7n,d8n,d9a1,d9a2,...,tasa_nac_pri3,tasa_nac_pri6,distnac,distnac_eso4,distnac_pri3,distnac_pri6,groups,island,capital_island,public_private
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,50.0,17.0,3.0,3.0,3.0,12.0,3.0,275.0,13.0,...,,,,,,,,4.0,1.0,2.0
2,2.0,58.0,31.0,19.0,11.0,11.0,6.0,1.0,377.0,16.0,...,,,,,,,,1.0,1.0,2.0
3,2.0,45.0,15.0,7.0,8.0,7.0,10.0,2.0,658.0,26.0,...,,,,,,,,1.0,1.0,1.0
4,1.0,46.0,20.0,9.0,3.0,3.0,10.0,4.0,185.0,9.0,...,,,,,,,,4.0,1.0,2.0
5,1.0,56.0,25.0,24.0,14.0,14.0,13.0,1.0,,,...,,,,,,,,4.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,2.0,59.0,34.0,23.0,5.0,4.0,8.0,2.0,,,...,,,,,,,,1.0,1.0,2.0
83854,2.0,63.0,32.0,4.0,12.0,4.0,4.0,2.0,,,...,,,,,,,,1.0,1.0,1.0
83855,2.0,54.0,23.0,10.0,3.0,3.0,14.0,2.0,,,...,,,,,,,,4.0,1.0,2.0
83856,1.0,59.0,36.0,36.0,12.0,12.0,10.0,1.0,,,...,,,,,,,,1.0,1.0,2.0


In [3]:
# Read the identifier table, index it by student id, and cast every
# identifier column except the class-group label to the "Int64" dtype
# (the columns are read as floats).
ids = pd.read_csv(
    os.path.join(DATA_SPLIT_PATH, "identifiers.csv"),
    low_memory=False,
).set_index("id_student")
int_identifiers = [name for name in ids.columns if name != "id_class_group"]
ids = ids.astype({name: "Int64" for name in int_identifiers})
ids

Unnamed: 0_level_0,id_student_original,id_year,id_grade,id_class_group,id_school,id_student_16_19,id_school_16_19
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,33613,2016,3,A,2415,14374,569
2,19294,2016,3,A,1842,8001,273
3,19587,2016,3,,1432,8142,82
4,29985,2016,3,A,2280,12800,505
5,6982,2016,3,A,2040,2606,390
...,...,...,...,...,...,...,...
83853,2500,2019,6,B,1278,,271
83854,5784,2019,6,A,1079,,78
83855,7708,2019,6,X,1486,3036,459
83856,18965,2019,6,B,1036,9913,36


In [4]:
# Read the per-student score table and index it by student id.
# No dtype conversion is applied to the score/level columns here.
scores = pd.read_csv(
    os.path.join(DATA_SPLIT_PATH, "student_scores.csv"),
    low_memory=False,
).set_index("id_student")
scores

Unnamed: 0_level_0,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,564.8700,3.0,535.1500,3.0,,
2,388.3400,1.0,293.7000,1.0,,
3,386.5900,1.0,514.8100,3.0,,
4,487.7600,2.0,449.2500,2.0,,
5,709.7900,4.0,598.7200,3.0,,
...,...,...,...,...,...,...
83853,400.8625,2.0,446.6522,2.0,294.7474,1.0
83854,597.0243,3.0,632.6043,4.0,633.2966,4.0
83855,707.9254,4.0,400.2761,2.0,477.5056,2.0
83856,522.8511,3.0,656.1601,4.0,540.1122,3.0


In [5]:
# Attach the scores to the identifiers, then the questionnaire answers to
# that result, matching rows on the index of each frame (default inner join).
# NOTE: both `ids` and `df` are overwritten here, so re-running this cell
# without re-running the loading cells merges already-merged frames.
ids = ids.merge(scores, left_index=True, right_index=True)
df = ids.merge(df, left_index=True, right_index=True)
df

Unnamed: 0_level_0,id_student_original,id_year,id_grade,id_class_group,id_school,id_student_16_19,id_school_16_19,score_MAT,level_MAT,score_LEN,...,tasa_nac_pri3,tasa_nac_pri6,distnac,distnac_eso4,distnac_pri3,distnac_pri6,groups,island,capital_island,public_private
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,33613,2016,3,A,2415,14374,569,564.8700,3.0,535.1500,...,,,,,,,,4.0,1.0,2.0
2,19294,2016,3,A,1842,8001,273,388.3400,1.0,293.7000,...,,,,,,,,1.0,1.0,2.0
3,19587,2016,3,,1432,8142,82,386.5900,1.0,514.8100,...,,,,,,,,1.0,1.0,1.0
4,29985,2016,3,A,2280,12800,505,487.7600,2.0,449.2500,...,,,,,,,,4.0,1.0,2.0
5,6982,2016,3,A,2040,2606,390,709.7900,4.0,598.7200,...,,,,,,,,4.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,2500,2019,6,B,1278,,271,400.8625,2.0,446.6522,...,,,,,,,,1.0,1.0,2.0
83854,5784,2019,6,A,1079,,78,597.0243,3.0,632.6043,...,,,,,,,,1.0,1.0,1.0
83855,7708,2019,6,X,1486,3036,459,707.9254,4.0,400.2761,...,,,,,,,,4.0,1.0,2.0
83856,18965,2019,6,B,1036,9913,36,522.8511,3.0,656.1601,...,,,,,,,,1.0,1.0,2.0


# Analysis

## Utils

In [6]:
def print_statistics(columns, data=None):
    """Print missing-value percentages and return correlations with the scores.

    For every requested column the percentage of rows with a missing value is
    printed under the "MISSING VALUES" heading (the figures are percentages of
    the frame's row count, not raw counts).

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to inspect. Any iterable is accepted (list,
        tuple, pandas Index, ...); it is copied into a list internally.
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the notebook-level ``df`` so existing
        calls keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        Result of ``DataFrame.corr()`` over the requested columns followed by
        the six score/level columns.
    """
    frame = df if data is None else data
    # Normalise to a list so that concatenation with the score columns works
    # for tuples, Index objects and other non-list iterables.
    selected = list(columns)
    score_columns = [
        "score_MAT",
        "level_MAT",
        "score_LEN",
        "level_LEN",
        "score_ING",
        "level_ING",
    ]

    print("MISSING VALUES")
    print(frame[selected].isna().sum() / frame.shape[0] * 100)

    return frame[selected + score_columns].corr()

## d1

In [7]:
col = "d1"
# Lookup used for the recoding below; codes not listed here map to NaN.
sex_labels = {1: "MALE", 2: "FEMALE"}

# Distribution of the raw codes.
print("OLD")
print(df[col].value_counts())
print(f"Missing: {df[col].isna().sum()}")
print()

# Distribution after recoding; missing values stay missing.
print("NEW")
new_df = df[col].apply(lambda code: sex_labels.get(code, np.nan))
print(new_df.value_counts())
print(f"Missing: {new_df.isna().sum()}")
new_df

OLD
d1
2.0    40816
1.0    26224
Name: count, dtype: int64
Missing: 16817

NEW
d1
FEMALE    40816
MALE      26224
Name: count, dtype: int64
Missing: 16817


id_student
1          MALE
2        FEMALE
3        FEMALE
4          MALE
5          MALE
          ...  
83853    FEMALE
83854    FEMALE
83855    FEMALE
83856      MALE
83857      MALE
Name: d1, Length: 83857, dtype: object

## d17a, ..., d17h

In [8]:
# Questionnaire items d17a through d17h.
column_number = 17
column_letters = list("abcdefgh")
current_columns = ["d{}{}".format(column_number, letter) for letter in column_letters]
print(current_columns)
# Prints the missing-value percentages; the returned correlation matrix is
# rendered with a colour gradient as the cell output.
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

['d17a', 'd17b', 'd17c', 'd17d', 'd17e', 'd17f', 'd17g', 'd17h']
MISSING VALUES
d17a    10.383152
d17b    10.701551
d17c    10.595418
d17d    10.368842
d17e    10.517905
d17f    10.459473
d17g    10.463050
d17h    10.399847
dtype: float64


Unnamed: 0,d17a,d17b,d17c,d17d,d17e,d17f,d17g,d17h,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
d17a,1.0,0.143033,0.165852,0.051138,0.238701,0.209618,0.033007,0.184019,-0.042562,-0.040485,-0.032871,-0.030137,-0.072747,-0.069191
d17b,0.143033,1.0,0.259116,0.226651,0.350842,0.360548,0.255334,0.198433,-0.050444,-0.048392,-0.055605,-0.052426,-0.080383,-0.072192
d17c,0.165852,0.259116,1.0,0.156001,0.280926,0.196043,0.112113,0.172591,-0.138179,-0.13157,-0.129346,-0.124333,-0.185693,-0.175657
d17d,0.051138,0.226651,0.156001,1.0,0.414681,0.186404,0.287247,0.227008,-0.006171,-0.005433,-0.007462,-0.00783,-0.01896,-0.01867
d17e,0.238701,0.350842,0.280926,0.414681,1.0,0.291448,0.310887,0.385844,-0.031678,-0.030399,-0.032204,-0.031003,-0.061349,-0.057823
d17f,0.209618,0.360548,0.196043,0.186404,0.291448,1.0,0.164727,0.223079,-0.036134,-0.036344,-0.037483,-0.033741,-0.059712,-0.055591
d17g,0.033007,0.255334,0.112113,0.287247,0.310887,0.164727,1.0,0.303917,-0.010781,-0.009796,-0.01909,-0.017317,-0.031387,-0.030321
d17h,0.184019,0.198433,0.172591,0.227008,0.385844,0.223079,0.303917,1.0,0.00172,0.002478,-0.003078,-0.00455,-0.016053,-0.014066
score_MAT,-0.042562,-0.050444,-0.138179,-0.006171,-0.031678,-0.036134,-0.010781,0.00172,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,-0.040485,-0.048392,-0.13157,-0.005433,-0.030399,-0.036344,-0.009796,0.002478,0.949125,1.0,0.456791,0.439989,0.435458,0.416475


## d18a, ..., d18n

In [10]:
# Questionnaire items d18a through d18n.
column_number = 18
column_letters = list("abcdefghijklmn")
current_columns = ["d{}{}".format(column_number, letter) for letter in column_letters]
print(current_columns)
# Prints the missing-value percentages; the returned correlation matrix is
# rendered with a colour gradient as the cell output.
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

['d18a', 'd18b', 'd18c', 'd18d', 'd18e', 'd18f', 'd18g', 'd18h', 'd18i', 'd18j', 'd18k', 'd18l', 'd18m', 'd18n']
MISSING VALUES
d18a    10.438008
d18b    10.379575
d18c    10.553681
d18d    10.755214
d18e    10.337837
d18f    10.268672
d18g    10.331875
d18h    10.529831
d18i    10.311602
d18j    10.554873
d18k    10.315179
d18l    10.447548
d18m    10.438008
d18n    65.646279
dtype: float64


Unnamed: 0,d18a,d18b,d18c,d18d,d18e,d18f,d18g,d18h,d18i,d18j,d18k,d18l,d18m,d18n,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
d18a,1.0,0.502942,0.399819,0.381158,0.298599,0.350784,0.302043,0.302301,0.207602,0.182805,0.406414,0.225017,0.038474,0.164137,-0.089221,-0.086975,-0.088986,-0.082613,-0.119767,-0.115363
d18b,0.502942,1.0,0.314205,0.330855,0.360421,0.30794,0.215072,0.309945,0.198167,0.108806,0.239831,0.146517,0.1121,0.085994,-0.021274,-0.02145,-0.038116,-0.037344,-0.035215,-0.034654
d18c,0.399819,0.314205,1.0,0.567907,0.266619,0.336214,0.506958,0.308876,0.255635,0.053498,0.41585,0.058577,-0.155244,0.405073,-0.097927,-0.092045,-0.079088,-0.073765,-0.084501,-0.084576
d18d,0.381158,0.330855,0.567907,1.0,0.498352,0.500102,0.425125,0.324382,0.283859,0.193649,0.274317,0.104074,0.016544,0.602162,-0.070036,-0.067274,-0.063382,-0.059543,-0.072057,-0.069867
d18e,0.298599,0.360421,0.266619,0.498352,1.0,0.523882,0.226632,0.207662,0.256962,0.288656,0.165451,0.222188,0.196835,0.342,-0.076136,-0.07286,-0.070042,-0.067046,-0.094886,-0.090735
d18f,0.350784,0.30794,0.336214,0.500102,0.523882,1.0,0.343096,0.256841,0.290415,0.236672,0.193134,0.170539,0.161817,0.430113,-0.073098,-0.071925,-0.058719,-0.056647,-0.097713,-0.091989
d18g,0.302043,0.215072,0.506958,0.425125,0.226632,0.343096,1.0,0.333735,0.214021,0.088527,0.40201,0.043236,-0.05149,0.310493,-0.072912,-0.068909,-0.056596,-0.05064,-0.070845,-0.074086
d18h,0.302301,0.309945,0.308876,0.324382,0.207662,0.256841,0.333735,1.0,0.346712,0.197818,0.265642,0.201032,0.123274,0.301842,-0.045495,-0.042108,-0.048116,-0.042615,-0.113289,-0.107902
d18i,0.207602,0.198167,0.255635,0.283859,0.256962,0.290415,0.214021,0.346712,1.0,0.22492,0.15155,0.164283,0.184324,0.26373,-0.037962,-0.036836,-0.033775,-0.031918,-0.078377,-0.072431
d18j,0.182805,0.108806,0.053498,0.193649,0.288656,0.236672,0.088527,0.197818,0.22492,1.0,0.157154,0.450995,0.414564,0.273744,-0.074533,-0.072484,-0.074218,-0.072291,-0.112656,-0.104517


## d19a, ..., d19r

In [11]:
# Questionnaire items d19a through d19r.
column_number = 19
column_letters = list("abcdefghijklmnopqr")
current_columns = ["d{}{}".format(column_number, letter) for letter in column_letters]
print(current_columns)
# Prints the missing-value percentages; the returned correlation matrix is
# rendered with a colour gradient as the cell output.
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

['d19a', 'd19b', 'd19c', 'd19d', 'd19e', 'd19f', 'd19g', 'd19h', 'd19i', 'd19j', 'd19k', 'd19l', 'd19m', 'd19n', 'd19o', 'd19p', 'd19q', 'd19r']
MISSING VALUES
d19a    10.350955
d19b    10.391500
d19c    10.575146
d19d    10.672931
d19e    10.709899
d19f    10.602573
d19g    10.308024
d19h    10.311602
d19i    10.547718
d19j    10.558451
d19k    10.669354
d19l    10.903085
d19m    10.669354
d19n    10.693204
d19o    10.478553
d19p    10.478553
d19q    10.500018
d19r    10.542948
dtype: float64


Unnamed: 0,d19a,d19b,d19c,d19d,d19e,d19f,d19g,d19h,d19i,d19j,d19k,d19l,d19m,d19n,d19o,d19p,d19q,d19r,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
d19a,1.0,0.755526,0.612054,0.634713,0.621565,0.621884,0.65336,0.663168,0.642138,0.654994,0.626522,0.638456,0.610875,0.616494,0.462081,0.593621,0.625956,0.611038,-0.036862,-0.037989,-0.034643,-0.034796,-0.025899,-0.023003
d19b,0.755526,1.0,0.67763,0.738208,0.697091,0.702673,0.732011,0.771573,0.765813,0.747338,0.742355,0.73861,0.715431,0.734526,0.511204,0.630841,0.722603,0.694974,-0.04166,-0.043754,-0.043365,-0.041913,-0.036488,-0.033681
d19c,0.612054,0.67763,1.0,0.614062,0.765459,0.688447,0.71844,0.758286,0.659448,0.590267,0.570213,0.599988,0.56524,0.562994,0.551442,0.608303,0.64839,0.619473,-0.068506,-0.069877,-0.072291,-0.070563,-0.061453,-0.056296
d19d,0.634713,0.738208,0.614062,1.0,0.67631,0.74052,0.808923,0.801631,0.894966,0.884882,0.867999,0.895552,0.900784,0.913645,0.394221,0.64759,0.783682,0.755067,-0.012517,-0.016152,-0.016688,-0.016896,-0.022984,-0.019074
d19e,0.621565,0.697091,0.765459,0.67631,1.0,0.761148,0.770942,0.786822,0.718989,0.647563,0.63133,0.66247,0.637278,0.631011,0.530032,0.64757,0.71716,0.693843,-0.057587,-0.058454,-0.063134,-0.061202,-0.033268,-0.030963
d19f,0.621884,0.702673,0.688447,0.74052,0.761148,1.0,0.800457,0.809403,0.78391,0.726639,0.697975,0.72534,0.713951,0.714311,0.474873,0.643652,0.733333,0.71758,-0.030202,-0.031183,-0.034521,-0.033922,-0.03798,-0.034896
d19g,0.65336,0.732011,0.71844,0.808923,0.770942,0.800457,1.0,0.845252,0.845835,0.793587,0.777825,0.807551,0.790865,0.800407,0.501285,0.696026,0.79002,0.782341,-0.044071,-0.045742,-0.050885,-0.048187,-0.021647,-0.019755
d19h,0.663168,0.771573,0.758286,0.801631,0.786822,0.809403,0.845252,1.0,0.878051,0.7962,0.777976,0.808202,0.788912,0.790883,0.518937,0.676252,0.798597,0.753364,-0.037442,-0.038894,-0.04796,-0.045062,-0.037183,-0.03302
d19i,0.642138,0.765813,0.659448,0.894966,0.718989,0.78391,0.845835,0.878051,1.0,0.883819,0.878711,0.898588,0.897306,0.908441,0.426494,0.67648,0.828203,0.789839,-0.012756,-0.016218,-0.021758,-0.021036,-0.020112,-0.016155
d19j,0.654994,0.747338,0.590267,0.884882,0.647563,0.726639,0.793587,0.7962,0.883819,1.0,0.923415,0.921436,0.920839,0.926296,0.396364,0.65773,0.784122,0.737821,-0.000664,-0.004249,-0.014243,-0.014084,-0.020561,-0.01743


## d20a, ..., d20l

In [12]:
# Questionnaire items d20a through d20l.
column_number = 20
column_letters = list("abcdefghijkl")
current_columns = ["d{}{}".format(column_number, letter) for letter in column_letters]
print(current_columns)
# Prints the missing-value percentages; the returned correlation matrix is
# rendered with a colour gradient as the cell output.
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

['d20a', 'd20b', 'd20c', 'd20d', 'd20e', 'd20f', 'd20g', 'd20h', 'd20i', 'd20j', 'd20k', 'd20l']
MISSING VALUES
d20a    10.407002
d20b    10.285367
d20c    10.449933
d20d    10.519098
d20e    10.578723
d20f    10.404617
d20g    10.374805
d20h    55.339447
d20i    55.339447
d20j    55.339447
d20k    55.401457
d20l    55.339447
dtype: float64


Unnamed: 0,d20a,d20b,d20c,d20d,d20e,d20f,d20g,d20h,d20i,d20j,d20k,d20l,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
d20a,1.0,0.419394,0.387971,0.393508,0.413629,0.374262,0.359689,0.319968,0.306045,0.335142,0.322803,0.323179,0.029022,0.026282,0.04277,0.042388,0.036342,0.037767
d20b,0.419394,1.0,0.375163,0.329675,0.358972,0.338635,0.28217,0.305556,0.198616,0.208552,0.353877,0.293412,-0.012232,-0.012188,0.007313,0.008193,-0.020797,-0.019132
d20c,0.387971,0.375163,1.0,0.486241,0.543739,0.505115,0.497893,0.363116,0.340276,0.437335,0.251687,0.345804,0.021399,0.01921,0.031317,0.032857,0.020621,0.02178
d20d,0.393508,0.329675,0.486241,1.0,0.522014,0.498636,0.401961,0.442057,0.448041,0.479704,0.311117,0.427409,0.057035,0.053459,0.077393,0.076054,0.057493,0.054233
d20e,0.413629,0.358972,0.543739,0.522014,1.0,0.596055,0.501905,0.345902,0.400033,0.475159,0.299004,0.402048,0.032375,0.029509,0.045437,0.043744,0.053084,0.050625
d20f,0.374262,0.338635,0.505115,0.498636,0.596055,1.0,0.464069,0.420739,0.437688,0.448398,0.337733,0.404475,0.01635,0.014878,0.030475,0.028753,0.03774,0.035785
d20g,0.359689,0.28217,0.497893,0.401961,0.501905,0.464069,1.0,0.359339,0.376011,0.467351,0.196738,0.32532,0.009754,0.007765,0.039004,0.038098,0.035434,0.034763
d20h,0.319968,0.305556,0.363116,0.442057,0.345902,0.420739,0.359339,1.0,0.555609,0.423164,0.388277,0.420402,-0.007129,-0.011911,0.028153,0.029885,0.021121,0.023081
d20i,0.306045,0.198616,0.340276,0.448041,0.400033,0.437688,0.376011,0.555609,1.0,0.527069,0.295403,0.447815,0.046375,0.038734,0.055299,0.056005,0.079619,0.079028
d20j,0.335142,0.208552,0.437335,0.479704,0.475159,0.448398,0.467351,0.423164,0.527069,1.0,0.238173,0.453959,0.043372,0.037029,0.053223,0.051018,0.072668,0.07095


## d21a, ..., d21f

In [13]:
# Questionnaire items d21a through d21f.
column_number = 21
column_letters = list("abcdef")
current_columns = ["d{}{}".format(column_number, letter) for letter in column_letters]
print(current_columns)
# Prints the missing-value percentages; the returned correlation matrix is
# rendered with a colour gradient as the cell output.
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

['d21a', 'd21b', 'd21c', 'd21d', 'd21e', 'd21f']
MISSING VALUES
d21a    10.380767
d21b    10.421312
d21c    10.395077
d21d    10.533408
d21e    10.440393
d21f    10.360495
dtype: float64


Unnamed: 0,d21a,d21b,d21c,d21d,d21e,d21f,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
d21a,1.0,0.468456,0.369152,0.101979,0.231595,0.315632,0.074914,0.071787,0.064574,0.062249,0.105778,0.102108
d21b,0.468456,1.0,0.514982,0.128714,0.135743,0.220954,0.081215,0.075875,0.07681,0.076709,0.093335,0.090406
d21c,0.369152,0.514982,1.0,0.112157,0.150888,0.193498,0.111716,0.106077,0.099239,0.095895,0.149071,0.143139
d21d,0.101979,0.128714,0.112157,1.0,0.432506,0.315844,-0.085879,-0.08209,-0.045031,-0.044053,-0.083805,-0.082035
d21e,0.231595,0.135743,0.150888,0.432506,1.0,0.346838,-0.062749,-0.058774,-0.035681,-0.034435,-0.032974,-0.033487
d21f,0.315632,0.220954,0.193498,0.315844,0.346838,1.0,0.003632,0.005159,0.019525,0.015921,0.020743,0.020065
score_MAT,0.074914,0.081215,0.111716,-0.085879,-0.062749,0.003632,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.071787,0.075875,0.106077,-0.08209,-0.058774,0.005159,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.064574,0.07681,0.099239,-0.045031,-0.035681,0.019525,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.062249,0.076709,0.095895,-0.044053,-0.034435,0.015921,0.458407,0.439989,0.94985,1.0,0.481747,0.46406


## d22a, ..., d22f

In [14]:
# Questionnaire items d22a through d22f.
column_number = 22
column_letters = list("abcdef")
current_columns = ["d{}{}".format(column_number, letter) for letter in column_letters]
print(current_columns)
# Prints the missing-value percentages; the returned correlation matrix is
# rendered with a colour gradient as the cell output.
corr = print_statistics(columns=current_columns)
corr.style.background_gradient(cmap="coolwarm")

['d22a', 'd22b', 'd22c', 'd22d', 'd22e', 'd22f']
MISSING VALUES
d22a    10.269864
d22b    10.186389
d22c    10.182811
d22d    10.182811
d22e    10.395077
d22f    10.410580
dtype: float64


Unnamed: 0,d22a,d22b,d22c,d22d,d22e,d22f,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
d22a,1.0,0.233313,0.27738,0.379327,0.490701,0.306995,0.110676,0.103819,0.099381,0.097516,0.136239,0.129839
d22b,0.233313,1.0,0.247114,0.240423,0.278644,0.151597,0.029272,0.025019,0.050917,0.049164,0.042397,0.039527
d22c,0.27738,0.247114,1.0,0.481844,0.416097,0.316976,0.007786,0.00731,0.026189,0.024784,0.022277,0.022124
d22d,0.379327,0.240423,0.481844,1.0,0.608995,0.412639,0.033621,0.031306,0.041892,0.039732,0.046394,0.046099
d22e,0.490701,0.278644,0.416097,0.608995,1.0,0.365556,0.061873,0.056815,0.056357,0.05532,0.079717,0.076395
d22f,0.306995,0.151597,0.316976,0.412639,0.365556,1.0,0.091285,0.086446,0.072142,0.070785,0.125446,0.120302
score_MAT,0.110676,0.029272,0.007786,0.033621,0.061873,0.091285,1.0,0.949125,0.478365,0.458407,0.457797,0.435502
level_MAT,0.103819,0.025019,0.00731,0.031306,0.056815,0.086446,0.949125,1.0,0.456791,0.439989,0.435458,0.416475
score_LEN,0.099381,0.050917,0.026189,0.041892,0.056357,0.072142,0.478365,0.456791,1.0,0.94985,0.503339,0.482328
level_LEN,0.097516,0.049164,0.024784,0.039732,0.05532,0.070785,0.458407,0.439989,0.94985,1.0,0.481747,0.46406


## d30, ...

In [17]:
col = "d307"

# Distribution of the raw codes.
print("OLD")
print(df[col].value_counts())
print(f"Missing: {df[col].isna().sum()}")
print()

# Recode to a boolean flag: code 2 and missing answers both become False,
# so the recoded column has no missing values left.
print("NEW")
new_df = df[col].replace(2, 0).fillna(0).astype("boolean")
print(new_df.value_counts())
print(f"Missing: {new_df.isna().sum()}")
new_df

OLD
d307
1.0    8916
Name: count, dtype: int64
Missing: 74941

NEW
d307
False    74941
True      8916
Name: count, dtype: Int64
Missing: 0


id_student
1        False
2        False
3        False
4        False
5        False
         ...  
83853    False
83854     True
83855    False
83856    False
83857    False
Name: d307, Length: 83857, dtype: boolean

## d32a

In [18]:
col = "d32a"

# Distribution of the raw codes.
print("OLD")
print(df[col].value_counts())
print(f"Missing: {df[col].isna().sum()}")
print()

# Recode to a nullable boolean: code 2 becomes False, other values are
# passed through to the cast, and missing answers remain missing.
print("NEW")
new_df = df[col].replace(2, 0).astype("boolean")
print(new_df.value_counts())
print(f"Missing: {new_df.isna().sum()}")
new_df

OLD
d32a
1.0    33485
2.0     3966
Name: count, dtype: int64
Missing: 46406

NEW
d32a
True     33485
False     3966
Name: count, dtype: Int64
Missing: 46406


id_student
1         <NA>
2         <NA>
3         <NA>
4         <NA>
5         <NA>
         ...  
83853     True
83854     True
83855    False
83856     True
83857     True
Name: d32a, Length: 83857, dtype: boolean

## groups

In [23]:
# Largest value found in the "groups" column.
df.loc[:, "groups"].max()

6.0

## island

In [26]:
col = "island"
# Lookup used for the recoding below; codes not listed here map to NaN.
island_labels = {
    1: "GRAN_CANARIA_CAPITAL",
    2: "GRAN_CANARIA_PROVINCE",
    3: "TENERIFE_PROVINCE",
    4: "TENERIFE_CAPITAL",
}

# Distribution of the raw codes.
print("OLD")
print(df[col].value_counts())
print(f"Missing: {df[col].isna().sum()}")
print()

# Distribution after recoding; missing values stay missing.
print("NEW")
new_df = df[col].apply(lambda code: island_labels.get(code, np.nan))
print(new_df.value_counts())
print(f"Missing: {new_df.isna().sum()}")
new_df

OLD
island
4.0    36396
1.0    32797
2.0    11146
3.0     2695
Name: count, dtype: int64
Missing: 823

NEW
island
TENERIFE_CAPITAL         36396
GRAN_CANARIA_CAPITAL     32797
GRAN_CANARIA_PROVINCE    11146
TENERIFE_PROVINCE         2695
Name: count, dtype: int64
Missing: 823


id_student
1            TENERIFE_CAPITAL
2        GRAN_CANARIA_CAPITAL
3        GRAN_CANARIA_CAPITAL
4            TENERIFE_CAPITAL
5            TENERIFE_CAPITAL
                 ...         
83853    GRAN_CANARIA_CAPITAL
83854    GRAN_CANARIA_CAPITAL
83855        TENERIFE_CAPITAL
83856    GRAN_CANARIA_CAPITAL
83857    GRAN_CANARIA_CAPITAL
Name: island, Length: 83857, dtype: object

## public_private

In [27]:
col = "public_private"
# Lookup used for the recoding below; codes not listed here map to NaN.
ownership_labels = {1: "PRIVATE", 2: "PUBLIC"}

# Distribution of the raw codes.
print("OLD")
print(df[col].value_counts())
print(f"Missing: {df[col].isna().sum()}")
print()

# Distribution after recoding; missing values stay missing.
print("NEW")
new_df = df[col].apply(lambda code: ownership_labels.get(code, np.nan))
print(new_df.value_counts())
print(f"Missing: {new_df.isna().sum()}")
new_df

OLD
public_private
2.0    62959
1.0    20075
Name: count, dtype: int64
Missing: 823

NEW
public_private
PUBLIC     62959
PRIVATE    20075
Name: count, dtype: int64
Missing: 823


id_student
1         PUBLIC
2         PUBLIC
3        PRIVATE
4         PUBLIC
5        PRIVATE
          ...   
83853     PUBLIC
83854    PRIVATE
83855     PUBLIC
83856     PUBLIC
83857     PUBLIC
Name: public_private, Length: 83857, dtype: object