# Imports

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import tree

from src.pre_processing.macros import column_groups, DATA_PATH, DATA_SPLIT_PATH, ORIGINAL_DATASET_NAME

# Loading

In [2]:
# Load teacher_questionnaire.csv and key every row by its student id.
df = (
    pd.read_csv(
        os.path.join(DATA_SPLIT_PATH, "teacher_questionnaire.csv"),
        low_memory=False,
    )
    .set_index("id_student")
)
df

Unnamed: 0_level_0,p2,p2n,p3n,p4n,p5,p6n,p7an,p7bn,p7cn,p7dn,...,p331a,p331b,p331c,p331d,p331e,p331f,p331g,p331j,pfc,rep
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,2.0,56.0,29.0,18.0,2.0,26.0,,,1.0,,...,2.0,2.0,3.0,2.0,3.0,3.0,,,,
5,2.0,44.0,20.0,17.0,2.0,26.0,0.0,0.0,0.0,0.0,...,4.0,,4.0,4.0,4.0,3.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,2.0,34.0,1.0,1.0,,,3.0,2.0,1.0,1.0,...,4.0,4.0,4.0,4.0,4.0,,5.0,3.0,10.0,2.0
83854,1.0,54.0,30.0,30.0,,,0.0,0.0,0.0,0.0,...,3.0,4.0,4.0,4.0,4.0,,5.0,4.0,3.0,2.0
83855,,,,,,,,,,,...,,,,,,,,,,
83856,2.0,56.0,22.0,4.0,,,1.0,1.0,1.0,0.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,8.0,1.0


In [3]:
# Load identifiers.csv, indexed by student id.
ids = (
    pd.read_csv(os.path.join(DATA_SPLIT_PATH, "identifiers.csv"), low_memory=False)
    .set_index("id_student")
)

# Every column except "id_class_group" is cast to pandas' nullable "Int64"
# dtype, so identifiers read as floats become integers while missing ones
# stay missing.
int_identifiers = [col for col in ids.columns if col != "id_class_group"]
ids = ids.astype({col: "Int64" for col in int_identifiers})
ids

Unnamed: 0_level_0,id_student_original,id_year,id_grade,id_class_group,id_school,id_student_16_19,id_school_16_19
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,33613,2016,3,A,2415,14374,569
2,19294,2016,3,A,1842,8001,273
3,19587,2016,3,,1432,8142,82
4,29985,2016,3,A,2280,12800,505
5,6982,2016,3,A,2040,2606,390
...,...,...,...,...,...,...,...
83853,2500,2019,6,B,1278,,271
83854,5784,2019,6,A,1079,,78
83855,7708,2019,6,X,1486,3036,459
83856,18965,2019,6,B,1036,9913,36


In [4]:
# Load student_scores.csv, indexed by student id.
scores = (
    pd.read_csv(
        os.path.join(DATA_SPLIT_PATH, "student_scores.csv"),
        low_memory=False,
    )
    .set_index("id_student")
)
scores

Unnamed: 0_level_0,score_MAT,level_MAT,score_LEN,level_LEN,score_ING,level_ING
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,564.8700,3.0,535.1500,3.0,,
2,388.3400,1.0,293.7000,1.0,,
3,386.5900,1.0,514.8100,3.0,,
4,487.7600,2.0,449.2500,2.0,,
5,709.7900,4.0,598.7200,3.0,,
...,...,...,...,...,...,...
83853,400.8625,2.0,446.6522,2.0,294.7474,1.0
83854,597.0243,3.0,632.6043,4.0,633.2966,4.0
83855,707.9254,4.0,400.2761,2.0,477.5056,2.0
83856,522.8511,3.0,656.1601,4.0,540.1122,3.0


In [5]:
# Combine identifiers, scores and questionnaire answers on the shared
# ``id_student`` index (default inner join: only students present on both
# sides of each merge are kept).
#
# ``ids`` and ``df`` are reassigned from themselves here, so a plain merge
# would add suffixed duplicate columns (``score_MAT_x`` / ``score_MAT_y`` ...)
# every time the cell is re-run. Merging only the columns that the left side
# does not have yet makes the cell safe to execute more than once.
new_score_columns = [col for col in scores.columns if col not in ids.columns]
ids = pd.merge(ids, scores[new_score_columns], left_index=True, right_index=True)

new_answer_columns = [col for col in df.columns if col not in ids.columns]
df = pd.merge(ids, df[new_answer_columns], left_index=True, right_index=True)
df

Unnamed: 0_level_0,id_student_original,id_year,id_grade,id_class_group,id_school,id_student_16_19,id_school_16_19,score_MAT,level_MAT,score_LEN,...,p331a,p331b,p331c,p331d,p331e,p331f,p331g,p331j,pfc,rep
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,33613,2016,3,A,2415,14374,569,564.8700,3.0,535.1500,...,,,,,,,,,,
2,19294,2016,3,A,1842,8001,273,388.3400,1.0,293.7000,...,,,,,,,,,,
3,19587,2016,3,,1432,8142,82,386.5900,1.0,514.8100,...,,,,,,,,,,
4,29985,2016,3,A,2280,12800,505,487.7600,2.0,449.2500,...,2.0,2.0,3.0,2.0,3.0,3.0,,,,
5,6982,2016,3,A,2040,2606,390,709.7900,4.0,598.7200,...,4.0,,4.0,4.0,4.0,3.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83853,2500,2019,6,B,1278,,271,400.8625,2.0,446.6522,...,4.0,4.0,4.0,4.0,4.0,,5.0,3.0,10.0,2.0
83854,5784,2019,6,A,1079,,78,597.0243,3.0,632.6043,...,3.0,4.0,4.0,4.0,4.0,,5.0,4.0,3.0,2.0
83855,7708,2019,6,X,1486,3036,459,707.9254,4.0,400.2761,...,,,,,,,,,,
83856,18965,2019,6,B,1036,9913,36,522.8511,3.0,656.1601,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,8.0,1.0


# Analysis

## Utils

In [6]:
def print_statistics(columns, data=None):
    """Print missing-value percentages and return correlations with the scores.

    Parameters
    ----------
    columns : str or iterable of str
        Column label(s) to inspect. A single label may be passed as a plain
        string; any iterable of labels (list, tuple, Index, ...) is accepted.
    data : pandas.DataFrame, optional
        Frame to analyse. When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        Correlation matrix (``DataFrame.corr()`` defaults) of the requested
        columns together with the six score/level columns.

    Raises
    ------
    KeyError
        If a requested column or one of the score/level columns is absent
        from the frame.
    """
    frame = df if data is None else data

    # Normalise to a list: ``str + list`` and ``tuple + list`` raise, and
    # ``Index + list`` would attempt element-wise addition instead of
    # concatenating the labels.
    columns = [columns] if isinstance(columns, str) else list(columns)

    print("MISSING VALUES")
    # Share of missing entries per column, as a percentage of all rows.
    print(frame[columns].isna().sum() / frame.shape[0] * 100)

    score_columns = [
        "score_MAT",
        "level_MAT",
        "score_LEN",
        "level_LEN",
        "score_ING",
        "level_ING",
    ]
    # Skip score columns that were explicitly requested so the correlation
    # matrix never contains duplicated labels.
    extra_columns = [name for name in score_columns if name not in columns]

    return frame[columns + extra_columns].corr()

## gender

In [8]:
# Recode the numeric p2 answers into labels: 1 -> "MALE", 2 -> "FEMALE".
# Any other value (including a missing answer) becomes NaN.
col = "p2"

print("OLD")
print(df[col].value_counts())
print(f"Missing: {df[col].isna().sum()}")
print()

print("NEW")
new_df = df[col].map({1: "MALE", 2: "FEMALE"})
print(new_df.value_counts())
print(f"Missing: {new_df.isna().sum()}")
new_df

OLD
p2
2.0    34092
1.0    13829
Name: count, dtype: int64
Missing: 35936

NEW
p2
FEMALE    34092
MALE      13829
Name: count, dtype: int64
Missing: 35936


id_student
1           NaN
2           NaN
3           NaN
4        FEMALE
5        FEMALE
          ...  
83853    FEMALE
83854      MALE
83855       NaN
83856    FEMALE
83857      MALE
Name: p2, Length: 83857, dtype: object

## average_explanation_time

In [13]:
# Frequency of each p11 answer code; missing answers are left out of the
# counts (dropna=True is the pandas default, spelled out here).
df["p11"].value_counts(dropna=True)

p11
2.0    21525
1.0    10008
3.0      821
0.0      100
Name: count, dtype: int64