In [1]:
import pandas as pd
import numpy as np
import os
from src.pre_processing.macros import (
    DATA_PATH,
    DATA_PREPROC_PATH,
    ORIGINAL_DATASET_NAME,
)


## Missing values in preprocessed data

In [2]:
# Load the preprocessed table written to DATA_PREPROC_PATH as "merged.csv".
# low_memory=False makes pandas read the file in one pass instead of in
# chunks, so column dtypes are inferred from the full column.
df_merged = pd.read_csv(
    os.path.join(DATA_PREPROC_PATH, "merged.csv"),
    low_memory=False,
)
df_merged

Unnamed: 0,id_student,gender_x,repeat,skip,homeworks,a13e,a17h,frequency_of_computer,frequency_of_internet,work_with_teachers,...,agreement_of_work_facilitated_by_management,extent_of_positive_relationships,number_of_special_attention_students,extent_of_student_involvement_during_class,extent_of_teaching_methods_variety,agreement_of_opinion_on_school,agreement_of_class_behaviour,extent_of_resource_variety,extent_of_good_work_by_non_teachers,number_of_individual_training_topics
0,1,2.0,1.0,4.0,0.0,4.0,3.0,1.333333,1.333333,2.916667,...,,,,,,,,,,
1,2,1.0,3.0,4.0,3.0,4.0,4.0,2.000000,3.000000,2.833333,...,,,,,,,,,,
2,3,2.0,1.0,4.0,2.0,,3.0,3.000000,3.000000,3.666667,...,,,,,,,,,,
3,4,1.0,1.0,4.0,3.0,4.0,3.0,2.000000,1.833333,2.333333,...,3.000000,2.857143,1.000000,2.666667,3.000000,3.50,2.75,2.375,2.5,
4,5,2.0,1.0,4.0,1.0,4.0,4.0,1.000000,1.333333,2.750000,...,3.857143,36.857143,0.400000,3.000000,2.857143,4.00,3.50,2.875,3.8,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80146,83853,2.0,2.0,4.0,2.0,4.0,4.0,4.000000,2.714286,3.300000,...,3.857143,30.827586,1.166667,2.666667,3.142857,3.75,2.50,2.250,4.0,1.0
80147,83854,2.0,1.0,4.0,4.0,2.0,2.0,1.333333,1.571429,2.700000,...,3.285714,9.896552,0.000000,3.000000,3.142857,3.75,3.00,2.500,4.0,1.0
80148,83855,1.0,1.0,4.0,4.0,3.0,3.0,1.666667,3.000000,3.700000,...,,,,,,,,,,
80149,83856,2.0,1.0,4.0,2.0,4.0,3.0,1.333333,1.857143,2.900000,...,3.000000,4.689655,0.500000,3.000000,3.000000,3.00,3.00,2.375,3.0,2.0


In [3]:
# init ranges
# Each tuple is a (lower, upper) bound on the fraction of missing values in a
# column; the counter dict is keyed by the tuple's string form and starts at 0.
ranges = [
    (0.0, 0.2),
    (0.2, 0.4),
    (0.4, 0.6),
    (0.6, 1.0),
]
range_count = dict.fromkeys(map(str, ranges), 0)
range_count

{'(0.0, 0.2)': 0, '(0.2, 0.4)': 0, '(0.4, 0.6)': 0, '(0.6, 1.0)': 0}

In [4]:
# update ranges
# Count, for every column of df_merged, which missing-value range it falls in.
#
# The counters are rebuilt from zero here so that re-executing this cell does
# not add a second round of counts on top of a previous run.
range_count = {str(bounds): 0 for bounds in ranges}

# Fraction of NaN cells per column (NaN count divided by the number of rows).
nan_fractions = df_merged.isna().sum() / df_merged.shape[0]

for nan_fraction in nan_fractions:
    # Ranges are half-open: lower bound inclusive, upper bound exclusive.
    # The loop variable is deliberately not called `range`, which would shadow
    # the builtin for every later cell of the notebook.
    for bounds in ranges:
        if bounds[0] <= nan_fraction < bounds[1]:
            range_count[str(bounds)] += 1
    # A column that is entirely NaN (fraction == 1.0) matches no half-open
    # range, so it is added to the last range. This check must sit outside the
    # inner loop: inside it, the column would be counted once per range.
    if nan_fraction == 1.0:
        range_count[str(ranges[-1])] += 1
range_count

{'(0.0, 0.2)': 38, '(0.2, 0.4)': 51, '(0.4, 0.6)': 4, '(0.6, 1.0)': 1}

## Comparison with non-preprocessed data

In [5]:
# Load the original dataset (file ORIGINAL_DATASET_NAME under DATA_PATH) so it
# can be compared with the preprocessed table.
df_orig = pd.read_csv(
    os.path.join(DATA_PATH, ORIGINAL_DATASET_NAME),
    low_memory=False,
)

In [6]:
# Show the frame read from ORIGINAL_DATASET_NAME as this cell's output.
df_orig

Unnamed: 0,id_student,id_student_original,id_year,id_grade,id_class_group,id_school,id_student_16_19,id_school_16_19,student_questionnaire,principals_questionnaire,...,p331a,p331b,p331c,p331d,p331e,p331f,p331g,p331j,pfc,rep
0,1,33613.0,2016,3,A,2415.0,14374.0,569.0,1,1,...,,,,,,,,,,
1,2,19294.0,2016,3,A,1842.0,8001.0,273.0,1,1,...,,,,,,,,,,
2,3,19587.0,2016,3,,1432.0,8142.0,82.0,1,1,...,,,,,,,,,,
3,4,29985.0,2016,3,A,2280.0,12800.0,505.0,1,1,...,2.0,2.0,3.0,2.0,3.0,3.0,,,,
4,5,6982.0,2016,3,A,2040.0,2606.0,390.0,1,1,...,4.0,,4.0,4.0,4.0,3.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83852,83853,2500.0,2019,6,B,1278.0,,271.0,1,1,...,4.0,4.0,4.0,4.0,4.0,,5.0,3.0,10.0,2.0
83853,83854,5784.0,2019,6,A,1079.0,,78.0,1,1,...,3.0,4.0,4.0,4.0,4.0,,5.0,4.0,3.0,2.0
83854,83855,7708.0,2019,6,X,1486.0,3036.0,459.0,1,1,...,,,,,,,,,,
83855,83856,18965.0,2019,6,B,1036.0,9913.0,36.0,1,1,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,8.0,1.0


In [11]:
# Total number of missing cells in each frame: the first .sum() counts NaNs
# per column, the second adds those per-column counts together.
nans_orig = df_orig.isna().sum().sum()
nans_merged = df_merged.isna().sum().sum()
nans_orig, nans_merged

(21225226, 1786454)

In [21]:
# Number of cells (rows * columns) in each frame.
size_orig = df_orig.size
size_merged = df_merged.size

# All three figures are scaled by 100, i.e. they are printed as percentages.
reduction_pct = ((size_orig - size_merged) / size_orig) * 100
missing_before_pct = (nans_orig / size_orig) * 100
missing_after_pct = (nans_merged / size_merged) * 100

print(f"Amount of data reduction: {reduction_pct}")
print(f"Proportion of missing values before preprocessing: {missing_before_pct}")
print(f"Proportion of missing values after preprocessing: {missing_after_pct}")

Amount of data reduction: 83.98471704344658
Proportion of missing values before preprocessing: 45.118031232908876
Proportion of missing values after preprocessing: 23.711282188910985
