In [206]:
import pandas as pd
import matplotlib.pyplot as plt

class DataSet:
    """A train/test CSV pair with helpers for inspecting missing values in the training frame.

    The public method names (including the "statics" spelling) are kept as-is
    so existing notebook cells keep working.
    """

    def __init__(self, train=None, test=None):
        """Load the training CSV and, when given, the test CSV.

        Args:
            train: Path or file-like object accepted by ``pd.read_csv``. Required.
            test: Optional path or file-like object for the test CSV.

        Raises:
            ValueError: If ``train`` is ``None``.
        """
        if train is None:
            # pd.read_csv(None) also raises ValueError; this just gives a clearer message.
            raise ValueError("DataSet requires a path (or buffer) for the training CSV")
        # Training frame: every summary/plot method below reads this.
        self.df_train = pd.read_csv(train)
        # Test frame: loaded only when a source is supplied; None otherwise.
        self.df_test = pd.read_csv(test) if test is not None else None

    def _columns_with_prefix(self, attributes):
        """Return the training columns whose name starts with any of ``attributes``.

        A bare string is treated as a single prefix rather than being iterated
        character by character.
        """
        if isinstance(attributes, str):
            attributes = [attributes]
        prefixes = tuple(attributes)
        # str.startswith accepts a tuple; an empty tuple matches nothing.
        return [col for col in self.df_train.columns if col.startswith(prefixes)]

    @staticmethod
    def _print_describe_in_pairs(df):
        """Print ``df.describe()`` two columns at a time so wide frames stay readable."""
        describe_df = df.describe()
        for start in range(0, len(describe_df.columns), 2):
            print(describe_df.iloc[:, start:start + 2])

    def print_summary_statics(self):
        """Print ``describe()`` of the training frame, two columns per chunk."""
        self._print_describe_in_pairs(self.df_train)

    def plot_summary_statics(self):
        """Bar-plot the number of non-NA values in each training column."""
        self.show_plot(self.df_train.notna().sum())

    def print_summary_statics_without_notna(self, attributes):
        """Print ``describe()`` for the rows that have no NA in the selected columns.

        Args:
            attributes: Column-name prefix, or iterable of prefixes. Rows with an
                NA in any column matching a prefix are dropped first.
        """
        subset = self._columns_with_prefix(attributes)
        self._print_describe_in_pairs(self.df_train.dropna(subset=subset))

    def plot_summary_statics_without_notna(self, attributes):
        """Bar-plot per-column non-NA counts for rows with no NA in the selected columns.

        Args:
            attributes: Column-name prefix, or iterable of prefixes (see
                ``print_summary_statics_without_notna``).
        """
        subset = self._columns_with_prefix(attributes)
        self.show_plot(self.df_train.dropna(subset=subset).notna().sum())

    def plot_drop(self, attributes):
        """Bar-plot per-column non-NA counts after removing the selected columns.

        Args:
            attributes: Column-name prefix, or iterable of prefixes; every
                training column matching one of them is dropped before counting.
        """
        to_drop = self._columns_with_prefix(attributes)
        remaining = self.df_train.drop(columns=to_drop, errors='ignore')
        self.show_plot(remaining.notna().sum())

    def show_plot(self, df, title='Non-NA Counts per Column in Filtered DataFrame'):
        """Draw ``df`` (per-column counts) as a bar chart and show it.

        Args:
            df: pandas object with a ``.plot`` accessor; callers in this class
                pass the Series returned by ``notna().sum()``.
            title: Figure title. Defaults to the original fixed title.
        """
        # Plot on an explicit Axes so the bars land on the figure created here
        # instead of depending on pyplot's "current axes" state.
        fig, ax = plt.subplots(figsize=(10, 6))
        df.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
        ax.set_title(title)
        ax.set_xlabel('Columns')
        ax.set_ylabel('Non-NA Counts')
        ax.tick_params(axis='x', labelrotation=90, labelsize=10)
        ax.grid(axis='y', linestyle='--', alpha=0.7)
        fig.tight_layout()
        plt.show()

        



In [221]:
def data_clean(train, output='datasets/train_drop2.csv',
               drop_prefixes=('FGC', 'Fitness_Endurance')):
    """Build a cleaned training CSV restricted to rows that have an ``sii`` value.

    Steps:
      1. Keep only rows where ``sii`` is not NA.
      2. Merge the two PAQ variants into ``PAQ_Result`` / ``PAQ_Season``
         (the ``PAQ_A`` value wins; ``PAQ_C`` fills its gaps).
      3. Drop the waist-circumference column, the ``PCIAT-PCIAT_01`` ..
         ``PCIAT-PCIAT_20`` and ``PCIAT-PCIAT_Total`` columns, and the four
         original PAQ columns.
      4. Drop every remaining column whose name starts with one of
         ``drop_prefixes``.
      5. Write the result to ``output`` without the index.

    Args:
        train: Path or file-like object accepted by ``pd.read_csv``.
        output: Path or writable buffer passed to ``DataFrame.to_csv``.
            Defaults to the previously hard-coded ``'datasets/train_drop2.csv'``.
        drop_prefixes: Prefix, or iterable of prefixes, of extra columns to
            drop. Pass ``()`` or ``None`` to keep them. The default matches
            the previously hard-coded behaviour.

    Returns:
        None. The cleaned frame is only written to ``output``.

    Raises:
        KeyError: If ``sii`` or one of the four PAQ source columns is missing.
    """
    df_train = pd.read_csv(train)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of df_train.
    df_sii = df_train[df_train['sii'].notna()].copy()

    df_sii['PAQ_Result'] = df_sii['PAQ_A-PAQ_A_Total'].combine_first(df_sii['PAQ_C-PAQ_C_Total'])
    df_sii['PAQ_Season'] = df_sii['PAQ_A-Season'].combine_first(df_sii['PAQ_C-Season'])

    columns_to_drop = (
        ['Physical-Waist_Circumference']
        + [f'PCIAT-PCIAT_{i:02d}' for i in range(1, 21)]
        + ['PCIAT-PCIAT_Total']
        + ['PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total', 'PAQ_A-Season', 'PAQ_C-Season']
    )
    # errors='ignore': columns absent from the input are skipped, not an error.
    df_sii = df_sii.drop(columns=columns_to_drop, errors='ignore')

    if drop_prefixes is None:
        drop_prefixes = ()
    elif isinstance(drop_prefixes, str):
        # A bare string is one prefix, not a sequence of characters.
        drop_prefixes = (drop_prefixes,)
    prefixes = tuple(drop_prefixes)
    # str.startswith accepts a tuple; an empty tuple matches nothing.
    prefixed = [col for col in df_sii.columns if col.startswith(prefixes)]
    df_sii = df_sii.drop(columns=prefixed, errors='ignore')

    df_sii.to_csv(output, index=False)

# Run the cleaning step on the raw training CSV; data_clean writes its
# result to 'datasets/train_drop2.csv'.
data_clean('datasets/train.csv')

In [None]:
# Load the cleaned training set and show its per-column non-NA counts and describe() output.
# NOTE(review): 'datasets/train_clean.csv' is not written by any cell shown here
# (data_clean writes 'datasets/train_drop2.csv') — confirm where this file comes
# from, otherwise a fresh Restart & Run All will fail on this line.
data = DataSet('datasets/train_clean.csv', 'datasets/test.csv')
data.plot_summary_statics()
data.print_summary_statics()

In [None]:
# Same overview for 'datasets/train_drop2.csv', the file data_clean writes
# (FGC* and Fitness_Endurance* columns removed).
data2 = DataSet('datasets/train_drop2.csv', 'datasets/test.csv')
data2.plot_summary_statics()
data2.print_summary_statics()

In [None]:
# Keep only rows of `data` with no NA in any column whose name starts with
# 'Fitness_Endurance', then plot per-column non-NA counts and print describe().
attr = ['Fitness_Endurance']
data.plot_summary_statics_without_notna(attr)
data.print_summary_statics_without_notna(attr)

In [None]:
# Same inspection, restricted to rows with no NA in any 'FGC*' column.
attr = ['FGC']
data.plot_summary_statics_without_notna(attr)
data.print_summary_statics_without_notna(attr)

In [None]:
# Same inspection, restricted to rows with no NA in any 'BIA-BIA*' column.
attr = ['BIA-BIA']
data.plot_summary_statics_without_notna(attr)
data.print_summary_statics_without_notna(attr)

In [None]:
# Same inspection, restricted to rows with no NA in any 'Physical*' column.
attr = ['Physical']
data.plot_summary_statics_without_notna(attr)
data.print_summary_statics_without_notna(attr)

In [None]:
# Same inspection, restricted to rows with a non-NA 'PAQ_Result*' column.
# 'PAQ_Result' is the merged column that data_clean creates; presumably
# 'datasets/train_clean.csv' contains it — TODO confirm. If no column matches,
# the subset is empty and no rows are filtered out.
attr = ['PAQ_Result']
data.plot_summary_statics_without_notna(attr)
data.print_summary_statics_without_notna(attr)

In [None]:
# Same inspection, restricted to rows with no NA in any 'PreInt_EduHx*' column.
attr = ['PreInt_EduHx']
data.plot_summary_statics_without_notna(attr)
data.print_summary_statics_without_notna(attr)

In [None]:
# Same inspection, restricted to rows with no NA in any 'SDS-SDS*' column.
attr = ['SDS-SDS']
data.plot_summary_statics_without_notna(attr)
data.print_summary_statics_without_notna(attr)

In [None]:
# Plot per-column non-NA counts of `data` after removing every column whose
# name starts with 'Fitness_Endurance' or 'FGC' (rows are left untouched).
attr = ['Fitness_Endurance', 'FGC']
data.plot_drop(attr)