In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.manifold import TSNE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
import plotly.express as px

In [3]:
# Load data ## "sii" is the target variable
train_data = pd.read_csv("Z:/kaggle/child-mind-institute-problematic-internet-use/train.csv") # (3960, 82)
# Separate labeled and unlabeled data
labeled_train_data = train_data.dropna(subset=['sii']) # (2736, 82)
unlabeled_train_data = train_data[train_data['sii'].isna()] # (1224, 82)
X_labeled = labeled_train_data.drop('sii', axis=1) # (2736, 81)
y_labeled = labeled_train_data['sii'] # 2736,
X_unlabeled = unlabeled_train_data.drop('sii', axis=1) # (1224, 81)

In [4]:
#data_imputed_df.shape
train_data["sii"].isna().sum() # sii is missed in 1224 out of 3960 samples

1224

In [5]:
missing_values_count = X_labeled.isna().sum().sort_values(ascending=False)
total_missing = missing_values_count.sum()
print("\nTotal number of missing values in all columns:", total_missing)
print("Number of missing values in each column:")
print(missing_values_count)


Total number of missing values in all columns: 51809
Number of missing values in each column:
PAQ_A-PAQ_A_Total               2373
PAQ_A-Season                    2373
Physical-Waist_Circumference    2253
Fitness_Endurance-Time_Sec      2008
Fitness_Endurance-Time_Mins     2008
                                ... 
Basic_Demos-Age                    0
PCIAT-Season                       0
Basic_Demos-Enroll_Season          0
PCIAT-PCIAT_Total                  0
id                                 0
Length: 81, dtype: int64


In [6]:
threshold = 0.8 * len(X_labeled)

# Filter out columns with more than 80% missing values
X_labeled_reduced = X_labeled.loc[:, X_labeled.isna().sum() <= threshold] # (2736, 78)

missing_values_count = X_labeled_reduced.isna().sum().sort_values(ascending=False)
total_missing = missing_values_count.sum()
print("\nTotal number of missing values in all columns:", total_missing)

print("Dataframe after removing columns with more than 80% missing values:") # 3 columns were removed (1 of which categorical)
print(X_labeled_reduced)


Total number of missing values in all columns: 44810
Dataframe after removing columns with more than 80% missing values:
            id Basic_Demos-Enroll_Season  Basic_Demos-Age  Basic_Demos-Sex  \
0     00008ff9                      Fall                5                0   
1     000fd460                    Summer                9                0   
2     00105258                    Summer               10                1   
3     00115b9f                    Winter                9                0   
5     001f3379                    Spring               13                1   
...        ...                       ...              ...              ...   
3953  ff6c2bb8                      Fall                8                0   
3954  ff759544                    Summer                7                1   
3955  ff8a2de4                      Fall               13                0   
3957  ffcd4dbd                      Fall               11                0   
3958  ffed1dd5      

In [10]:
print(cat_features)

['id', 'Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_A-Season', 'PAQ_C-Season', 'PCIAT-Season', 'SDS-Season', 'PreInt_EduHx-Season']


In [13]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Separate numerical and categorical features
num_features = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = train_data.select_dtypes(include=['object', 'category']).columns.tolist()

# Preprocessing with OneHotEncoder and StandardScaler
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop=None), cat_features)
    ]
)

# Create a pipeline ##  i swapped imputer line with preprocessor line it to see if it works or not, but it did not work (can not convert string to float error)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', KNNImputer(n_neighbors=5, keep_empty_features=True))
])

data_imputed = pipeline.fit_transform(train_data)  

new_column_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

data_imputed_df = pd.DataFrame(data_imputed, columns=new_column_names) # (3960, 4084)
print("Imputed data:")
print(data_imputed_df.head())

Imputed data:
   num__Basic_Demos-Age  num__Basic_Demos-Sex  num__CGAS-CGAS_Score  \
0             -1.520226             -0.770846             -0.647115   
1             -0.401093             -0.770846             -0.020359   
2             -0.121310              1.297277              0.248250   
3             -0.401093             -0.770846              0.248250   
4              2.116955              1.297277             -0.512810   

   num__Physical-BMI  num__Physical-Height  num__Physical-Weight  \
0          -0.480065             -1.331104             -0.858103   
1          -1.035839             -1.063457             -0.965819   
2          -0.524777              0.074043             -0.301573   
3          -0.203318              0.007131             -0.166928   
4           2.257937              0.901072              1.922754   

   num__Physical-Waist_Circumference  num__Physical-Diastolic_BP  \
0                          -0.643132                   -0.238737   
1             

In [18]:
print(new_column_names)
type(new_column_names)

['num__Basic_Demos-Age' 'num__Basic_Demos-Sex' 'num__CGAS-CGAS_Score' ...
 'cat__PreInt_EduHx-Season_Summer' 'cat__PreInt_EduHx-Season_Winter'
 'cat__PreInt_EduHx-Season_nan']


numpy.ndarray

In [7]:
missing_values_count = data_imputed_df.isna().sum().sort_values(ascending=False) # check, no more missing values in train data
total_missing = missing_values_count.sum()
print("\nTotal number of missing values in all columns:", total_missing)


Total number of missing values in all columns: 0


In [8]:
#data_imputed.head()
columns_list = data_imputed_df.columns.tolist()
print("List of all columns:", columns_list)
"sii" in columns_list

List of all columns: ['num__Basic_Demos-Age', 'num__Basic_Demos-Sex', 'num__CGAS-CGAS_Score', 'num__Physical-BMI', 'num__Physical-Height', 'num__Physical-Weight', 'num__Physical-Waist_Circumference', 'num__Physical-Diastolic_BP', 'num__Physical-HeartRate', 'num__Physical-Systolic_BP', 'num__Fitness_Endurance-Max_Stage', 'num__Fitness_Endurance-Time_Mins', 'num__Fitness_Endurance-Time_Sec', 'num__FGC-FGC_CU', 'num__FGC-FGC_CU_Zone', 'num__FGC-FGC_GSND', 'num__FGC-FGC_GSND_Zone', 'num__FGC-FGC_GSD', 'num__FGC-FGC_GSD_Zone', 'num__FGC-FGC_PU', 'num__FGC-FGC_PU_Zone', 'num__FGC-FGC_SRL', 'num__FGC-FGC_SRL_Zone', 'num__FGC-FGC_SRR', 'num__FGC-FGC_SRR_Zone', 'num__FGC-FGC_TL', 'num__FGC-FGC_TL_Zone', 'num__BIA-BIA_Activity_Level_num', 'num__BIA-BIA_BMC', 'num__BIA-BIA_BMI', 'num__BIA-BIA_BMR', 'num__BIA-BIA_DEE', 'num__BIA-BIA_ECW', 'num__BIA-BIA_FFM', 'num__BIA-BIA_FFMI', 'num__BIA-BIA_FMI', 'num__BIA-BIA_Fat', 'num__BIA-BIA_Frame_num', 'num__BIA-BIA_ICW', 'num__BIA-BIA_LDM', 'num__BIA-BIA_

False