In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px 
import seaborn as sns
import unidecode, re
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report, roc_auc_score
from scipy import *
from sklearn.feature_selection import SelectKBest, f_regression
from catboost import CatBoostClassifier
import lightgbm as lgb
import joblib
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

my_font = dict(
            family="Courier New, monospace",
            size=18,
            color="Black",
            variant="small-caps",
) 

df = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")


In [2]:
import subprocess

result = subprocess.run(
    ["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
)

if result.returncode == 0:
    print("GPU available ✅")
    print(result.stdout.splitlines()[0])  # prints driver/CUDA version line
else:
    print("GPU not available ❌")
    print(result.stderr)


GPU available ✅
Sun Sep 21 12:10:10 2025       


In [2]:
x = df.isna().sum().reset_index(name='count').sort_values(by='count', ascending=False)
x['pecentage_of_missing_data'] = (x['count'] / len(df))*100
x

Unnamed: 0,index,count,pecentage_of_missing_data
16,OBES_IMC,483795,97.085206
10,PUERPERA,298765,59.954447
18,SIND_DOWN,298085,59.817988
19,HEPATICA,297733,59.747351
13,IMUNODEPRE,295546,59.308476
14,RENAL,292734,58.74418
12,PNEUMOPATI,292402,58.677557
15,OBESIDADE,290862,58.368518
27,FADIGA,268807,53.942647
11,DIABETES,252391,50.648379


## Correlation Matrix

In [3]:
# plt.figure(figsize=(10, 10))
# sns.heatmap(df.corr(numeric_only=True), annot=True, fmt='.1f')

In [28]:
# let us start dropping unecessary stuff
from category_encoders import TargetEncoder

class dataset_cleaner:
    def __init__(self, dataframe):
        self.df = dataframe.copy()
    def dropping_unknown_values_at_rows(self, col, target_value):
        self.df = self.df[self.df[col] != target_value]
        return self.df
    def dropping_useless_columns(self, col_list):
        return self.df.drop(labels=col_list, axis=1)
    def filling_unknown(self, value=9):
        return self.df.fillna(9)


        
df_train_cleaned = df.copy()
df_test_cleaned = df_test.copy()
instance1 = dataset_cleaner(df_train_cleaned)
instance2 = dataset_cleaner(df_test_cleaned)

# dropping CS_SEXO == I. what is even this?? 

df_train_cleaned = instance1.dropping_unknown_values_at_rows('CS_SEXO', 'I')
df_test_cleaned  = instance2.dropping_unknown_values_at_rows('CS_SEXO', 'I')

# dropping useless variables

df_train_cleaned = instance1.dropping_useless_columns(["DT_NOTIFIC", "ID_MN_RESI","OBES_IMC", "CO_RG_RESI"])
df_test_cleaned = instance2.dropping_useless_columns(["DT_NOTIFIC", "ID_MN_RESI","OBES_IMC", "CO_RG_RESI"])

# checking if there is any male with missing PUERPERA - tf is this?? lol 

condition_tr = ((df_train_cleaned['CS_SEXO'] == 'M') & (df_train_cleaned['PUERPERA'].isnull()))
condition_te = ((df_test_cleaned['CS_SEXO'] == 'M') & (df_test_cleaned['PUERPERA'].isnull()))
df_train_cleaned.loc[condition_tr, 'PUERPERA'] = 2
df_test_cleaned.loc[condition_te, 'PUERPERA'] = 2

# Imputing NaN with 9, which is the same as "Unknown"

# df_train_cleaned = instance1.filling_unknown()
# df_test_cleaned = instance2.filling_unknown()


# Input CS_with one hot and input SG_UF with target encoding 

df_encoded_sexo = pd.get_dummies(df_train_cleaned, columns=['CS_SEXO'], drop_first=True)

encoder = TargetEncoder(cols=['SG_UF'])
df_final = encoder.fit_transform(df_encoded_sexo, df_encoded_sexo['EVOLUCAO'])

# imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
    
#     # The imputer returns a numpy array, so we'll convert it back to a DataFrame
# df_imputed_array = imputer.fit_transform(df_final)
# df_imputed = pd.DataFrame(df_imputed_array, columns=df.columns)

# df_final.info()



In [None]:

from sklearn.impute import KNNImputer

df_sample = df_final.sample(frac=0.3, random_state=42)


imputer = KNNImputer(n_neighbors=3, weights='uniform')
    
df_imputed_array = imputer.fit_transform(df_sample)


import warnings
warnings.filterwarnings("ignore", message=".*Falling back to prediction using DMatrix.*")

df_imputed = pd.DataFrame(df_imputed_array, columns=df_final.columns)

In [24]:
df_imputed

Unnamed: 0,CS_ZONA,NU_IDADE_N,CS_ESCOL_N,CS_RACA,SG_UF,CS_GESTANT,PUERPERA,DIABETES,PNEUMOPATI,IMUNODEPRE,...,GARGANTA,DESC_RESP,DIARREIA,VOMITO,FADIGA,SATURACAO,DISPNEIA,VACINA,EVOLUCAO,CS_SEXO_M
0,1.000000,87.0,10.394897,9.000000,0.355742,9.0,2.000000,2.000000,2.000000,2.000000,...,2.000000,1.000000,2.000000,2.000000,1.000000,1.000000,1.000000,6.779441,1.0,0.0
1,1.000000,60.0,2.052200,4.000000,0.502853,6.0,2.000000,2.000000,1.000000,2.000000,...,2.000000,1.000000,2.000000,2.000000,1.000000,1.000000,1.000000,3.428367,1.0,1.0
2,0.938047,47.0,9.000000,1.000000,0.355742,6.0,2.000000,1.488548,0.777669,1.208486,...,2.000000,1.000000,2.000000,2.000000,2.000000,1.000000,2.000000,4.537941,0.0,1.0
3,1.000000,58.0,6.815295,6.811543,0.407825,6.0,2.000000,1.000000,1.351343,1.069777,...,0.869474,1.056120,1.064988,0.874789,1.308874,1.000000,0.963201,10.099316,0.0,1.0
4,1.000000,58.0,7.122735,9.000000,0.277765,6.0,2.000000,0.936476,1.079358,1.048597,...,0.977372,1.090853,1.011304,0.975477,1.046827,1.370611,1.000000,7.114674,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149468,0.883305,51.0,2.000000,4.000000,0.337768,6.0,2.000000,0.966447,1.497803,1.100738,...,0.967118,1.000000,0.951068,0.714186,1.043605,0.874058,1.018128,9.000000,1.0,1.0
149469,1.000000,76.0,0.000000,1.000000,0.277765,5.0,2.000000,2.000000,2.000000,2.000000,...,2.000000,1.000000,2.000000,2.000000,2.000000,1.000000,1.000000,1.000000,1.0,0.0
149470,1.000000,58.0,1.000000,4.000000,0.304550,6.0,2.138098,0.987470,0.869311,0.835684,...,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,9.000000,0.0,0.0
149471,1.052597,71.0,1.000000,1.000000,0.277765,5.0,2.000000,1.000000,2.000000,2.000000,...,2.000000,1.000000,2.000000,2.000000,2.000000,1.000000,1.000000,5.280082,1.0,0.0


In [23]:

df_imputed.to_csv('df_imputed_30.csv', index=False)

## Quick insights

In [None]:
df_imputed

In [None]:
#########################  age of those who died #########################
px.box(df_part_clean.groupby(['CS_SEXO', 'NU_IDADE_N', 'EVOLUCAO']).size().reset_index(), x='CS_SEXO', y='NU_IDADE_N', color='EVOLUCAO')
#########################  most common commorbities of those who died ####
df_part_clean.groupby('EVOLUCAO').size()