#### Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, f1_score, roc_curve, confusion_matrix
from imblearn.over_sampling import SMOTE  # for handling imbalance if needed

import warnings
warnings.filterwarnings('ignore')

In [3]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
df = pd.read_csv('/content/drive/MyDrive/AML/Capstone Group Project/ihm_aki.csv')

In [5]:

# Basic information about the dataset
print("=== DATASET SHAPE ===")
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
print("\n")

print("=== FIRST 5 ROWS ===")
print(df.head())
print("\n")

print("=== LAST 5 ROWS ===")
print(df.tail())
print("\n")

print("=== COLUMN NAMES ===")
print(df.columns.tolist())
print("\n")

=== DATASET SHAPE ===
Rows: 3550, Columns: 36


=== FIRST 5 ROWS ===
   Unnamed: 0    age  gender_F  gender_M  bic_max  bic_mean  bic_min  \
0           0  74.63         1         0     40.0     34.62     30.0   
1           1  60.12         1         0     34.0     28.94     24.0   
2           2  64.12         1         0     26.0     24.07     21.0   
3           4  54.46         0         1     34.0     30.98     26.0   
4           5  78.22         0         1     29.6     23.10     18.0   

   bilirubin  bp_max  bp_mean  ...  pot_mean  pot_min  sod_max  sod_mean  \
0        0.4  154.63   123.28  ...      3.85     3.30   143.50    141.50   
1        0.2  113.12   104.68  ...      3.76     2.75   145.00    141.21   
2        0.3  126.62   108.91  ...      3.86     3.50   145.00    140.86   
3        1.0  151.38   114.38  ...      4.17     3.60   147.50    140.43   
4        1.0  166.26   144.62  ...      4.10     3.40   150.25    141.22   

   sod_min   temp  wbc_max  wbc_mean  wbc

In [6]:
print("=== DATA TYPES ===")
print(df.dtypes)
print("\n")

print("=== BASIC INFO ===")
df.info()

=== DATA TYPES ===
Unnamed: 0       int64
age            float64
gender_F         int64
gender_M         int64
bic_max        float64
bic_mean       float64
bic_min        float64
bilirubin      float64
bp_max         float64
bp_mean        float64
bp_min         float64
bun_max        float64
bun_mean       float64
bun_min        float64
Days_in_uci    float64
fio2           float64
gcs_max        float64
gcs_mean       float64
gcs_min        float64
hr_max         float64
hr_mean        float64
hr_min         float64
max pao2       float64
mean pao2      float64
min pao2       float64
pot_max        float64
pot_mean       float64
pot_min        float64
sod_max        float64
sod_mean       float64
sod_min        float64
temp           float64
wbc_max        float64
wbc_mean       float64
wbc_min        float64
IHM              int64
dtype: object


=== BASIC INFO ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3550 entries, 0 to 3549
Data columns (total 36 columns):
 #   Column

In [7]:
print("=== MISSING VALUES ===")
missing_data = df.isnull().sum()
missing_percent = (df.isnull().sum() / len(df)) * 100
missing_info = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage': missing_percent
})
print(missing_info[missing_info['Missing Count'] > 0])
print("\n")

print("=== TOTAL MISSING VALUES ===")
print(f"Total missing values in dataset: {df.isnull().sum().sum()}")

=== MISSING VALUES ===
           Missing Count  Missing Percentage
bilirubin            657           18.507042
fio2                 398           11.211268
max pao2             133            3.746479
mean pao2            133            3.746479
min pao2             133            3.746479
temp                1225           34.507042


=== TOTAL MISSING VALUES ===
Total missing values in dataset: 2679


In [8]:
print("=== DESCRIPTIVE STATISTICS FOR NUMERICAL VARIABLES ===")
print(df.describe())

=== DESCRIPTIVE STATISTICS FOR NUMERICAL VARIABLES ===
        Unnamed: 0          age     gender_F     gender_M      bic_max  \
count  3550.000000  3550.000000  3550.000000  3550.000000  3550.000000   
mean   2220.038028    64.286620     0.424507     0.575493    30.055363   
std    1279.998208    15.986639     0.494338     0.494338     5.032318   
min       0.000000    16.900000     0.000000     0.000000    14.500000   
25%    1108.250000    54.112500     0.000000     0.000000    27.000000   
50%    2220.500000    66.920000     0.000000     1.000000    30.000000   
75%    3335.750000    77.207500     1.000000     1.000000    33.000000   
max    4430.000000    89.000000     1.000000     1.000000    50.000000   

          bic_mean      bic_min    bilirubin       bp_max      bp_mean  ...  \
count  3550.000000  3550.000000  2893.000000  3550.000000  3550.000000  ...   
mean     25.606231    20.895972     1.586049   140.153299   122.734980  ...   
std       4.147980     4.388776     3.168