In this file we perform exploratory data analysis (EDA) on the CSV labels for each child.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# Information about Dataset

In [None]:
# Read the label table from disk into the working DataFrame
labels_csv = 'Anthrovision/anthrovision_labels.csv'
df = pd.read_csv(labels_csv)

# Preview the top rows (rendered as the cell output)
df.head()

In [None]:
# Structural overview: size, column names, null counts, dtypes, summary stats
overview = (
    ("Dataset shape:", df.shape),
    ("\nColumns in dataset:", df.columns.tolist()),
    ("\nMissing values:\n", df.isnull().sum()),
    ("\nData types:\n", df.dtypes),
    ("\nStatistical summary:\n", df.describe()),
)
for heading, value in overview:
    print(heading, value)

# Columns that get a histogram and take part in the correlation matrix
numeric_cols = [
    'Height', 'Weight', 'MUAC', 'HC', 'Age', 'BMI',
    'BMIz_who', 'wfa_zscore', 'hfa_zscore',
    'target_bmi', 'target_bmizscore',
]

# One histogram (with KDE overlay) per column on a 4x3 grid;
# missing values are dropped per column before plotting
plt.figure(figsize=(15, 10))
for position, column in enumerate(numeric_cols, start=1):
    plt.subplot(4, 3, position)
    sns.histplot(df[column].dropna(), kde=True)
    plt.title(column)
plt.tight_layout()
plt.show()

# Pairwise correlations between the same columns, shown as an annotated heatmap
corr_matrix = df[numeric_cols].corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation between numeric features")
plt.show()


# Information about Classification Variables

In [None]:
# Class balance of the two label columns, plus their breakdown by Age and Gender.

# Binary class counts
binary_counts = df['binary_label'].value_counts()
print("Binary class counts:\n", binary_counts)

# Bar plot for binary classes (single colour, so no hue mapping is needed)
plt.figure(figsize=(5, 4))
sns.countplot(data=df, x='binary_label', color='blue')
plt.title("Binary Class Distribution")
plt.show()

# Multiclass counts
multi_counts = df['multiclass_label'].value_counts()
print("\nMulticlass counts:\n", multi_counts)

# Bar plot for multiclass; bars follow the order of multi_counts.index
plt.figure(figsize=(6, 4))
sns.countplot(
    data=df,
    x='multiclass_label',
    order=multi_counts.index,
    color='yellow',
)
plt.title("Multiclass Label Distribution")
plt.xticks(rotation=45)
plt.show()

# Age distribution per binary class.
# seaborn >= 0.13 deprecates `palette` without `hue`; mapping hue to the x
# variable (with the legend suppressed) gives one palette colour per box
# without triggering the FutureWarning.
plt.figure(figsize=(7, 5))
sns.boxplot(
    data=df,
    x='binary_label',
    y='Age',
    hue='binary_label',
    palette='Set2',
    legend=False,
)
plt.title("Age Distribution per Binary Class")
plt.show()

# Age distribution per multiclass category.
# hue_order mirrors `order` so colours are assigned in the displayed box order.
plt.figure(figsize=(8, 5))
sns.boxplot(
    data=df,
    x='multiclass_label',
    y='Age',
    order=multi_counts.index,
    hue='multiclass_label',
    hue_order=multi_counts.index,
    palette='Set3',
    legend=False,
)
plt.title("Age Distribution per Multiclass Category")
plt.xticks(rotation=45)
plt.show()

# Gender distribution per binary class (rows: label, columns: gender)
gender_binary = pd.crosstab(df['binary_label'], df['Gender'])
print("\nGender distribution per binary class:\n", gender_binary)

# Stacked bars: one bar per label, segments per gender
gender_binary.plot(
    kind='bar',
    stacked=True,
    figsize=(6, 4),
    colormap='Set2',
)
plt.title("Gender Distribution per Binary Class")
plt.ylabel("Count")
plt.show()

# Gender distribution per multiclass category
gender_multi = pd.crosstab(df['multiclass_label'], df['Gender'])
print("\nGender distribution per multiclass category:\n", gender_multi)

# Stacked bars: one bar per category, segments per gender
gender_multi.plot(
    kind='bar',
    stacked=True,
    figsize=(8, 5),
    colormap='Set3',
)
plt.title("Gender Distribution per Multiclass Category")
plt.ylabel("Count")
plt.show()

# Encoding Binary Variables

In [None]:
# Gender -> integer codes; this encoder is not kept under `le`,
# only the encoded column is stored
gender_encoder = LabelEncoder()
df['Gender_encoded'] = gender_encoder.fit_transform(df['Gender'])

# binary_label strings -> integer codes; `le` is kept so the mapping can be inspected
le = LabelEncoder()
binary_codes = le.fit_transform(df['binary_label'])
df['binary_label_encoded'] = binary_codes

# Show which integer code each original class string received
class_to_code = dict(zip(le.classes_, le.transform(le.classes_)))
print(class_to_code)

# Regression Baseline (Predict BMI)

In [None]:
# Baseline regressors predicting BMI from MUAC, HC, Age and encoded Gender.
reg_features = ['MUAC', 'HC', 'Age', 'Gender_encoded']
reg_target = 'BMI'

X_reg = df[reg_features]
y_reg = df[reg_target]
# NOTE(review): rows with a missing BMI target are not removed here -
# confirm the target column has no NaNs before fitting.

# Train-test split FIRST, so that every fitted preprocessing step
# (imputer, scaler) only ever sees training rows.
X_train_r_raw, X_test_r_raw, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Handle missing values by filling with the mean of the TRAINING split;
# the same training means are then applied to the test split.
imputer = SimpleImputer(strategy='mean')
X_train_r = imputer.fit_transform(X_train_r_raw)
X_test_r = imputer.transform(X_test_r_raw)

# Scaling (fitted on the training split only)
scaler = StandardScaler()
X_train_r_scaled = scaler.fit_transform(X_train_r)
X_test_r_scaled = scaler.transform(X_test_r)

# Linear Regression on the scaled features
lr_model = LinearRegression()
lr_model.fit(X_train_r_scaled, y_train_r)

# Predictions
y_pred_r = lr_model.predict(X_test_r_scaled)

# Metrics
print("Linear Regression R^2 Score:", r2_score(y_test_r, y_pred_r))
print("Linear Regression RMSE:", root_mean_squared_error(y_test_r, y_pred_r))

# Random Forest Regression on the imputed but unscaled features
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train_r, y_train_r)
y_pred_rf = rf_reg.predict(X_test_r)
print("Random Forest R^2 Score:", r2_score(y_test_r, y_pred_rf))
print("Random Forest RMSE:", root_mean_squared_error(y_test_r, y_pred_rf))

# Classification Baseline (Malnutrition binary)

In [None]:

# Baseline classifiers for the binary label, followed by tree-model feature importances.

# Features and target
clf_features = ['Height', 'Weight', 'MUAC', 'HC', 'Age', 'Gender_encoded']
clf_target = 'binary_label'

X_clf = df[clf_features]
y_clf = df[clf_target]

# Train-test split FIRST, so the imputer and scaler are fitted on training rows only
X_train_c_raw, X_test_c_raw, y_train_c, y_test_c = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42
)

# Handle missing values by filling with the mean of the training split
imputer = SimpleImputer(strategy='mean')
X_train_c = imputer.fit_transform(X_train_c_raw)
X_test_c = imputer.transform(X_test_c_raw)

# Standardised copies for Logistic Regression, matching the multiclass baseline.
# A separate name is used so the regression cell's `scaler` is not overwritten.
scaler_c = StandardScaler()
X_train_c_scaled = scaler_c.fit_transform(X_train_c)
X_test_c_scaled = scaler_c.transform(X_test_c)

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Train, predict, evaluate
for name, model in models.items():
    print(f"\n======== {name} ========")

    # Only Logistic Regression gets the scaled inputs; tree models use raw values
    if name == "Logistic Regression":
        X_fit, X_eval = X_train_c_scaled, X_test_c_scaled
    else:
        X_fit, X_eval = X_train_c, X_test_c

    model.fit(X_fit, y_train_c)
    y_pred = model.predict(X_eval)

    acc = accuracy_score(y_test_c, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print("\nClassification Report:\n", classification_report(y_test_c, y_pred))

    # Confusion Matrix; rows/columns are pinned to model.classes_ so the
    # tick labels on the heatmap name the actual classes
    class_names = model.classes_
    cm = confusion_matrix(y_test_c, y_pred, labels=class_names)
    plt.figure(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f"Confusion Matrix - {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# Feature Importance :
tree_models = ['Random Forest', 'Gradient Boosting']
for name in tree_models:
    model = models[name]
    importances = model.feature_importances_

    # Feature names paired with importance values, sorted descending.
    # The same sorted table drives both the printout and the bar chart,
    # so the two always agree on ordering.
    feat_df = (
        pd.DataFrame({'Feature': clf_features, 'Importance': importances})
        .sort_values(by='Importance', ascending=False)
        .reset_index(drop=True)
    )

    print(f"\nFeature Importances - {name}:\n")
    print(feat_df)

    positions = range(len(feat_df))
    plt.figure(figsize=(8, 5))
    plt.title(f"Feature Importance - {name}")
    plt.bar(positions, feat_df['Importance'], color='skyblue')
    plt.xticks(positions, feat_df['Feature'], rotation=45)
    plt.ylabel("Importance")
    plt.show()

# Multiclass Classification Baseline

In [None]:
# Baseline classifiers for the multiclass label, followed by tree-model feature importances.

# Features and target
clf_features = ['Height', 'Weight', 'MUAC', 'HC', 'Age', 'Gender_encoded']
clf_target = 'multiclass_label'

X_clf = df[clf_features]

# Encode multiclass target as integer codes
le_multi = LabelEncoder()
y_clf_encoded = le_multi.fit_transform(df[clf_target])

# Train-test split FIRST, so the imputer and scaler are fitted on training rows only
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_clf, y_clf_encoded, test_size=0.2, random_state=42
)

# Handle missing values by replacing with the mean of the training split
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Scale the features, as Logistic Regression works better on standardised inputs
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=5000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Integer codes for every class, passed to classification_report together
# with target_names so the report still works when a class is missing
# from the test split.
all_class_codes = np.arange(len(le_multi.classes_))

# Train, predict, evaluate
for name, model in models.items():
    print(f"\n==== {name} ====")

    # Scale input only for Logistic Regression
    if name == "Logistic Regression":
        X_fit, X_eval = X_train_scaled, X_test_scaled
    else:
        X_fit, X_eval = X_train, X_test

    model.fit(X_fit, y_train)
    y_pred = model.predict(X_eval)

    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print(
        "\nClassification Report:\n",
        classification_report(
            y_test,
            y_pred,
            labels=all_class_codes,
            target_names=le_multi.classes_,
        ),
    )

# Feature Importance :
tree_models = ['Random Forest', 'Gradient Boosting']
for name in tree_models:
    model = models[name]
    importances = model.feature_importances_

    # Feature names paired with importance values, sorted descending.
    # The same sorted table drives both the printout and the bar chart,
    # so the two always agree on ordering.
    feat_df = (
        pd.DataFrame({'Feature': clf_features, 'Importance': importances})
        .sort_values(by='Importance', ascending=False)
        .reset_index(drop=True)
    )

    print(f"\nFeature Importances - {name}:\n")
    print(feat_df)

    positions = range(len(feat_df))
    plt.figure(figsize=(8, 5))
    plt.title(f"Feature Importance - {name}")
    plt.bar(positions, feat_df['Importance'], color='skyblue')
    plt.xticks(positions, feat_df['Feature'], rotation=45)
    plt.ylabel("Importance")
    plt.show()