In [247]:
# https://platform.olimpiada-ai.ro/problems/54

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [248]:
train = pd.read_csv("/kaggle/input/disease-classification/train.csv")
test = pd.read_csv("/kaggle/input/disease-classification/test.csv")

train.shape, test.shape

((800, 6), (200, 5))

In [249]:
symptoms = set()

for ls in train['Symptoms'].map(lambda x: [s.strip() for s in x.split(',')]):
    for s in ls:
        symptoms.add(s)

symptoms = list(symptoms)
len(symptoms)

23

In [250]:
classes = train['Disease'].unique().tolist()

class2idx = {v: i for i, v in enumerate(classes)}
idx2class = {i: v for i, v in enumerate(classes)}

In [251]:
def process_df(df):
    if 'Disease' in df.columns:
        df['Disease'] = df['Disease'].map(class2idx.get)
    for s in symptoms:
        df[f'symptom_{s}'] = df['Symptoms'].map(lambda x: 1 if s in x else 0)
    dummies = pd.get_dummies(df['Gender'], columns=['Gender']).astype(int)
    df = pd.concat([df, dummies], axis=1)
    return df

train = process_df(train)
test = process_df(test)

In [252]:
from sklearn.model_selection import train_test_split
from catboost import Pool

features = ['Age', 'Female', 'Male', 'Other', 'Symptom_Count'] + [f'symptom_{s}' for s in symptoms]
target_col = ['Disease']

X, y = train[features], train[target_col]
X_test = test[features]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

train_pool = Pool(X_train, y_train)
valid_pool = Pool(X_valid, y_valid)
full_pool = Pool(X, y)

In [253]:
# from catboost import CatBoostClassifier

# params = {
#     'iterations': 100,
#     'loss_function': 'MultiClass',
#     'eval_metric': 'Accuracy',
#     'metric_period': 10,
#     'max_depth': 2
# }

# model = CatBoostClassifier(**params)

# model.fit(train_pool, eval_set=valid_pool)

In [254]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(max_iter=90, random_state=42, hidden_layer_sizes=(256, ))

model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [255]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_valid)
score = accuracy_score(y_pred, y_valid)

print(f'Score: {score:.5f}')

Score: 0.90625


In [256]:
model = MLPClassifier(max_iter=90, random_state=42, hidden_layer_sizes=(256, ))

model.fit(X, y)

  y = column_or_1d(y, warn=True)


In [257]:
y_pred = model.predict(X_test).flatten().tolist()
y_pred = list(map(idx2class.get, y_pred))

subm = pd.DataFrame({
    'Patient_ID': test['Patient_ID'],
    'Disease': y_pred
})

subm.to_csv("submission.csv", index=False)

subm.head()

Unnamed: 0,Patient_ID,Disease
0,T0001,Heart Disease
1,T0002,Influenza
2,T0003,Pneumonia
3,T0004,Migraine
4,T0005,Diabetes
