# Exploration

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Target dtype for every numeric column of the patient table.
patient_column_types = {
    'Sedentary_hours_daily': float,
    'Regular_fiber_diet': float,
    'Age': float,
    'Height': float,
    'Weight': float,
    'Est_avg_calorie_intake': int,
    'Main_meals_daily': int,
    'Water_daily': int,
    'Physical_activity_level': int,
    'Technology_time_use': int,
}

# decimal=',' tells pandas that the file uses a comma as the decimal separator;
# the explicit casts then pin each numeric column to its intended dtype.
patients_data = pd.read_csv('data.csv', decimal=',').astype(patient_column_types)

In [None]:
# Bar chart with the number of patients per 'Diagnostic' value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='Diagnostic', data=patients_data, ax=ax)
ax.set_title('Class distribution in train set patients')
plt.show()

In [None]:
# Both PTBDB files are header-less; every row is one series.
# A trailing 'label' column marks the origin of each row: 0 = normal, 1 = abnormal.
ptbdb_normal = pd.read_csv('ptbdb_normal.csv', header=None).assign(label=0)
ptbdb_abnormal = pd.read_csv('ptbdb_abnormal.csv', header=None).assign(label=1)

# Stack normal rows first, then abnormal rows, and renumber the index from 0.
ptbdb = pd.concat([ptbdb_normal, ptbdb_abnormal], axis=0, ignore_index=True)

In [None]:
# Plot the first series of each class (all columns except the trailing 'label').
# Each plot gets its own explicitly sized figure: plt.show() finishes the
# current figure, so a single plt.figure() call would only size the first plot
# and the second one would fall back to the default figure size.

normal_example = ptbdb[ptbdb['label'] == 0].iloc[0, :-1]
plt.figure(figsize=(12, 6))
plt.plot(normal_example, label='Normal')
plt.title('Normal series example')
plt.legend()
plt.show()

anormal_example = ptbdb[ptbdb['label'] == 1].iloc[0, :-1]
plt.figure(figsize=(12, 6))
plt.plot(anormal_example, label='Anormal')
plt.title('Anormal series example')
plt.legend()
plt.show()

In [None]:
# Per-class signal columns (everything except the trailing 'label' column).
normal_signals = ptbdb[ptbdb['label'] == 0].iloc[:, :-1]
anormal_signals = ptbdb[ptbdb['label'] == 1].iloc[:, :-1]

# Column-wise mean and standard deviation of each class.
mean_normal, std_normal = normal_signals.mean(), normal_signals.std()
mean_anormal, std_anormal = anormal_signals.mean(), anormal_signals.std()

# One figure per class: the mean curve with a +/- one standard deviation band.
for class_mean, class_std, curve_label, figure_title in (
    (mean_normal, std_normal, 'Normal mean', 'Normal mean and standard deviation'),
    (mean_anormal, std_anormal, 'Anormal mean', 'Anormal mean and standard deviation'),
):
    plt.figure(figsize=(12, 6))
    plt.plot(class_mean, label=curve_label)
    lower_band = class_mean - class_std
    upper_band = class_mean + class_std
    plt.fill_between(range(len(class_mean)), lower_band, upper_band, alpha=0.2)
    plt.title(figure_title)
    plt.legend()
    plt.show()

# Train

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Preprocess the patient table and build the train/test split.
#
# Run this cell once on freshly loaded data: like the original, it overwrites
# columns of `patients_data` in place (encoded categoricals, imputed and
# standardized numerics).
numeric_columns = ['Sedentary_hours_daily', 'Regular_fiber_diet', 'Age', 'Est_avg_calorie_intake',
                   'Main_meals_daily', 'Height', 'Water_daily', 'Weight',
                   'Physical_activity_level', 'Technology_time_use']
categorical_columns = ['Transportation', 'Diagnostic_in_family_history', 'High_calorie_diet',
                       'Alcohol', 'Snacks', 'Smoker', 'Calorie_monitoring', 'Gender']

# Snapshot of the raw categorical values, taken before they are encoded.
categorical_attributes = patients_data[categorical_columns].copy()

# encode categorical attributes
# One encoder per column, so every mapping stays available for inverse_transform.
categorical_encoders = {}
for column in categorical_columns:
    categorical_encoders[column] = LabelEncoder()
    patients_data[column] = categorical_encoders[column].fit_transform(patients_data[column])
encoder = LabelEncoder()  # `encoder` holds the mapping of the target column
patients_data['Diagnostic'] = encoder.fit_transform(patients_data['Diagnostic'])

# Decide the split first (row positions only) so that every statistic below is
# fitted on training rows exclusively. The partition depends only on the number
# of rows and random_state, so it matches splitting the frame directly.
train_pos, test_pos = train_test_split(np.arange(len(patients_data)), test_size=0.2, random_state=42)

# -1 is the missing-value marker of this table; turn it into NaN so that it
# neither distorts the z-scores nor escapes imputation.
numeric_attributes = patients_data[numeric_columns].astype(float).replace(-1, np.nan)

# remove outliers
# |z| > 3 relative to the training rows (population std, NaN-aware); a
# zero-variance column yields NaN z-scores and is therefore never masked.
train_numeric = numeric_attributes.iloc[train_pos]
train_std = train_numeric.std(ddof=0).replace(0, np.nan)
z_scores = (numeric_attributes - train_numeric.mean()) / train_std
numeric_attributes = numeric_attributes.mask(z_scores.abs() > 3)

# treat missing values
# Fit on all numeric columns together: with a single column IterativeImputer
# has no other feature to regress on.
imputer = IterativeImputer(max_iter=10, random_state=42)
imputer.fit(numeric_attributes.iloc[train_pos])
numeric_imputed = imputer.transform(numeric_attributes)

# standardize data
scaler = StandardScaler()
scaler.fit(numeric_imputed[train_pos])
patients_data[numeric_columns] = scaler.transform(numeric_imputed)

# Materialize the split decided above.
patients_features = patients_data.drop('Diagnostic', axis=1)
X_patients_train, X_patients_test = patients_features.iloc[train_pos], patients_features.iloc[test_pos]
Y_patients_train = patients_data['Diagnostic'].iloc[train_pos]
Y_patients_test = patients_data['Diagnostic'].iloc[test_pos]

In [None]:
# Features are every column but the last; the last column is the integer label.
X_ptbdb = ptbdb.iloc[:, :-1].to_numpy()
Y_ptbdb = ptbdb.iloc[:, -1].to_numpy().astype(int)

# 80/20 split with a fixed seed.
(X_ptbdb_train, X_ptbdb_test,
 Y_ptbdb_train, Y_ptbdb_test) = train_test_split(X_ptbdb, Y_ptbdb, test_size=0.2, random_state=42)

# Standardization statistics come from the training split only and are then
# applied to both splits.
scaler = StandardScaler().fit(X_ptbdb_train)
X_ptbdb_train, X_ptbdb_test = scaler.transform(X_ptbdb_train), scaler.transform(X_ptbdb_test)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class MLP(nn.Module):
    """Two-hidden-layer perceptron that returns raw class logits.

    Layout: Linear(input_size, 128) -> ReLU -> Dropout(0.2)
            -> Linear(128, 64) -> ReLU -> Linear(64, num_classes).

    Args:
        input_size: Number of features per sample.
        num_classes: Number of output logits. Defaults to 5, the width that
            was previously hard-coded; pass the real number of target classes
            when it differs.
    """

    def __init__(self, input_size, num_classes=5):
        super(MLP, self).__init__()
        self.hidden1 = nn.Linear(input_size, 128)
        self.hidden2 = nn.Linear(128, 64)
        self.output = nn.Linear(64, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Map a float tensor whose last dimension is `input_size` to logits.

        No softmax is applied; the output has `num_classes` values per sample.
        """
        x = self.relu(self.hidden1(x))
        # Dropout is applied after the first hidden layer only.
        x = self.dropout(x)
        x = self.relu(self.hidden2(x))
        x = self.output(x)
        return x
    
class CNN1D(nn.Module):
    """1-D convolutional classifier producing two logits per input series.

    Three length-preserving convolutions (kernel 5, padding 2) widen the
    channels 1 -> 32 -> 64 -> 128, adaptive average pooling collapses the
    length axis to one value per channel, and a 128 -> 64 -> 2 fully
    connected head (dropout 0.5 before the last layer) emits the logits.
    """

    def __init__(self):
        super().__init__()
        conv_options = dict(kernel_size=5, padding=2)
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, **conv_options)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, **conv_options)
        self.conv3 = nn.Conv1d(in_channels=64, out_channels=128, **conv_options)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 2)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return logits of shape (batch, 2) for `x` of shape (batch, length)."""
        # Insert the single input-channel axis expected by Conv1d.
        feature_map = x.unsqueeze(1)
        for conv_layer in (self.conv1, self.conv2, self.conv3):
            feature_map = self.relu(conv_layer(feature_map))
        # (batch, 128, 1) -> (batch, 128)
        pooled = self.pool(feature_map).squeeze(-1)
        hidden = self.dropout(self.relu(self.fc1(pooled)))
        return self.fc2(hidden)
    
def train(num_epochs, model, train_loader, criterion, optimizer):
    """Train `model` for `num_epochs` passes over `train_loader`.

    Args:
        num_epochs: Number of full passes over the loader.
        model: Module being optimized; switched to train mode every epoch.
        train_loader: Iterable of `(X_batch, Y_batch)` pairs.
        criterion: Callable `criterion(outputs, Y_batch)` returning a scalar
            loss tensor.
        optimizer: Optimizer bound to the model's parameters.

    Returns:
        A list with one entry per epoch: the sample-weighted mean loss of that
        epoch (NaN when the loader yielded no samples). The same value is
        printed after every epoch.
    """
    epoch_losses = []
    for epoch in range(num_epochs):
        model.train()
        loss_sum = 0.0
        samples_seen = 0
        for X_batch, Y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, Y_batch)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a smaller final batch does not skew the
            # epoch mean (assumes the criterion averages over the batch).
            batch_size = len(Y_batch)
            loss_sum += loss.item() * batch_size
            samples_seen += batch_size
        # Report the whole epoch instead of whichever batch happened to be
        # last; an empty loader no longer raises NameError on `loss`.
        epoch_loss = loss_sum / samples_seen if samples_seen else float('nan')
        epoch_losses.append(epoch_loss)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')
    return epoch_losses

def eval(model, test_loader):
    """Print and return the classification accuracy of `model` in percent.

    Note: this name shadows the built-in `eval` for the rest of the notebook.

    Args:
        model: Module returning one row of class scores per sample; it is
            switched to evaluation mode and left in that mode.
        test_loader: Iterable of `(X_batch, Y_batch)` pairs.

    Returns:
        Accuracy as a float in [0, 100], or NaN when the loader yields no
        samples (previously a ZeroDivisionError).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, Y_batch in test_loader:
            # Predicted class = index of the highest score in each row.
            predicted = model(X_batch).argmax(dim=1)
            total += Y_batch.size(0)
            correct += (predicted == Y_batch).sum().item()
    accuracy = 100 * correct / total if total else float('nan')
    print(f'Accuracy: {accuracy:.2f}%')
    return accuracy

In [None]:
# Train and evaluate the MLP on the patient table.
#
# The patient splits are pandas objects, which torch.tensor() cannot consume
# directly; np.asarray() turns them into plain arrays first (and is a no-op
# for inputs that already are arrays).
X_train_tensor = torch.tensor(np.asarray(X_patients_train, dtype=np.float32), dtype=torch.float32)
Y_train_tensor = torch.tensor(np.asarray(Y_patients_train), dtype=torch.long)
X_test_tensor = torch.tensor(np.asarray(X_patients_test, dtype=np.float32), dtype=torch.float32)
Y_test_tensor = torch.tensor(np.asarray(Y_patients_test), dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# NOTE(review): MLP emits 5 logits by default - confirm that 'Diagnostic' has
# at most 5 classes, otherwise CrossEntropyLoss rejects the larger labels.
model = MLP(input_size=X_train_tensor.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train(10, model, train_loader, criterion, optimizer)
eval(model, test_loader)

In [None]:
# Train and evaluate the MLP on the scaled PTBDB series.

# Features become float32 tensors, labels int64 tensors.
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_ptbdb_train, X_ptbdb_test)
)
Y_train_tensor, Y_test_tensor = (
    torch.tensor(labels, dtype=torch.long)
    for labels in (Y_ptbdb_train, Y_ptbdb_test)
)

train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

# Batches of 64; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# One input unit per column of the training matrix.
n_features = X_ptbdb_train.shape[1]
model = MLP(input_size=n_features)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train(10, model, train_loader, criterion, optimizer)
eval(model, test_loader)

In [None]:
# Train and evaluate the 1-D CNN on the scaled PTBDB series.

# Rebuild tensors, datasets and loaders so this cell does not depend on the
# state left behind by the previous training cell.
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_ptbdb_train, X_ptbdb_test)
)
Y_train_tensor, Y_test_tensor = (
    torch.tensor(labels, dtype=torch.long)
    for labels in (Y_ptbdb_train, Y_ptbdb_test)
)

train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

# Batches of 64; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

model = CNN1D()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

train(10, model, train_loader, criterion, optimizer)
eval(model, test_loader)