In [1]:
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
import torch
from torch import nn

In [2]:
# Load the raw table and drop the columns that are not used as model inputs.
data = pd.read_csv('AIDS_Classification_50000.csv')
must_be_deleted = ['time' , 'strat' , 'str2' , 'race' ,'gender' , 'offtrt' , 'preanti' , 'trt' ]
data = data.drop(columns=must_be_deleted)

# Impute in two passes: column means for numeric columns, then the most
# frequent value for whatever is still missing (i.e. non-numeric columns).
# `numeric_only=True` matters: without it pandas >= 2.0 raises TypeError as
# soon as a non-numeric column is present, so the mode pass would never run.
data = data.fillna(data.mean(numeric_only=True))
data = data.fillna(data.mode().iloc[0])

In [3]:
def remove_outliers_iqr(df, columns):
    """Drop rows lying outside the 1.5 * IQR fences of each listed column.

    Columns are processed in order, and the quartiles of each column are
    computed on the rows that survived the previous columns' filters.
    Rows whose value is NaN in a processed column are dropped as well,
    because they fail the range comparison.

    Args:
        df: input DataFrame (not modified).
        columns: iterable of column names to filter on.

    Returns:
        The filtered DataFrame (the input object itself if `columns` is empty).
    """
    filtered = df
    for name in columns:
        q_low, q_high = (filtered[name].quantile(q) for q in (0.25, 0.75))
        spread = q_high - q_low
        # `between` is inclusive on both ends, matching >= lower and <= upper.
        inside_fences = filtered[name].between(q_low - 1.5 * spread,
                                               q_high + 1.5 * spread)
        filtered = filtered[inside_fences]
    return filtered

In [4]:
# Continuous columns that get the IQR outlier filter.
numerical_columns = ['age', 'wtkg', 'hemo', 'cd40', 'cd420', 'cd80', 'cd820']
data = remove_outliers_iqr(data, columns=numerical_columns)

# One-hot encode the two categorical flags, dropping the first level of each.
data = pd.get_dummies(data, columns=['symptom', 'treat'], drop_first=True)

# Separate the target from the feature matrix.
y = data['infected']
X = data.drop('infected', axis=1)

In [5]:
from sklearn.feature_selection import VarianceThreshold

# Remove near-constant features (variance threshold 0.01).
# X is rebound to the selector's output from here on.
var_thresh = VarianceThreshold(threshold=0.01)
var_thresh.fit(X)
X = var_thresh.transform(X)

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every remaining feature with a min-max scaler.
# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train/validation split further down, so validation rows influence the
# fitted min/max. Confirm this is acceptable.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [7]:
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=1441)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=1441)

In [9]:
class HIVDataset(Dataset):
    """In-memory tabular dataset yielding (features, label) tensor pairs.

    Args:
        X: array-like of shape (n_samples, n_features); stored as float32.
        y: array-like of length n_samples holding integer class labels
           (pandas Series, NumPy array or plain sequence); stored as int64.

    Raises:
        ValueError: if X and y do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # np.asarray accepts Series/DataFrames, arrays and lists alike, so the
        # dataset no longer depends on `y` being a pandas object.
        features = np.asarray(X, dtype=np.float32)
        labels = np.asarray(y)
        if len(features) != len(labels):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(features)} and {len(labels)}"
            )
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the (features, label) pair stored at position `idx`."""
        return self.X[idx], self.y[idx]

In [10]:
class CNN1D(torch.nn.Module):
    """Small 1-D CNN that treats a feature vector as a one-channel sequence.

    Three Conv1d -> ReLU -> BatchNorm1d -> MaxPool1d(2) stages (16, 32 and 64
    channels) are followed by a two-layer classifier head with dropout.

    Args:
        input_dim: number of input features; must be at least 8 because the
            three pooling stages shrink the length by a factor of 8.
        num_classes: number of output logits.

    Raises:
        ValueError: if `input_dim` is smaller than 8 (the pooled length would
            be zero and the first linear layer would have no inputs).
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        if input_dim < 8:
            raise ValueError(
                f"input_dim must be >= 8 for three pooling stages, got {input_dim}"
            )

        # Built as one flat Sequential so module indices (and therefore
        # state_dict keys) are conv_layers.0 ... conv_layers.11.
        stages = []
        in_channels = 1
        for out_channels in (16, 32, 64):
            stages += [
                nn.Conv1d(in_channels=in_channels, out_channels=out_channels,
                          kernel_size=3, padding=1),
                nn.ReLU(),
                nn.BatchNorm1d(out_channels),
                nn.MaxPool1d(kernel_size=2),
            ]
            in_channels = out_channels
        self.conv_layers = nn.Sequential(*stages)

        # Each pooling stage floors the length by 2, so the final length is
        # input_dim // 8 with 64 channels.
        self.fc_layers = nn.Sequential(
            nn.Linear(64 * (input_dim // 8), 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Map a (batch, input_dim) tensor to (batch, num_classes) logits."""
        x = x.unsqueeze(1)           # add the channel axis: (batch, 1, input_dim)
        x = self.conv_layers(x)
        x = x.flatten(start_dim=1)   # (batch, 64 * (input_dim // 8))
        return self.fc_layers(x)

In [11]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over `loader`; train if `optimizer` is given, else evaluate.

    Returns:
        (mean_loss, accuracy) as Python floats averaged over the samples
        actually seen, or (nan, nan) when the loader yields no batches.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is equivalent to eval()

    loss_sum, n_correct, n_seen = 0.0, 0, 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)

            if training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            batch_size = labels.size(0)
            # criterion returns a batch mean; weight it back to a sum.
            loss_sum += loss.item() * batch_size
            n_correct += (outputs.argmax(dim=1) == labels).sum().item()
            n_seen += batch_size

    if n_seen == 0:
        return float('nan'), float('nan')
    return loss_sum / n_seen, n_correct / n_seen


def train_model(model, dataloaders, criterion, optimizer, num_epochs, device=None):
    """Train `model` and report train/validation loss and accuracy per epoch.

    Args:
        model: the network to optimise (updated in place).
        dataloaders: mapping with 'train' and 'val' DataLoaders yielding
            (inputs, labels) batches.
        criterion: loss taking (outputs, labels) and returning a batch mean.
        optimizer: optimizer bound to `model`'s parameters.
        num_epochs: number of passes over the training loader.
        device: device batches are moved to. Defaults to the device of the
            model's parameters, so the function does not depend on a global
            `device` variable existing.

    The model is left in eval mode when the function returns.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(num_epochs):
        epoch_loss, epoch_acc = _run_epoch(
            model, dataloaders['train'], criterion, device, optimizer
        )
        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss:.4f} - Accuracy: {epoch_acc:.4f}")

        val_loss, val_acc = _run_epoch(model, dataloaders['val'], criterion, device)
        print(f"Validation - Loss: {val_loss:.4f} - Accuracy: {val_acc:.4f}")

In [12]:
# Wrap the training and held-out arrays as tensor datasets.
train_dataset, val_dataset = (
    HIVDataset(X_train, y_train),
    HIVDataset(X_test, y_test),
)

In [13]:
# Mini-batch loaders keyed by phase; only the training loader is shuffled.
dataloaders = {}
dataloaders['train'] = DataLoader(train_dataset, batch_size=32, shuffle=True)
dataloaders['val'] = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [14]:
# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch's global RNG before the model is built so that weight
# initialisation, dropout and DataLoader shuffling are repeatable across
# runs, consistent with the random_state=1441 used for the split/resampling.
torch.manual_seed(1441)

model = CNN1D(input_dim=X_train.shape[1], num_classes=len(np.unique(y_train))).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [32]:
# NOTE(review): the recorded log below starts at ~94% training accuracy in
# "Epoch 1", which suggests this cell was run on an already-trained model
# (leftover kernel state). Confirm with Restart Kernel -> Run All.
train_model(
    model,
    dataloaders,
    criterion,
    optimizer,
    num_epochs=500,
)

Epoch 1/500 - Loss: 0.1419 - Accuracy: 0.9371
Validation - Loss: 0.4849 - Accuracy: 0.8909
Epoch 2/500 - Loss: 0.1364 - Accuracy: 0.9391
Validation - Loss: 0.5062 - Accuracy: 0.8907
Epoch 3/500 - Loss: 0.1377 - Accuracy: 0.9391
Validation - Loss: 0.4743 - Accuracy: 0.8930
Epoch 4/500 - Loss: 0.1439 - Accuracy: 0.9368
Validation - Loss: 0.5265 - Accuracy: 0.8826
Epoch 5/500 - Loss: 0.1407 - Accuracy: 0.9386
Validation - Loss: 0.4997 - Accuracy: 0.8931
Epoch 6/500 - Loss: 0.1404 - Accuracy: 0.9400
Validation - Loss: 0.5063 - Accuracy: 0.8898
Epoch 7/500 - Loss: 0.1421 - Accuracy: 0.9367
Validation - Loss: 0.4818 - Accuracy: 0.8893
Epoch 8/500 - Loss: 0.1421 - Accuracy: 0.9368
Validation - Loss: 0.4807 - Accuracy: 0.8911
Epoch 9/500 - Loss: 0.1431 - Accuracy: 0.9383
Validation - Loss: 0.4848 - Accuracy: 0.8880
Epoch 10/500 - Loss: 0.1414 - Accuracy: 0.9396
Validation - Loss: 0.4667 - Accuracy: 0.8891
Epoch 11/500 - Loss: 0.1387 - Accuracy: 0.9379
Validation - Loss: 0.4727 - Accuracy: 0.89

In [33]:
# Persist the whole model object (not just its state_dict). Loading this file
# unpickles the object, so the CNN1D class must be importable at load time.
torch.save(obj=model, f='model.pth')

In [34]:
import json

# Record the settings that were actually used. Batch size and learning rate
# are read from the live objects so they cannot drift from the cells that
# created them.
# num_epochs mirrors the train_model(..., num_epochs=500) call; the value 50
# previously written here did not match it.
# NOTE(review): if training was interrupted early or run several times in the
# same kernel, the effective epoch count differs. Confirm.
config = {
    'batch_size': dataloaders['train'].batch_size,
    'learning_rate': optimizer.param_groups[0]['lr'],
    'num_epochs': 500,
    'input_size': int(X_train.shape[1]),
    'output_size': int(len(y_train.unique()))
}

with open('config.json', 'w') as f:
    json.dump(config, f)

In [35]:
import joblib

# Persist the fitted MinMaxScaler so the same scaling can be applied later.
# NOTE(review): `var_thresh` is applied before the scaler but is not saved
# here; if it removed any columns, inference code needs it too. Confirm.
joblib.dump(value=scaler, filename='scaler.pkl')

['scaler.pkl']

In [36]:
# Human-readable names for the two target classes, indexed by class id.
# JSON object keys are always strings, so the ids are written as "0" and "1".
class_labels = dict(enumerate(('Negative', 'Positive')))
with open('class_labels.json', 'w') as f:
    f.write(json.dumps(class_labels))