In [None]:
!pip install scikit-learn matplotlib



In [None]:
# Mount Google Drive into the Colab VM and make the project folder the working
# directory, so the relative data/model paths used below resolve inside it.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Backend_old

Mounted at /content/drive
/content/drive/MyDrive/Backend_old


In [None]:
import sys

# Put the project folder on the import path (presumably where the custom
# nltk_utils / nnet modules imported below live). The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
backend_dir = '/content/drive/MyDrive/Backend_old'
if backend_dir not in sys.path:
    sys.path.append(backend_dir)

# Building the Symptom Classifier Model

In [None]:
import json
import os
import pickle

import nltk
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Importing custom modules

In [None]:
from nltk_utils import tokenize, stem, build_vocab, sentence_to_indices
from nnet import NeuralNet

Loading dataset

In [None]:
# Read the labelled sentences. The encoding is pinned to UTF-8 instead of being
# left to the platform's locale default, so non-ASCII text decodes the same way
# on every machine.
with open("multi_label_dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

Collecting all tags

In [None]:
# Gather every distinct tag across the dataset, then give each one a stable
# position (alphabetical order) in the multi-hot label vector.
unique_tags = set()
for record in data:
    unique_tags.update(record['tags'])

all_tags = sorted(unique_tags)
tag2idx = dict(zip(all_tags, range(len(all_tags))))

Building vocab


In [None]:
# Pull the raw sentence out of every record and let build_vocab derive the
# vocabulary and the word -> index lookup from them.
all_sentences = []
for record in data:
    all_sentences.append(record["sentence"])

vocab, word2idx = build_vocab(all_sentences)

Defining Hyperparameters


In [None]:
# Length argument handed to sentence_to_indices and to NeuralNet
# (presumably the fixed pad/truncate length per sentence — confirm in nltk_utils).
max_len = 10
# Constructor arguments of NeuralNet (embedding width and hidden-layer width,
# judging by the names — confirm in nnet.py).
embed_size = 64
hidden_size = 32
# One model output per distinct tag.
output_size = len(all_tags)
# Number of entries in the word -> index lookup.
vocab_size = len(word2idx)
# Mini-batch size used by the DataLoader.
batch_size = 8
# Number of full passes over the training loader.
num_epochs = 1000
# Step size given to the Adam optimizer.
learning_rate = 0.001

Preparing training data

In [None]:
# Encode every sample as (token-index sequence, multi-hot tag vector).
encoded_sentences, label_vectors = [], []

for item in data:
    indices = sentence_to_indices(item["sentence"], word2idx, max_len)
    label_vector = [0] * output_size
    for tag in item["tags"]:
        label_vector[tag2idx[tag]] = 1
    encoded_sentences.append(indices)
    label_vectors.append(label_vector)

# Stack the samples into a single ndarray before handing them to torch:
# building a tensor straight from a list of per-sample arrays is slow and makes
# torch emit a UserWarning.
# NOTE(review): this assumes sentence_to_indices returns equal-length lists or
# numpy arrays — confirm in nltk_utils.
X_train = torch.tensor(np.array(encoded_sentences), dtype=torch.long)
y_train = torch.tensor(np.array(label_vectors), dtype=torch.float32)

class MultiLabelDataset(Dataset):
    """Map-style dataset pairing encoded sentences with multi-hot label vectors.

    Args:
        x_data: Indexable collection of encoded sentences. Defaults to the
            notebook-level ``X_train``.
        y_data: Indexable collection of label vectors aligned with ``x_data``.
            Defaults to the notebook-level ``y_train``.

    Raises:
        ValueError: If the two collections differ in length.
    """

    def __init__(self, x_data=None, y_data=None):
        # The fallback keeps the original zero-argument call working. Passing
        # the data explicitly avoids depending on whatever X_train / y_train
        # happen to be bound to at construction time (both names are reused
        # later in the notebook for the scikit-learn split).
        self.x_data = X_train if x_data is None else x_data
        self.y_data = y_train if y_data is None else y_data
        if len(self.x_data) != len(self.y_data):
            raise ValueError(
                f"x_data and y_data must have the same length, "
                f"got {len(self.x_data)} and {len(self.y_data)}"
            )
        self.n_samples = len(self.x_data)

    def __getitem__(self, index):
        """Return the ``(features, labels)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples."""
        return self.n_samples

# Wrap the prepared tensors in a Dataset and serve them as shuffled mini-batches.
dataset = MultiLabelDataset()
train_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
)

  X_train = torch.tensor(X_train, dtype=torch.long)


Model and training setup

In [None]:
# Seed torch's global RNG so the weight initialisation and the DataLoader's
# shuffling order are repeatable from one run to the next.
torch.manual_seed(42)

model = NeuralNet(vocab_size, embed_size, hidden_size, output_size, max_len)
# BCEWithLogitsLoss treats every tag as an independent yes/no decision, which
# is what multi-label targets need. It applies the sigmoid itself, so the model
# is expected to output raw logits — TODO confirm NeuralNet.forward has no
# final sigmoid.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
print("Training multi-label model...")
for epoch in range(num_epochs):
    # Accumulate a sample-weighted loss so the logged figure is the mean over
    # the whole epoch, not just whichever shuffled mini-batch came last.
    running_loss = 0.0
    samples_seen = 0

    for words, labels in train_loader:
        outputs = model(words)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_count = labels.size(0)
        running_loss += loss.item() * batch_count
        samples_seen += batch_count

    # `samples_seen` is zero only when the loader yielded nothing; skip the
    # report in that case instead of dividing by zero.
    if (epoch + 1) % 100 == 0 and samples_seen:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / samples_seen:.4f}")

print("✅ Training complete.")

Saving the model and metadata

In [None]:
# Bundle the weights with every constructor argument of NeuralNet, so the
# network can be rebuilt from the checkpoint alone. "embed_size" is an added
# key; all previously stored keys are unchanged.
model_data = {
    "model_state": model.state_dict(),
    "input_size": max_len,
    "embed_size": embed_size,
    "hidden_size": hidden_size,
    "output_size": output_size,
    "vocab_size": vocab_size,
    "tags": all_tags,
    "word2idx": word2idx
}

# Create the output folder on a fresh checkout; torch.save does not create
# missing parent directories.
os.makedirs("models", exist_ok=True)

torch.save(model_data, "models/multi_label_model.pth")
# The vocabulary is also written on its own as a plain pickle.
with open("models/multi_word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)

print("✅ Multi-label model and vocab saved.")

# Building the Disease Classifier Model


model = NeuralNet(vocab_size, embed_size, hidden_size, output_size, max_len)
criterion = nn.BCEWithLogitsLoss()  # multi-label loss
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Read the symptom/disease table and preview its first rows.
df = pd.read_csv("Training.csv")
print("First 5 rows of data:", df.head(), sep="\n")

First 5 rows of data:
   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  scurring  \
0       0           0             0        0                 0  ...         0   
1       0           0             0        0                 0  ...         0   
2       0           0             0        0                 0  ...         0   
3       0           0             0        0                 0  ...         0   
4       0           0             0        0                 0  ...         0   

   skin_peeling 

In [None]:
# Remove unnecessary unnamed columns.
# Rebinding `df` to the result (instead of inplace=True) avoids mutating a
# frame that an earlier cell already displayed.
unnamed_columns = [col for col in df.columns if 'Unnamed' in col]
df = df.drop(columns=unnamed_columns)

In [None]:
# Count the null entries per column.
missing_per_column = df.isna().sum()
print("Missing values in each column:")
print(missing_per_column)

Missing values in each column:
itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Length: 133, dtype: int64


In [None]:
# Every column except "prognosis" is a symptom indicator (model input);
# "prognosis" holds the disease name to predict (model output).
X = df.drop(columns=["prognosis"]).astype(int)
y = df.loc[:, "prognosis"]

In [None]:
# Learn the disease-name -> integer mapping, then apply it to the labels.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [None]:
# Persist the fitted encoder so the backend can map predicted integers back to
# disease names.
with open("label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(le, encoder_file)

In [None]:
# Persist the feature column names, in training order, for use in the backend.
symptom_names = list(X.columns)
with open("list_of_symptoms.pickle", "wb") as symptoms_file:
    pickle.dump(symptom_names, symptoms_file)

In [None]:
# Hold out 20% of the rows for testing, keeping the class proportions equal in
# both parts. This rebinds X_train / y_train, which earlier held the torch
# tensors of the symptom classifier.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

In [None]:
# Hold-out hyperparameter sweep used to pick one setting per model family.
def cross_validation(X_train, y_train, X_test, y_test, model_name, parameter_range=15):
    """Sweep one hyperparameter of a classifier and report the best setting.

    For every integer step ``1..parameter_range`` a fresh model is fitted on
    ``(X_train, y_train)`` and scored on both data sets. Despite the name, no
    k-fold splitting happens here: the choice rests solely on the second data
    set, so pass a validation split rather than the final test data.

    Args:
        X_train, y_train: Data the candidate models are fitted on.
        X_test, y_test: Held-out data used to choose the best step.
        model_name: ``'knn'`` (step is ``n_neighbors``), ``'dctree'`` (step is
            ``max_depth``), ``'logreg'`` (``C = 1 / (10 * step)``) or ``'svm'``
            (``C = 1 / (5 * step)``).
        parameter_range: Number of steps to try.

    Returns:
        Tuple ``(parameters, best_param, train_errors, test_errors)``.
        ``parameters`` is the array of steps. ``best_param`` is the
        hyperparameter value (not the step) with the lowest held-out error,
        the first one on ties. The error lists hold ``1 - score`` per step.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names or
            ``parameter_range`` is smaller than 1.
    """
    # Step -> hyperparameter value. Kept in one place so the value used for
    # fitting and the value reported as "best" cannot drift apart.
    step_to_value = {
        'knn': lambda step: step,
        'dctree': lambda step: step,
        'logreg': lambda step: 1 / (step * 10),
        'svm': lambda step: 1 / (step * 5),
    }
    # Hyperparameter value -> unfitted estimator.
    make_model = {
        'knn': lambda value: KNeighborsClassifier(n_neighbors=value),
        'dctree': lambda value: DecisionTreeClassifier(splitter='random', max_depth=value),
        'logreg': lambda value: LogisticRegression(solver='liblinear', C=value),
        'svm': lambda value: SVC(C=value),
    }

    # Fail early with a clear message instead of hitting an unbound `model`.
    if model_name not in make_model:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(make_model)}"
        )

    parameters = np.arange(1, parameter_range + 1)
    if parameters.size == 0:
        raise ValueError("parameter_range must be at least 1")

    train_errors, test_errors = [], []
    for parameter in parameters:
        model = make_model[model_name](step_to_value[model_name](parameter))
        model.fit(X_train, y_train)
        train_errors.append(1 - model.score(X_train, y_train))
        test_errors.append(1 - model.score(X_test, y_test))

    # argmin returns the first minimum, so ties go to the smallest step.
    best_step = parameters[np.argmin(test_errors)]
    best_param = step_to_value[model_name](best_step)

    return parameters, best_param, train_errors, test_errors

In [None]:
# Finding best hyperparameters for each model.
# Selection is done on a validation split carved out of the training data, so
# X_test / y_test stay unseen by the tuning step.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

best_params = {}
for model_name in ['knn', 'dctree', 'logreg', 'svm']:
    print(f"🔍 Tuning {model_name}...")
    _, best, _, _ = cross_validation(X_fit, y_fit, X_val, y_val, model_name)
    best_params[model_name] = best
    print(f"✅ Best {model_name} parameter: {best}\n")

🔍 Tuning knn...
✅ Best knn parameter: 1

🔍 Tuning dctree...
✅ Best dctree parameter: 15

🔍 Tuning logreg...
✅ Best logreg parameter: 0.1

🔍 Tuning svm...
✅ Best svm parameter: 0.2



In [None]:
# Defining the ensemble using stacking with the best models.
# The tree's random splitter and the SVC's probability estimation both draw
# random numbers; fixing random_state makes the pickled model reproducible.
base_models = [
    ('lr', LogisticRegression(solver='liblinear', C=best_params['logreg'])),
    ('knn', KNeighborsClassifier(n_neighbors=best_params['knn'])),
    ('dctree', DecisionTreeClassifier(splitter='random', max_depth=best_params['dctree'], random_state=42)),
    ('svm', SVC(C=best_params['svm'], probability=True, random_state=42))
]

In [None]:
# Stack the base models; a logistic regression combines their predictions,
# which are produced with 5-fold internal cross-validation.
ensemble = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5,
)

In [None]:
# Estimate ensemble accuracy with repeated stratified k-fold (3 repetitions,
# seeded), using all available CPU cores.
cv = RepeatedStratifiedKFold(n_repeats=3, random_state=1)
score = cross_val_score(
    ensemble,
    X,
    y_encoded,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

In [None]:
# Training the ensemble on the full dataset (train and test rows together).
# Nothing is written to disk here; the fitted model is pickled in a later cell.
ensemble.fit(X, y_encoded)


📊 Final Ensemble Accuracy: 1.0000 ± 0.0000


In [None]:
from sklearn.base import clone

# `ensemble` was fitted on the full dataset, which contains X_test, so scoring
# its members on X_test would report training accuracy. Fit an unfitted copy on
# the training split only; `ensemble` itself is left untouched.
holdout_ensemble = clone(ensemble).fit(X_train, y_train)
individual_models = holdout_ensemble.named_estimators_

# The loop names deliberately avoid `model`, which would otherwise rebind the
# notebook-level variable of that name.
for estimator_name, estimator in individual_models.items():
    y_pred = estimator.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy of {estimator_name}: {accuracy:.4f}")

Accuracy of lr: 1.0000
Accuracy of knn: 1.0000
Accuracy of dctree: 0.4339
Accuracy of svm: 1.0000


In [None]:
# Write the fitted stacking ensemble to disk.
with open("fitted_model_stacked_final.pkl", "wb") as model_file:
    pickle.dump(ensemble, model_file)

# Mean and spread of the repeated stratified k-fold accuracy scores.
print(f"\n📊 Final Ensemble Accuracy: {np.mean(score):.4f} ± {np.std(score):.4f}")


📊 Final Ensemble Accuracy: 1.0000 ± 0.0000
