In [None]:
# Expose Google Drive inside the Colab VM so the dataset and checkpoints are reachable
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import necessary libraries
import pandas as pd
import torch
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# Read the dataset from Google Drive; rows that cannot be parsed are skipped
file_path = '/content/drive/MyDrive/skindiseases.csv'  # Adjust the path if necessary
df = pd.read_csv(file_path, sep=',', on_bad_lines='skip')

# Quick visual sanity check of what was loaded
print("First few rows of the DataFrame:")
print(df.head())

# Step 1: turn each disease name into an integer id.
# The fitted encoder is kept so ids can be mapped back to names later.
label_encoder = LabelEncoder()
label_encoder.fit(df['Disease name'])  # Ensure 'Disease name' is the correct column name
df['label'] = label_encoder.transform(df['Disease name'])

# Step 2: the sorted set of label ids present in the data
unique_classes = np.unique(df['label'])

# Step 3: 'balanced' weights (rarer classes get larger weights), stored as a float tensor
class_weights = torch.tensor(
    compute_class_weight('balanced', classes=unique_classes, y=df['label']),
    dtype=torch.float,
)

# Show the resulting per-class weights
print("Class weights:", class_weights)


First few rows of the DataFrame:
        Disease name                                               Text
0           Vitiligo  "I've had these light patches on my neck and f...
1           Vitiligo                                 "I've patchy skin"
2            Scabies  "Doctor, I've noticed these small, red bumps o...
3           Vitiligo  "Doctor, I noticed a pale patch around my knee...
4  Hives (Urticaria)  Hives, also known as urticaria, typically pres...
Class weights: tensor([0.7635, 0.8589, 0.7012, 1.0105, 0.8380, 0.9286, 2.0210, 1.1083, 0.8380,
        1.2725, 0.8380, 1.5617, 3.1234, 0.7635])


In [None]:
# Install necessary libraries
!pip install transformers torch




In [None]:
# Import necessary libraries
import pandas as pd
import torch
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW

# Load your dataset from a CSV file located in your Google Drive
file_path = '/content/drive/MyDrive/skindiseases.csv'  # Adjust the path if necessary
# on_bad_lines='skip' drops rows that cannot be parsed instead of raising
df = pd.read_csv(file_path, delimiter=',', on_bad_lines='skip')

# Fail early with a readable message when an expected column is absent
required_columns = ['Disease name', 'Text']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Missing column(s) {missing_columns}; found {list(df.columns)}")

# Rows lacking a label or a text cannot be label-encoded or tokenized, so they
# are removed here (and reported) rather than failing further down.
rows_loaded = len(df)
df = df.dropna(subset=required_columns).reset_index(drop=True)
if len(df) < rows_loaded:
    print(f"Dropped {rows_loaded - len(df)} row(s) with a missing label or text.")

# The encoder and the tokenizer both expect plain strings
df[required_columns] = df[required_columns].astype(str)

# Check the first few rows of the DataFrame
print("First few rows of the DataFrame:")
print(df.head())

# Check the columns to ensure 'Disease name' exists
print("Columns in DataFrame:", df.columns)

# Step 1: Label encode the disease names (encoder is reused later to decode predictions)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Disease name'])

# Check if the label column is created successfully
print("After label encoding:")
print(df.head())  # This will show the 'label' column

# Step 2: Calculate 'balanced' class weights, one entry per encoded label
unique_classes = np.unique(df['label'])
class_weights = compute_class_weight('balanced', classes=unique_classes, y=df['label'])
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Print the calculated class weights
print("Class weights:", class_weights)

# Step size handed to the optimizer (adjust as needed)
learning_rate = 5e-5

# Tokenizer that matches the pre-trained DistilBERT checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenization settings shared by every text in the dataset
tokenizer_options = dict(
    add_special_tokens=True,     # Add '[CLS]' and '[SEP]'
    max_length=128,              # Max sequence length
    padding='max_length',        # Pad every sequence up to max_length
    truncation=True,             # Cut sequences longer than max_length
    return_attention_mask=True,  # Also produce attention masks
    return_tensors='pt',         # Return PyTorch tensors
)

# Encode the whole 'Text' column in one call ('Text' must be the text column name)
texts = df['Text'].tolist()
encodings = tokenizer(texts, **tokenizer_options)

# Create dataset for DistilBERT: (input ids, attention mask, label id) per example.
# Labels are made int64 explicitly because the classification loss expects long targets.
labels_tensor = torch.tensor(df['label'].values, dtype=torch.long)
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels_tensor)

# Create DataLoader for batching (reshuffled every epoch)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

# Load pre-trained DistilBERT with a classification head sized to the number of diseases
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))

# Define optimizer.
# torch.optim.AdamW replaces the deprecated AdamW shipped with transformers.
# NOTE(review): eps and weight_decay are spelled out to mirror what the
# transformers implementation defaulted to -- confirm against the installed version.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

# Check if everything is set up correctly
print("Model and DataLoader initialized successfully.")


First few rows of the DataFrame:
        Disease name                                               Text
0           Vitiligo  "I've had these light patches on my neck and f...
1           Vitiligo                                 "I've patchy skin"
2            Scabies  "Doctor, I've noticed these small, red bumps o...
3           Vitiligo  "Doctor, I noticed a pale patch around my knee...
4  Hives (Urticaria)  Hives, also known as urticaria, typically pres...
Columns in DataFrame: Index(['Disease name', 'Text'], dtype='object')
After label encoding:
        Disease name                                               Text  label
0           Vitiligo  "I've had these light patches on my neck and f...     13
1           Vitiligo                                 "I've patchy skin"     13
2            Scabies  "Doctor, I've noticed these small, red bumps o...     10
3           Vitiligo  "Doctor, I noticed a pale patch around my knee...     13
4  Hives (Urticaria)  Hives, also known as urtic

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and DataLoader initialized successfully.




In [None]:
# Checkpoint destinations on Google Drive: model weights first, optimizer state second.
# This cell only defines the paths; nothing is written here.
model_save_path, optimizer_save_path = (
    '/content/drive/MyDrive/skin_text_classifier_model.pt',
    '/content/drive/MyDrive/optimizer_state.pt',
)


In [None]:
# Set the model to training mode
model.train()

# Freeze the DistilBERT encoder. The substring test leaves every parameter whose
# name contains 'classifier' trainable, i.e. both the `pre_classifier` and the
# `classifier` layers (the ones reported as newly initialized at load time).
for name, param in model.named_parameters():
    if 'classifier' not in name:
        param.requires_grad = False

# Redefine the optimizer so it only sees the parameters that are still trainable.
# torch.optim.AdamW replaces the deprecated AdamW shipped with transformers.
# NOTE(review): eps and weight_decay are spelled out to mirror what the
# transformers implementation defaulted to -- confirm against the installed version.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=2e-5, eps=1e-6, weight_decay=0.0)

# Epoch range for this run (inclusive on both ends)
start_epoch = 1
num_epochs = 15

for epoch in range(start_epoch, num_epochs + 1):
    running_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()  # Clear previous gradients
        input_ids, attention_mask, labels = batch
        labels = labels.long()  # cross_entropy requires int64 class indices

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Class-weighted cross entropy: uses the 'balanced' weights computed from
        # the label distribution, which the model's built-in loss would ignore.
        loss = torch.nn.functional.cross_entropy(
            logits, labels, weight=class_weights.to(logits.device)
        )
        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights

        running_loss += loss.item()

    # Report the mean over all batches; a single batch's loss is too noisy to
    # tell whether training is progressing.
    avg_loss = running_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch} completed. Loss: {avg_loss:.4f}")

# Save the model after training
model_save_path = '/content/drive/MyDrive/skin_text_classifier_model.pt'  # Save to Google Drive
optimizer_save_path = '/content/drive/MyDrive/optimizer_state.pt'  # Save to Google Drive

torch.save(model.state_dict(), model_save_path)
torch.save(optimizer.state_dict(), optimizer_save_path)

print("Model and optimizer state saved successfully.")


Epoch 1 completed. Loss: 2.6759681701660156
Epoch 2 completed. Loss: 2.6977548599243164
Epoch 3 completed. Loss: 2.5063273906707764
Epoch 4 completed. Loss: 2.3700764179229736
Epoch 5 completed. Loss: 2.33984375
Epoch 6 completed. Loss: 2.242823839187622
Epoch 7 completed. Loss: 2.4188036918640137
Epoch 8 completed. Loss: 2.3527920246124268
Epoch 9 completed. Loss: 2.2914299964904785
Epoch 10 completed. Loss: 2.30540132522583
Epoch 11 completed. Loss: 2.1017820835113525
Epoch 12 completed. Loss: 2.4933996200561523
Epoch 13 completed. Loss: 2.5385289192199707
Epoch 14 completed. Loss: 2.6722412109375
Epoch 15 completed. Loss: 2.0987610816955566
Model and optimizer state saved successfully.


In [None]:
# Set the model to training mode
model.train()

# Freeze the DistilBERT encoder. The substring test leaves every parameter whose
# name contains 'classifier' trainable (both `pre_classifier` and `classifier`).
for name, param in model.named_parameters():
    if 'classifier' not in name:
        param.requires_grad = False

# Epoch numbers are only labels for the log: this cell does not load a
# checkpoint. It keeps training the `model` with the `optimizer` that already
# exist in the session, and writes to `model_save_path` / `optimizer_save_path`
# defined in an earlier cell.
start_epoch = 801  # Set this to the epoch you want to start from
num_epochs = 850   # Last epoch number to run (inclusive)

for epoch in range(start_epoch, num_epochs + 1):
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for batch in train_loader:
        optimizer.zero_grad()  # Clear previous gradients
        input_ids, attention_mask, labels = batch
        labels = labels.long()  # cross_entropy requires int64 class indices

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Class-weighted cross entropy: uses the 'balanced' weights computed from
        # the label distribution, which the model's built-in loss would ignore.
        loss = torch.nn.functional.cross_entropy(
            logits, labels, weight=class_weights.to(logits.device)
        )
        total_loss += loss.item()  # Accumulate loss

        # Training-set accuracy bookkeeping (measured with the model in train mode)
        predictions = torch.argmax(logits.detach(), dim=-1)
        total_correct += (predictions == labels).sum().item()
        total_samples += labels.size(0)

        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights

    # Mean batch loss and fraction of correctly classified training samples
    avg_loss = total_loss / len(train_loader)
    accuracy = total_correct / total_samples

    print(f"Epoch {epoch} completed. Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")  # Print loss and accuracy

    # Overwrite the checkpoint after every epoch so an interrupted run loses little
    torch.save(model.state_dict(), model_save_path)
    torch.save(optimizer.state_dict(), optimizer_save_path)

print("Model and optimizer state saved successfully.")

Epoch 801 completed. Loss: 0.6802, Accuracy: 0.8129
Epoch 802 completed. Loss: 0.6584, Accuracy: 0.8170
Epoch 803 completed. Loss: 0.6749, Accuracy: 0.7942
Epoch 804 completed. Loss: 0.7644, Accuracy: 0.8274
Epoch 805 completed. Loss: 0.6866, Accuracy: 0.8274
Epoch 806 completed. Loss: 0.6969, Accuracy: 0.8108
Epoch 807 completed. Loss: 0.6912, Accuracy: 0.8170
Epoch 808 completed. Loss: 0.6963, Accuracy: 0.8129
Epoch 809 completed. Loss: 0.6827, Accuracy: 0.8046
Epoch 810 completed. Loss: 0.7065, Accuracy: 0.8004
Epoch 811 completed. Loss: 0.6694, Accuracy: 0.8108
Epoch 812 completed. Loss: 0.6615, Accuracy: 0.8129
Epoch 813 completed. Loss: 0.7070, Accuracy: 0.8004
Epoch 814 completed. Loss: 0.6763, Accuracy: 0.8316
Epoch 815 completed. Loss: 0.6834, Accuracy: 0.7983
Epoch 816 completed. Loss: 0.6450, Accuracy: 0.8337
Epoch 817 completed. Loss: 0.6645, Accuracy: 0.8087
Epoch 818 completed. Loss: 0.6796, Accuracy: 0.8129
Epoch 819 completed. Loss: 0.6675, Accuracy: 0.8025
Epoch 820 co

In [None]:
# Inference setup: (re)load the tokenizer of the base checkpoint, then switch
# the model to evaluation mode
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

model.eval()

# Function to predict the disease from user input
def predict_disease(user_input):
    """Classify a free-text description and return the predicted disease name.

    Uses the notebook-level ``tokenizer``, ``model`` and ``label_encoder``.

    Args:
        user_input: The text to classify.

    Returns:
        The disease name that ``label_encoder`` maps the highest-scoring
        class index back to.
    """
    # Do not depend on whichever mode a previously run cell left the model in:
    # the training cells call model.train(), which would keep dropout active here.
    model.eval()

    # Tokenize with the same settings that were used for the training data
    inputs = tokenizer(
        user_input,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Put the inputs on whatever device the model's weights live on
    device = next(model.parameters()).device
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    # Highest logit -> class index -> original label
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_label = label_encoder.inverse_transform([predicted_class])

    return predicted_label[0]  # Return the predicted disease name

# Get user input for testing (surrounding whitespace carries no information)
user_input = input("Please enter the text for prediction: ").strip()

if user_input:
    predicted_disease = predict_disease(user_input)
    print(f"The predicted disease is: {predicted_disease}")
else:
    # The classifier always returns some class, even for empty text; say so
    # instead of presenting an arbitrary disease as a prediction.
    predicted_disease = None
    print("No text entered; skipping prediction.")



Please enter the text for prediction: I noticed my face gets flushed and red after intense workouts, especially in hot weather. It feels hot and tingly, and I'm worried it might be affecting my performance. The predicted disease is: Rosacea
The predicted disease is: Rosacea
