In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import torch

# Load dataset
dataset_path = "/kaggle/input/skindataset/cleaned_skindiseasesdataset.csv"
model_save_path = "/kaggle/working/skin_disease_classifier"
df = pd.read_csv(dataset_path, encoding="ISO-8859-1")

# Data preprocessing
df = df.rename(columns={"Input": "text", "Output": "label"})
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Convert the entire dataset to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_data(example):
    return tokenizer(example['text'], truncation=True, padding="max_length", max_length=128)

# Tokenize dataset
tokenized_dataset = dataset.map(preprocess_data, batched=True, num_proc=4)

# Load model
num_labels = len(df['label'].unique())
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_labels
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=200,  # Train for a fixed number of epochs
    weight_decay=0.01,
    save_total_limit=2,
    fp16=True,  # Enable mixed precision training
    learning_rate=2e-5,
    warmup_steps=500,
    lr_scheduler_type="cosine",
    report_to="none",
    dataloader_num_workers=4,
    seed=42,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Use the entire dataset for training
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save model and tokenizer
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model trained and saved at: {model_save_path}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/483 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  self.pid = os.fork()


Step,Training Loss
50,2.6433
100,2.6028
150,2.496
200,2.242
250,1.8475
300,1.3661
350,0.9329
400,0.5403
450,0.2379
500,0.0881


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


Model trained and saved at: /kaggle/working/skin_disease_classifier


In [21]:
# Load the saved tokenizer and model
from transformers import pipeline

model_path = "/kaggle/working/skin_disease_classifier"

tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Create a pipeline for text classification
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Test the model on user input
def test_model():
    while True:
        user_input = input("Enter a symptom description (or type 'exit' to quit): ")
        if user_input.lower() == "exit":
            print("Exiting...")
            break
        
        # Get prediction
        prediction = classifier(user_input)
        label_id = int(prediction[0]["label"].split("_")[-1])  # Extract label ID
        predicted_label = label_encoder.inverse_transform([label_id])[0]
        
        print(f"Predicted Skin Condition: {predicted_label}")
        print(f"Confidence Score: {prediction[0]['score']:.2f}\n")

# Start testing
test_model()


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Enter a symptom description (or type 'exit' to quit):  "I’ve never had anything like this before; my skin is covered in welts."


Predicted Skin Condition: Hives (Urticaria)
Confidence Score: 1.00



Enter a symptom description (or type 'exit' to quit):  "Wearing shoes is painful because of the irritation on my feet."


Predicted Skin Condition: Athlete's Foot (Tinea Pedis)
Confidence Score: 1.00



Enter a symptom description (or type 'exit' to quit):  "I experienced a rash on my feet after wearing new shoes that irritated my skin."


Predicted Skin Condition: Contact Dermatitis
Confidence Score: 1.00



Enter a symptom description (or type 'exit' to quit):  "My face breaks out so easily, but it's not like typical acne. I have these red, bumpy patches that get worse when I'm stressed or hot. My friends don't understand, and I'm feeling really self-conscious about my skin."'


Predicted Skin Condition: Rosacea
Confidence Score: 1.00



Enter a symptom description (or type 'exit' to quit):  reddish sores, often around the nose and mouth.


Predicted Skin Condition: Contact Dermatitis
Confidence Score: 0.32



Enter a symptom description (or type 'exit' to quit):  I have  red sores or blisters, but the redness may be harder to see on brown and black skin. The sores or blisters quickly burst


Predicted Skin Condition: Impetigo
Confidence Score: 0.48



Enter a symptom description (or type 'exit' to quit):  exit


Exiting...


In [76]:
import pandas as pd

# Load the dataset
file_path = "/kaggle/input/skindataset/cleaned_skindiseasesdataset.csv"  # Update with the path to your dataset
df = pd.read_csv(file_path, encoding="ISO-8859-1")

# Check the column names to ensure correct loading
print("Dataset Columns:", df.columns)

# Extract unique disease names
unique_diseases = df['Output'].unique()  # Replace 'Output' with the correct column name if different
print("Unique Diseases:")
for idx, disease in enumerate(unique_diseases):
    print(f"{idx}: {disease}")

# Save the mapping (optional)
mapping = {idx: disease for idx, disease in enumerate(unique_diseases)}
print("\nMapping:", mapping)


Dataset Columns: Index(['Output', 'Input'], dtype='object')
Unique Diseases:
0: Vitiligo
1: Scabies
2: Hives (Urticaria)
3: Folliculitis
4: Ringworm (Tinea Corporis)
5: Athlete's Foot (Tinea Pedis)
6: Rosacea
7: Psoriasis
8: Shingles
9: Contact Dermatitis
10: Acne
11: Eczema
12: Shingles (Herpes Zoster)
13: Impetigo

Mapping: {0: 'Vitiligo', 1: 'Scabies', 2: 'Hives (Urticaria)', 3: 'Folliculitis', 4: 'Ringworm (Tinea Corporis)', 5: "Athlete's Foot (Tinea Pedis)", 6: 'Rosacea', 7: 'Psoriasis', 8: 'Shingles', 9: 'Contact Dermatitis', 10: 'Acne', 11: 'Eczema', 12: 'Shingles (Herpes Zoster)', 13: 'Impetigo'}


In [1]:
import os
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch

# Path to the folder containing the model weights and other files
model_folder = '/kaggle/input/skin-disease-classifier/'  # Path to your folder with model files

# Load tokenizer (from the folder containing tokenizer config and vocab)
distilBert_tokenizer = DistilBertTokenizer.from_pretrained(model_folder)

# Load model configuration and model weights (from the folder containing model.safetensors)
distilBert_model = DistilBertForSequenceClassification.from_pretrained(model_folder)

# Confirm the model is loaded successfully
print("Model and tokenizer loaded successfully.")


Model and tokenizer loaded successfully.


In [2]:
!pip install transformers torch accelerate

import os
os.environ["HF_TOKEN"] = "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"

HF_TOKEN = os.environ["HF_TOKEN"]
print(HF_TOKEN)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

model = "meta-llama/Llama-2-7b-chat-hf" # meta-llama/Llama-2-7b-chat-hf

tokenizer = AutoTokenizer.from_pretrained(model, token=HF_TOKEN)
llama_model = AutoModelForCausalLM.from_pretrained(model)

from transformers import pipeline

llama_pipeline = pipeline(
    "text-generation",  # LLM task
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)



SYSTEM_PROMPT = """<s>[INST] <<SYS>>
You are a helpful bot. Your answers are clear and concise.
<</SYS>>

"""

# Formatting function for message and history
def format_message(message: str, history: list, memory_limit: int = 3) -> str:
    """
    Formats the message and history for the Llama model.

    Parameters:
        message (str): Current message to send.
        history (list): Past conversation history.
        memory_limit (int): Limit on how many past interactions to consider.

    Returns:
        str: Formatted message string
    """
    # always keep len(history) <= memory_limit
    if len(history) > memory_limit:
        history = history[-memory_limit:]

    if len(history) == 0:
        return SYSTEM_PROMPT + f"{message} [/INST]"

    formatted_message = SYSTEM_PROMPT + f"{history[0][0]} [/INST] {history[0][1]} </s>"

    # Handle conversation history
    for user_msg, model_answer in history[1:]:
        formatted_message += f"<s>[INST] {user_msg} [/INST] {model_answer} </s>"

    # Handle the current message
    formatted_message += f"<s>[INST] {message} [/INST]"

    return formatted_message



# Generate a response from the Llama model
def get_llama_response(message: str, history: list) -> str:
    """
    Generates a conversational response from the Llama model.

    Parameters:
        message (str): User's input message.
        history (list): Past conversation history.

    Returns:
        str: Generated response from the Llama model.
    """
    query = format_message(message, history)
    response = ""

    sequences = llama_pipeline(
        query,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=1024,
    )

    generated_text = sequences[0]['generated_text']
    response = generated_text[len(query):]  # Remove the prompt from the output

    # print("Chatbot:", response.strip())
    return response.strip()

print("Running")
response = get_llama_response("Hello", [])
print("Here I am ")
print("ChatbotResponse:   ", response)

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


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Running
Here I am 
ChatbotResponse:    Hello! *smiling* It's nice to meet you. Is there something I can help you with or would you like to chat?


In [4]:
!pip install flask flask-ngrok transformers torch




In [8]:
!pip install flask flask-ngrok flask-cors transformers torch





In [26]:
import sys

# Formatting function for message and history
SYSTEM_PROMPT = """<s>[INST] <<SYS>>
You are a helpful bot. Your answers are clear and concise.
<</SYS>>

"""

def format_message_with_prediction(message: str, disease_prediction: str, history: list, memory_limit: int = 3) -> str:
    """
    Formats the message and history for the Llama model with a disease prediction.

    Parameters:
        message (str): Current message to send.
        disease_prediction (str): Disease predicted by DistilBERT.
        history (list): Past conversation history.
        memory_limit (int): Limit on how many past interactions to consider.

    Returns:
        str: Formatted message string.
    """
    # Limit history to memory limit
    if len(history) > memory_limit:
        history = history[-memory_limit:]

    # Start with the system prompt
    formatted_message = SYSTEM_PROMPT

    # Append conversation history
    for user_msg, model_answer in history:
        formatted_message += f"<s>[INST] {user_msg} [/INST] {model_answer} </s>"

    # Add the current message with the disease prediction
    formatted_message += f"<s>[INST] Based on the symptoms you told me, you may have {disease_prediction}. {message} [/INST]"

    return formatted_message


def predict_disease(input_text: str) -> str:
    """
    Predicts the disease using the DistilBERT model.

    Parameters:
        input_text (str): The user's input describing symptoms.

    Returns:
        str: Predicted disease.
    """
    disease_mapping = {
        0: 'Vitiligo',
        1: 'Scabies',
        2: 'Hives (Urticaria)',
        3: 'Folliculitis',
        4: 'Ringworm (Tinea Corporis)',
        5: "Athlete's Foot (Tinea Pedis)",
        6: 'Rosacea',
        7: 'Psoriasis',
        8: 'Shingles',
        9: 'Contact Dermatitis',
        10: 'Acne',
        11: 'Eczema',
        12: 'Shingles (Herpes Zoster)',
        13: 'Impetigo'
    }

    inputs = distilBert_tokenizer(input_text, return_tensors="pt")
    outputs = distilBert_model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    disease_label = disease_mapping.get(predicted_class, "Unknown Disease")
    return disease_label


def get_combined_response(message: str, history: list) -> str:
    """
    Combines DistilBERT and LLaMA2 to generate a response.

    Parameters:
        message (str): User's input message.
        history (list): Past conversation history.

    Returns:
        str: Generated response from the LLaMA2 model.
    """
    # Step 1: Predict disease using DistilBERT
    predicted_disease = predict_disease(message)

    # Step 2: Format the input for LLaMA2
    formatted_message = format_message_with_prediction(message, predicted_disease, history)

    # Step 3: Generate response using LLaMA2
    sequences = llama_pipeline(
        formatted_message,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=1024,
    )

    # Extract generated text
    generated_text = sequences[0]['generated_text']
    response = generated_text[len(formatted_message):].strip()

    return response


def chat():
    """
    Provides a continuous chat interface for the user.
    The chat ends when the user types "exit".
    """
    print("Chatbot is running. Type your message below (type 'exit' to quit):")
    history = []

    while True:
        user_message = input("You: ").strip()
        if user_message.lower() == "exit":
            print("Chatbot: Goodbye! Take care!")
            break

        response = get_combined_response(user_message, history)
        history.append((user_message, response))
        print(f"Chatbot: {response}")


# Start the chat interface
if __name__ == "__main__":
    chat()


Chatbot is running. Type your message below (type 'exit' to quit):


You:  I ahve dry and itchy scaly skin


Chatbot: I see, thank you for sharing that with me! Based on your symptoms, it's possible that you may have acne. Acne can cause dry, itchy, and scaly skin, especially on the face, chest, and back. It's important to keep in mind that acne can be caused by a variety of factors, including hormonal changes, genetics, and environmental factors like humidity and stress.

If you think you may have acne, there are several things you can try to help manage your symptoms:

1. Keep your skin clean: Wash your face twice a day with a gentle cleanser to remove dirt and oil that can clog pores.
2. Use over-the-counter acne treatments: Look for products containing benzoyl peroxide or salicylic acid, which can help kill bacteria and reduce inflammation.
3. Avoid picking or popping pimples: This can lead to further inflammation and scarring.
4. Try a spot treatment: Apply a spot treatment containing benzoyl peroxide or salicylic acid to individual pimples to help reduce inflammation and dry them out.
5

You:  exit


Chatbot: Goodbye! Take care!


In [27]:
# Your disease prediction and response generation logic
def predict_disease(input_text):
    # Disease prediction using DistilBERT
    inputs = distilBert_tokenizer(input_text, return_tensors="pt")
    outputs = distilBert_model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    disease_mapping = {
        0: 'Vitiligo',
        1: 'Scabies',
        2: 'Hives (Urticaria)',
        3: 'Folliculitis',
        4: 'Ringworm (Tinea Corporis)',
        5: "Athlete's Foot (Tinea Pedis)",
        6: 'Rosacea',
        7: 'Psoriasis',
        8: 'Shingles',
        9: 'Contact Dermatitis',
        10: 'Acne',
        11: 'Eczema',
        12: 'Shingles (Herpes Zoster)',
        13: 'Impetigo'
    }
    disease_label = disease_mapping.get(predicted_class, "Unknown Disease")
    return disease_label

def generate_response(message, history):
    # Disease prediction
    predicted_disease = predict_disease(message)

    # Formatting the message for LLaMA2
    formatted_message = f"Based on your symptoms, you may have {predicted_disease}. {message}"

    # Generate response using LLaMA2
    inputs = llama2_tokenizer(formatted_message, return_tensors="pt")
    response_ids = llama2_model.generate(inputs["input_ids"], max_length=100)
    response = llama2_tokenizer.decode(response_ids[0], skip_special_tokens=True)
    return response