<a href="https://colab.research.google.com/github/buggytanmoy77/Neurothon/blob/main/Model(with_W%26B).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation of Pre-Requisite Libraries

In [None]:
!pip install wandb scikit-learn pandas

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval
import wandb
import joblib

# Initialize W&B

In [2]:
wandb.login()

wandb.init(project="disease-detector", config={
    "model_type": "TF-IDF-Cosine",
    "rare_penalty": 0.2,
    "top_n": 5,
    "internal_n": 20,
    "max_questions": 5
})

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnathh722[0m ([33mnathh722-nit-silchar[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Before running the code below, make sure to have the csv file( from Database_Creation.ipynb) on your Colab

# Load data and vectorizer

In [3]:
df = pd.read_csv('prefinal_diseases_with_symptoms_enhanced.csv')
df['Symptoms'] = df['Symptoms'].apply(lambda x: literal_eval(x) if isinstance(x, str) else [])

vectorizer = TfidfVectorizer()
symptom_vectors = vectorizer.fit_transform(df['Symptoms'].apply(lambda x: ' '.join(x)))


# Save and log vectorizer

In [4]:
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
wandb.save("tfidf_vectorizer.pkl")

def predict_diseases(user_symptoms, top_n=5, internal_n=20):
    user_vector = vectorizer.transform([' '.join(user_symptoms)])
    similarities = cosine_similarity(user_vector, symptom_vectors).flatten()
    adjusted_scores = similarities * (1 - 0.2 * df['IsRare'])
    ranked_indices = adjusted_scores.argsort()[::-1][:internal_n]
    ranked_diseases = df.iloc[ranked_indices]['Disease Name'].tolist()
    ranked_scores = adjusted_scores[ranked_indices]
    display_scores = ranked_scores[:top_n]
    probabilities = display_scores / display_scores.sum()

    wandb.log({
        "initial_symptoms": user_symptoms,
        "mean_similarity": similarities.mean(),
        "max_adjusted_score": adjusted_scores.max(),
        "top_diseases": wandb.Table(
            columns=["Disease", "Probability", "IsRare"],
            data=[[d, p, df[df['Disease Name'] == d]['IsRare'].iloc[0]]
                  for d, p in zip(ranked_diseases[:top_n], probabilities)]
        )
    })

    return {
        'display': list(zip(ranked_diseases[:top_n], probabilities)),
        'internal': list(zip(ranked_diseases, ranked_scores))
    }

### Function For Follow-up questions

In [5]:
def ask_follow_up_questions(user_symptoms, candidate_diseases, max_questions=5):
    follow_up_symptoms = set()
    for disease, _ in candidate_diseases:
        symptoms = df[df['Disease Name'] == disease]['Symptoms'].iloc[0]
        follow_up_symptoms.update(symptoms)
    follow_up_symptoms -= set(user_symptoms)
    return list(follow_up_symptoms)[:max_questions]

# Diagnose

In [6]:
def diagnose():
    user_symptoms = input("Enter your symptoms (comma-separated): ").lower().split(',')
    user_symptoms = [s.strip() for s in user_symptoms]
    wandb.init(project="disease-detector", reinit=True)
    run_data = {
        "initial_symptoms": user_symptoms.copy(),
        "follow_up_qa": {}
    }

    result = predict_diseases(user_symptoms)
    run_data["initial_diagnosis"] = result['display']

    follow_up_symptoms = ask_follow_up_questions(user_symptoms, result['internal'])

    print("\nTop 5 possible diseases:")
    for disease, prob in result['display']:
        print(f"- {disease} ({prob:.2%})")

    if follow_up_symptoms:
        qa_pairs = []
        for symptom in follow_up_symptoms:
            answer = input(f"Do you have '{symptom}'? (yes/no): ").lower()
            qa_pairs.append((symptom, answer))
            if answer == 'yes':
                user_symptoms.append(symptom)
        table = wandb.Table(
            columns=["Question", "Answer"],
            data=[[q, a] for q, a in qa_pairs]
        )
        run_data["follow_up_qa"] = table
        updated_result = predict_diseases(user_symptoms)
        run_data["final_diagnosis"] = updated_result['display']

        print("\nFinal diagnosis:")
        for disease, prob in updated_result['display']:
            print(f"- {disease} ({prob:.2%})")
    wandb.log({
        "initial_symptoms": ", ".join(run_data["initial_symptoms"]),
        "initial_diagnosis": wandb.Table(
            columns=["Disease", "Probability"],
            data=run_data["initial_diagnosis"]
        ),
        "follow_up_qa": run_data["follow_up_qa"] if follow_up_symptoms else None,
        "final_diagnosis": wandb.Table(
            columns=["Disease", "Probability"],
            data=run_data.get("final_diagnosis", [])
        )
    })

    wandb.finish()

diagnose()

Enter your symptoms (comma-separated): sneezing, coughing



Top 5 possible diseases:
- Subconjunctival hemorrhage (broken blood vessel in eye) (29.70%)
- Nonallergic rhinitis (19.48%)
- Infectious diseases (17.93%)
- Occupational asthma (16.65%)
- Asthma attack (16.23%)
Do you have 'Violent coughing'? (yes/no): no
Do you have 'Chest tightness or pain'? (yes/no): no
Do you have 'In a child, frequent upward rubbing of the nose'? (yes/no): yes
Do you have 'Runny or stuffy nose.'? (yes/no): yes
Do you have 'Can radiate to arms and shoulders'? (yes/no): no

Final diagnosis:
- Pet allergy (21.83%)
- Dust mite allergy (21.78%)
- Nonallergic rhinitis (21.07%)
- Bronchiolitis (18.89%)
- Mold allergy (16.44%)


0,1
max_adjusted_score,▁█
mean_similarity,▁█

0,1
initial_symptoms,"sneezing, coughing"
max_adjusted_score,0.57206
mean_similarity,0.03536
