In [2]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Symptom2Disease.csv')

In [3]:
df.isna().sum()

label    0
text     0
dtype: int64

In [4]:
display(df.duplicated().sum())

47

In [5]:
df.drop_duplicates

<bound method DataFrame.drop_duplicates of                     label                                               text
0               Psoriasis  I have been experiencing a skin rash on my arm...
1               Psoriasis  My skin has been peeling, especially on my kne...
2               Psoriasis  I have been experiencing joint pain in my fing...
3               Psoriasis  There is a silver like dusting on my skin, esp...
4               Psoriasis  My nails have small dents or pits in them, and...
...                   ...                                                ...
1252  Alcoholic hepatitis  symptoms such as confusion, fluid retention, a...
1253         Heart attack  Symptoms typically include chest pain or disco...
1254         Heart attack  This pain may radiate to the arms, neck, jaw, ...
1255      Gastroenteritis  Symptoms typically include diarrhea, which may...
1256      Gastroenteritis  symptoms such as headache, muscle aches, and f...

[1257 rows x 2 columns]>

In [6]:
df['label'] = df['label'].str.lower().str.strip()
df['text'] = df['text'].str.lower().str.strip()

In [7]:
import string
df['label'] = df['label'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
df['text'] = df['text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [8]:
from nltk.tokenize import word_tokenize
df['text'] = df['text'].apply(word_tokenize)

In [9]:
from nltk.corpus import stopwords

In [10]:
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])

In [11]:
from nltk.stem import WordNetLemmatizer

In [12]:
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [14]:
pip install textblob


Collecting textblobNote: you may need to restart the kernel to use updated packages.

  Using cached textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Using cached textblob-0.18.0.post0-py3-none-any.whl (626 kB)
Installing collected packages: textblob
Successfully installed textblob-0.18.0.post0


In [15]:
pip install pyenchant

Collecting pyenchantNote: you may need to restart the kernel to use updated packages.

  Using cached pyenchant-3.2.2-py3-none-win_amd64.whl.metadata (3.8 kB)
Using cached pyenchant-3.2.2-py3-none-win_amd64.whl (11.9 MB)
Installing collected packages: pyenchant
Successfully installed pyenchant-3.2.2


In [13]:
import enchant

In [14]:
spell_checker = enchant.Dict("en_US")
df['text'] = df['text'].apply(lambda x: [word for word in x if spell_checker.check(word)])

In [15]:
print(df.head(20))

        label                                               text
0   psoriasis  [experiencing, skin, rash, arm, leg, torso, pa...
1   psoriasis  [skin, peeling, especially, knee, elbow, scalp...
2   psoriasis  [experiencing, joint, pain, finger, wrist, kne...
3   psoriasis  [silver, like, dusting, skin, especially, lowe...
4   psoriasis  [nail, small, dent, pit, often, feel, inflamma...
5   psoriasis  [skin, palm, sol, thickened, deep, crack, crac...
6   psoriasis  [skin, around, mouth, nose, eye, red, inflamed...
7   psoriasis  [skin, sensitive, reacts, easily, change, temp...
8   psoriasis  [noticed, sudden, peeling, skin, different, pa...
9   psoriasis  [skin, genitals, red, inflamed, often, itchy, ...
10  psoriasis  [experienced, fatigue, general, feeling, malai...
11  psoriasis  [rash, skin, spread, part, body, including, ch...
12  psoriasis  [rash, skin, worse, winter, month, air, dry, f...
13  psoriasis  [experienced, difficulty, sleeping, due, itchi...
14  psoriasis  [skin, pro

In [16]:

print(df[['label', 'text']])


                    label                                               text
0               psoriasis  [experiencing, skin, rash, arm, leg, torso, pa...
1               psoriasis  [skin, peeling, especially, knee, elbow, scalp...
2               psoriasis  [experiencing, joint, pain, finger, wrist, kne...
3               psoriasis  [silver, like, dusting, skin, especially, lowe...
4               psoriasis  [nail, small, dent, pit, often, feel, inflamma...
...                   ...                                                ...
1252  alcoholic hepatitis  [symptom, confusion, fluid, retention, bleedin...
1253         heart attack  [symptom, typically, include, chest, pain, dis...
1254         heart attack  [pain, may, radiate, arm, neck, jaw, back, abd...
1255      gastroenteritis  [symptom, typically, include, diarrhea, may, w...
1256      gastroenteritis  [symptom, headache, muscle, ache, fatigue, may...

[1257 rows x 2 columns]


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=1000) 
X = tfidf_vectorizer.fit_transform(df['text'].astype(str))
y = df['label']

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=1000) 
X = tfidf_vectorizer.fit_transform(df['text'].astype(str))
y = df['label']


In [19]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [21]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')  
])


model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)


test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 31ms/step - accuracy: 0.0767 - loss: 3.6378 - val_accuracy: 0.1881 - val_loss: 3.5055
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2374 - loss: 3.3820 - val_accuracy: 0.3069 - val_loss: 2.9850
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4405 - loss: 2.6937 - val_accuracy: 0.7129 - val_loss: 2.2505
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8555 - loss: 1.9025 - val_accuracy: 0.7921 - val_loss: 1.5495
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9125 - loss: 1.1564 - val_accuracy: 0.8713 - val_loss: 1.0550
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9497 - loss: 0.6440 - val_accuracy: 0.8911 - val_loss: 0.7946
Epoch 7/10
[1m29/29[0m [32m━━━━━━━━━

In [22]:
input
user_input = "I've been grumpy and gloomy lately, and I've also noticed a change in my vision."
print("User Input (Before Preprocessing):", user_input)

def preprocess_text(text):
    if isinstance(text, str):  # (not NaN)
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return tokens
    else:
        return []
  
processed_input = preprocess_text(user_input)  
print("Processed Input:", processed_input)


if isinstance(processed_input, list):
    processed_input = ' '.join(processed_input)
    print("Processed Input (Joined):", processed_input)


user_input_vector = tfidf_vectorizer.transform([processed_input])


from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(user_input_vector, X) 

print("Similarity Scores:", similarity_scores)


matched_index = np.argmax(similarity_scores)
print("Matched Index:", matched_index)

matched_disease = y[matched_index]
print("Matched Disease:", matched_disease)


User Input (Before Preprocessing): I've been grumpy and gloomy lately, and I've also noticed a change in my vision.
Processed Input: ['I', "'ve", 'grumpy', 'gloomy', 'lately', ',', 'I', "'ve", 'also', 'noticed', 'change', 'vision', '.']
Processed Input (Joined): I 've grumpy gloomy lately , I 've also noticed change vision .
Similarity Scores: [[0.         0.         0.         ... 0.02455696 0.02518225 0.04224089]]
Matched Index: 707
Matched Disease: migraine


In [1]:
df_second = pd.read_csv('symptom_precaution.csv')
df_second

NameError: name 'pd' is not defined

In [24]:
df_second['Disease'] = df_second['Disease'].str.lower().str.strip()
df_second['Precaution_1'] = df_second['Precaution_1'].str.lower().str.strip()
df_second['Precaution_2'] = df_second['Precaution_2'].str.lower().str.strip()
df_second['Precaution_3'] = df_second['Precaution_3'].str.lower().str.strip()
df_second['Precaution_4'] = df_second['Precaution_4'].str.lower().str.strip()


In [25]:

df_second['Disease'] = df_second['Disease'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
df_second['Precaution_1'] = df_second['Precaution_1'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
df_second['Precaution_2'] = df_second['Precaution_2'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [26]:
df_second['Precaution_3'] = df_second['Precaution_3'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)
df_second['Precaution_4'] = df_second['Precaution_4'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)
# Handle NaN values

In [27]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np


# Tokenization 
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if isinstance(text, str):  # (not NaN)
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return tokens
    else:
        return []

df_second['Precaution_1_tokens'] = df_second['Precaution_1'].apply(preprocess_text)
df_second['Precaution_2_tokens'] = df_second['Precaution_2'].apply(preprocess_text)
df_second['Precaution_3_tokens'] = df_second['Precaution_3'].apply(preprocess_text)
df_second['Precaution_4_tokens'] = df_second['Precaution_4'].apply(preprocess_text)


df_second.fillna('', inplace=True)
df_second

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4,Precaution_1_tokens,Precaution_2_tokens,Precaution_3_tokens,Precaution_4_tokens
0,drug reaction,stop irritation,consult nearest hospital,stop taking drug,follow up,"[stop, irritation]","[consult, nearest, hospital]","[stop, taking, drug]",[follow]
1,malaria,consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out,"[consult, nearest, hospital]","[avoid, oily, food]","[avoid, non, veg, food]","[keep, mosquito]"
2,allergy,apply calamine,cover area with bandage,,use ice to compress itching,"[apply, calamine]","[cover, area, bandage]",[],"[use, ice, compress, itching]"
3,hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep,"[reduce, stress]",[exercise],"[eat, healthy]","[get, proper, sleep]"
4,psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths,"[wash, hand, warm, soapy, water]","[stop, bleeding, using, pressure]","[consult, doctor]","[salt, bath]"
5,gerd,avoid fatty spicy food,avoid lying down after eating,maintain healthy weight,exercise,"[avoid, fatty, spicy, food]","[avoid, lying, eating]","[maintain, healthy, weight]",[exercise]
6,chronic cholestasis,cold baths,anti itch medicine,consult doctor,eat healthy,"[cold, bath]","[anti, itch, medicine]","[consult, doctor]","[eat, healthy]"
7,hepatitis a,consult nearest hospital,wash hands through,avoid fatty spicy food,medication,"[consult, nearest, hospital]","[wash, hand]","[avoid, fatty, spicy, food]",[medication]
8,osteoarthristis,acetaminophen,consult nearest hospital,follow up,salt baths,[acetaminophen],"[consult, nearest, hospital]",[follow],"[salt, bath]"
9,vertigo paroymsal positional vertigo,lie down,avoid sudden change in body,avoid abrupt head movment,relax,[lie],"[avoid, sudden, change, body]","[avoid, abrupt, head, movment]",[relax]


In [28]:
print(df_second.columns)


Index(['Disease', 'Precaution_1', 'Precaution_2', 'Precaution_3',
       'Precaution_4', 'Precaution_1_tokens', 'Precaution_2_tokens',
       'Precaution_3_tokens', 'Precaution_4_tokens'],
      dtype='object')


In [29]:

matched_row = df_second[df_second['Disease'] == matched_disease]

if not matched_row.empty:
  
    precaution_1 = matched_row['Precaution_1'].values[0]
    precaution_2 = matched_row['Precaution_2'].values[0]
    precaution_3 = matched_row['Precaution_3'].values[0]
    precaution_4 = matched_row['Precaution_4'].values[0]

    print("Actions/Precautions for", matched_disease)
    print("1. ", precaution_1)
    print("2. ", precaution_2)
    print("3. ", precaution_3)
    print("4. ", precaution_4)
else:
    print("No actions or precautions found for", matched_disease)



Actions/Precautions for migraine
1.  meditation
2.  reduce stress
3.  use poloroid glasses in sun
4.  consult doctor


In [30]:
from sklearn.metrics.pairwise import cosine_similarity

def chatbot_response(user_input):
    user_input = user_input.lower().strip()
    
    if user_input == "hi" or user_input == "hello":
        return "Hello! How can I assist you today?"
    
    elif user_input == "bye" or user_input == "goodbye":
        return "Goodbye! Have a great day!"
    
    elif user_input == "how are you":
        return "I'm just a computer program, so I don't have feelings, but I'm here to assist you!"
    
    elif user_input == "thank you" or user_input == "thanks":
        return "You're welcome! If you have any more questions, feel free to ask."
    
    else:
      
        user_input_vector = tfidf_vectorizer.transform([user_input])
        similarity_scores = cosine_similarity(user_input_vector, X)
        matched_index = np.argmax(similarity_scores)
        
        if similarity_scores[0][matched_index] > 0.7:  
            matched_disease = y[matched_index]
            disease_prob = clf.predict_proba(user_input_vector)[0][np.argmax(clf.classes_ == matched_disease)]
           
            actions_available = matched_disease in df_second['Disease'].values
            if actions_available:
                return f"The matched disease for your description is: {matched_disease}. Do you want to know what actions/precautions you should take? (yes/no)"
            
            else:
                return f"The matched disease for your description is: {matched_disease}. However, no actions/precautions are available for this disease. Do you want to know what actions/precautions you should take? (yes/no)"
        
        else:
            return "Sorry, the disease corresponding to your description was not found in the dataset."


print("Chatbot: Hello! How can I assist you today? Type 'bye' to exit.")

while True:
    user_input = input("You: ")
    
    if user_input.lower().strip() == 'bye':
        print("Chatbot: Goodbye! Have a great day!")
        break
    
    response = chatbot_response(user_input)
    print("Chatbot:", response)

    if "actions/precautions" in response.lower():
        user_response = input("You: ")
        
        if user_response.lower().strip() == "yes":
         
            matched_row = df_second[df_second['Disease'] == matched_disease]
            
            if not matched_row.empty:
                precaution_1 = matched_row['Precaution_1'].values[0]
                precaution_2 = matched_row['Precaution_2'].values[0]
                precaution_3 = matched_row['Precaution_3'].values[0]
                precaution_4 = matched_row['Precaution_4'].values[0]

                print("Actions/Precautions for", matched_disease)
                print("1. ", precaution_1)
                print("2. ", precaution_2)
                print("3. ", precaution_3)
                print("4. ", precaution_4)
            
            else:
                print("No actions or precautions found for", matched_disease)
        
        elif user_response.lower().strip() == "no":
            print("Chatbot: Okay, if you have any other questions, feel free to ask.")
        
        else:
            print("Chatbot: I'm sorry, I didn't understand that. Please answer with 'yes' or 'no'.")

Chatbot: Hello! How can I assist you today? Type 'bye' to exit.


You:  hi


Chatbot: Hello! How can I assist you today?


You:  i have an issuse in mt teeth


Chatbot: Sorry, the disease corresponding to your description was not found in the dataset.


You:  My skin has been peeling, especially on my knees, elbows, and scalp. This peeling is often accompanied by a burning or stinging sensation.


NameError: name 'clf' is not defined