In [1]:
pip install pandas scikit-learn nltk flask

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv("symptom2disease.csv")

# Explore the dataset
print(df.head())  # Check the first few rows


   Unnamed: 0      label                                               text
0           0  Psoriasis  I have been experiencing a skin rash on my arm...
1           1  Psoriasis  My skin has been peeling, especially on my kne...
2           2  Psoriasis  I have been experiencing joint pain in my fing...
3           3  Psoriasis  There is a silver like dusting on my skin, esp...
4           4  Psoriasis  My nails have small dents or pits in them, and...


In [3]:
print(df.info())  # Check for missing values and data types


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1200 non-null   int64 
 1   label       1200 non-null   object
 2   text        1200 non-null   object
dtypes: int64(1), object(2)
memory usage: 28.3+ KB
None


In [4]:
print(df['label'].value_counts())  # Check the distribution of diseases

Psoriasis                          50
Varicose Veins                     50
peptic ulcer disease               50
drug reaction                      50
gastroesophageal reflux disease    50
allergy                            50
urinary tract infection            50
Malaria                            50
Jaundice                           50
Cervical spondylosis               50
Migraine                           50
Hypertension                       50
Bronchial Asthma                   50
Acne                               50
Arthritis                          50
Dimorphic Hemorrhoids              50
Pneumonia                          50
Common Cold                        50
Fungal infection                   50
Dengue                             50
Impetigo                           50
Chicken pox                        50
Typhoid                            50
diabetes                           50
Name: label, dtype: int64


In [None]:
#Step 3: Preprocess the Data

In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [6]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akshitajain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akshitajain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Text cleaning function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)


In [8]:
# Apply cleaning to the dataset
df['cleaned_text'] = df['text'].apply(clean_text)

In [9]:
# Check the cleaned text
print(df[['text', 'cleaned_text']].head())

                                                text  \
0  I have been experiencing a skin rash on my arm...   
1  My skin has been peeling, especially on my kne...   
2  I have been experiencing joint pain in my fing...   
3  There is a silver like dusting on my skin, esp...   
4  My nails have small dents or pits in them, and...   

                                        cleaned_text  
0  experiencing skin rash arms legs torso past we...  
1  skin peeling especially knees elbows scalp pee...  
2  experiencing joint pain fingers wrists knees p...  
3  silver like dusting skin especially lower back...  
4  nails small dents pits often feel inflammatory...  


In [None]:
#Step 4: Split the Data into Training and Testing Sets

In [10]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['label'], test_size=0.2, random_state=42)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))


Training set size: 960
Testing set size: 240


In [None]:
#Step 5: Vectorize the Text Data

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the training data
X_train_vec = vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_vec = vectorizer.transform(X_test)

print("Shape of training data:", X_train_vec.shape)
print("Shape of testing data:", X_test_vec.shape)

Shape of training data: (960, 1373)
Shape of testing data: (240, 1373)


In [30]:
# Train a Machine Learning Model

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Initialize the model
model = MultinomialNB()

# Train the model
model.fit(X_train_vec, y_train)


In [14]:

# Make predictions on the test set
y_pred = model.predict(X_test_vec)



In [15]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9541666666666667
Classification Report:
                                  precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00         7
                      Arthritis       1.00      1.00      1.00        10
               Bronchial Asthma       1.00      1.00      1.00        11
           Cervical spondylosis       1.00      1.00      1.00         7
                    Chicken pox       1.00      0.92      0.96        12
                    Common Cold       1.00      1.00      1.00        12
                         Dengue       0.83      0.83      0.83        12
          Dimorphic Hemorrhoids       0.78      1.00      0.88         7
               Fungal infection       1.00      1.00      1.00        13
                   Hypertension       1.00      1.00      1.00        10
                       Impetigo       1.00      1.00      1.00        11
                       Jaundice       1.00      1.00      1.00        

In [None]:
#Step 7: Save the Model and Vectorizer

In [16]:
import joblib

# Save the model
joblib.dump(model, 'symptom_checker_model.pkl')

# Save the vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [17]:
!pip install flask joblib



In [18]:
import joblib

# Load the model and vectorizer
model = joblib.load('symptom_checker_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

In [22]:
from flask import Flask, request, jsonify
import joblib
import threading

In [23]:
# Initialize the Flask app
app = Flask(__name__)

In [24]:
# Text cleaning function
def clean_text(text):
    import re
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)

In [25]:
# Define the prediction route
@app.route('/predict', methods=['POST'])
def predict():
    # Get user input from the request
    user_input = request.json['symptoms']
    
    # Clean and preprocess the input
    cleaned_input = clean_text(user_input)
    
    # Vectorize the input
    input_vec = vectorizer.transform([cleaned_input])
    
    # Make a prediction
    prediction = model.predict(input_vec)
    
    # Return the prediction as JSON
    return jsonify({'predicted_disease': prediction[0]})

# Function to run the Flask app
def run_flask_app():
    app.run(debug=True, use_reloader=False) 

# Start the Flask app in a separate thread
flask_thread = threading.Thread(target=run_flask_app)
flask_thread.start()

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


In [29]:
import requests

# Define the API endpoint
url = 'http://127.0.0.1:5000/predict'

# Define the input data
data = {
    'symptoms': 'There is bruising on my legs that I cannot explain. I can see strange blood vessels below the skin.'
}

# Send a POST request to the API
response = requests.post(url, json=data)

# Print the response
print(response.json())

127.0.0.1 - - [29/Jan/2025 16:40:36] "POST /predict HTTP/1.1" 200 -


{'predicted_disease': 'Varicose Veins'}
