In [3]:
import numpy as np
import pandas as pd
import re
import string
import nltk

In [4]:
df = pd.read_csv('FullDataset/full_dataset.csv')

In [5]:
df.head()

Unnamed: 0,id,content,label
0,3447,Chronic kidney disease (CKD) is a progressive ...,1
1,2951,Postural orthostatic tachycardia syndrome (POT...,1
2,5090,Vitamin B12 is a water-soluble vitamin that is...,1
3,3453,Diclofenac for day-care arthroscopy surgery: c...,0
4,5057,Attributes and survival patterns of multiple p...,0


In [6]:
df['content'][1]

"Postural orthostatic tachycardia syndrome (POTS) is a disorder that affects the autonomic nervous system. The autonomic nervous system controls involuntary functions such as heart rate, blood pressure, and sweating. People with POTS experience an increase in heart rate and blood pressure when they stand up. This can cause symptoms such as lightheadedness, fainting, and fatigue.\n\nObstructive sleep apnea (OSA) is a sleep disorder that occurs when a person's breathing is interrupted during sleep. This can happen when the muscles in the back of the throat relax and block the airway. OSA can cause a person to stop breathing for short periods of time, which can lead to low levels of oxygen in the blood.\n\nThere is some evidence that OSA may worsen the symptoms of POTS. One study found that people with POTS who also had OSA were more likely to experience lightheadedness and fainting than people with POTS who did not have OSA. Another study found that people with POTS who had OSA had lower

In [7]:
df['label'].value_counts()

1    5128
0    5128
Name: label, dtype: int64

In [8]:
df.isnull().sum()

id         0
content    0
label      0
dtype: int64

In [9]:
df.duplicated().sum()

0

In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df.duplicated().sum()

0

# Converting to lower case

In [12]:
df.head()
df['content'] = df['content'].str.lower()
df['content']

0        chronic kidney disease (ckd) is a progressive ...
1        postural orthostatic tachycardia syndrome (pot...
2        vitamin b12 is a water-soluble vitamin that is...
3        diclofenac for day-care arthroscopy surgery: c...
4        attributes and survival patterns of multiple p...
                               ...                        
10251    human t cell lymphotropic virus infection in g...
10252    chronic kidney disease (ckd) is a progressive ...
10253    obstructive sleep apnea (osa) is a common slee...
10254    obstructive sleep apnea (osa) and fibromyalgia...
10255    parascapular free flaps for head and neck reco...
Name: content, Length: 10256, dtype: object

# Removing URLs

In [13]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

df['content'] = df['content'].apply(lambda x: remove_url(x))
df['content']

0        chronic kidney disease (ckd) is a progressive ...
1        postural orthostatic tachycardia syndrome (pot...
2        vitamin b12 is a water-soluble vitamin that is...
3        diclofenac for day-care arthroscopy surgery: c...
4        attributes and survival patterns of multiple p...
                               ...                        
10251    human t cell lymphotropic virus infection in g...
10252    chronic kidney disease (ckd) is a progressive ...
10253    obstructive sleep apnea (osa) is a common slee...
10254    obstructive sleep apnea (osa) and fibromyalgia...
10255    parascapular free flaps for head and neck reco...
Name: content, Length: 10256, dtype: object

# Remove Punctuation

In [14]:
def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))

exclude = string.punctuation
df['content'] = df['content'].apply(lambda x: remove_punc(x))
df['content']

0        chronic kidney disease ckd is a progressive lo...
1        postural orthostatic tachycardia syndrome pots...
2        vitamin b12 is a watersoluble vitamin that is ...
3        diclofenac for daycare arthroscopy surgery com...
4        attributes and survival patterns of multiple p...
                               ...                        
10251    human t cell lymphotropic virus infection in g...
10252    chronic kidney disease ckd is a progressive co...
10253    obstructive sleep apnea osa is a common sleep ...
10254    obstructive sleep apnea osa and fibromyalgia a...
10255    parascapular free flaps for head and neck reco...
Name: content, Length: 10256, dtype: object

# Removing stop words

In [15]:
nltk.download('stopwords')

from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')

def remove_stopwords(text):
    new_text=[]
    for word in text.split():
        if word in english_stopwords:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)
   
df['content'] = df['content'].apply(lambda x: remove_stopwords(x))
df['content']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ajinkya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0        chronic kidney disease ckd   progressive loss ...
1        postural orthostatic tachycardia syndrome pots...
2        vitamin b12   watersoluble vitamin   essential...
3        diclofenac  daycare arthroscopy surgery compar...
4        attributes  survival patterns  multiple primar...
                               ...                        
10251    human  cell lymphotropic virus infection  guay...
10252    chronic kidney disease ckd   progressive condi...
10253    obstructive sleep apnea osa   common sleep dis...
10254    obstructive sleep apnea osa  fibromyalgia   co...
10255    parascapular free flaps  head  neck reconstruc...
Name: content, Length: 10256, dtype: object

# Remove digits

In [16]:
df['content'] = df['content'].apply(lambda text: re.sub(r'\d+', '', text))
df.head()

Unnamed: 0,id,content,label
0,3447,chronic kidney disease ckd progressive loss ...,1
1,2951,postural orthostatic tachycardia syndrome pots...,1
2,5090,vitamin b watersoluble vitamin essential ...,1
3,3453,diclofenac daycare arthroscopy surgery compar...,0
4,5057,attributes survival patterns multiple primar...,0


In [17]:
X = df.iloc[:,1:2]
y = df['label']

In [18]:
X

Unnamed: 0,content
0,chronic kidney disease ckd progressive loss ...
1,postural orthostatic tachycardia syndrome pots...
2,vitamin b watersoluble vitamin essential ...
3,diclofenac daycare arthroscopy surgery compar...
4,attributes survival patterns multiple primar...
...,...
10251,human cell lymphotropic virus infection guay...
10252,chronic kidney disease ckd progressive condi...
10253,obstructive sleep apnea osa common sleep dis...
10254,obstructive sleep apnea osa fibromyalgia co...


In [19]:
y

0        1
1        1
2        1
3        0
4        0
        ..
10251    0
10252    1
10253    1
10254    1
10255    0
Name: label, Length: 10256, dtype: int64

In [20]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

y = encoder.fit_transform(y)

In [21]:
y

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [22]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [23]:
X_train.shape

(8204, 1)

In [38]:
# Applying BoW
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
cv = CountVectorizer()

In [40]:
X_train_bow = cv.fit_transform(X_train['content']).toarray()
X_test_bow = cv.transform(X_test['content']).toarray()

In [61]:
X_train_bow.shape

(8204, 29682)

# Fitting GNB

In [36]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_bow,y_train)

In [37]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.9692982456140351

# Saving the model GNB

In [43]:
from joblib import dump,load

#Save model to the file
dump(gnb,filename="gaussianNB.joblib")

['gaussianNB.joblib']

In [38]:
confusion_matrix(y_test,y_pred)

array([[1007,   23],
       [  40,  982]], dtype=int64)

# Loading the model GNB

In [65]:
from joblib import load

# Load the model
gnb = load("gaussianNB.joblib")

# Testing GNB

In [95]:
health_content = "To alleviate headaches, try staying hydrated, resting in a quiet environment, managing stress, avoiding triggers like bright lights and loud noises, taking over-the-counter pain relievers as needed, applying heat or cold packs, gentle massage, keeping a headache diary to identify triggers, staying active with regular exercise, and seeking professional help if needed."

# Preprocess the input health content (assuming you have a function for preprocessing)
health_content = remove_punc(health_content)
health_content = remove_stopwords(health_content)
health_content.lower()
# Transform the preprocessed content into a Bag-of-Words representation
content_bow = cv.transform([health_content]).toarray()

# Use the trained Gaussian Naive Bayes model to make predictions
predicted_label = gnb.predict(content_bow)

# Print the predicted label
print("Predicted Label:", predicted_label)

Predicted Label: [1]


# Classification report GNB

In [67]:
from sklearn.metrics import classification_report
y_pred = gnb.predict(X_test_bow)
# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1030
           1       0.98      0.96      0.97      1022

    accuracy                           0.97      2052
   macro avg       0.97      0.97      0.97      2052
weighted avg       0.97      0.97      0.97      2052



# Fitting RF

In [49]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.9897660818713451

# Loading the model RF

In [35]:
from joblib import load

# Load the model
rf = load("randomForest.joblib")

# Testing model RF

In [71]:
health_content = "On examination, the signs exhibited may include pallor (pale skin, mucosa, conjunctiva and nail beds), but this is not a reliable sign. A blue coloration of the sclera may be noticed in some cases of iron-deficiency anemia.[24] There may be signs of specific causes of anemia, e.g. koilonychia (in iron deficiency), jaundice (when anemia results from abnormal break down of red blood cells – in hemolytic anemia), nerve cell damage (vitamin B12 deficiency), bone deformities (found in thalassemia major) or leg ulcers (seen in sickle-cell disease). In severe anemia, there may be signs of a hyperdynamic circulation: tachycardia (a fast heart rate), bounding pulse, flow murmurs, and cardiac ventricular hypertrophy (enlargement). There may be signs of heart failure. Pica, the consumption of non-food items such as ice, paper, wax, grass, hair or dirt, may be a symptom of iron deficiency;[25] although it occurs often in those who have normal levels of hemoglobin. Chronic anemia may result in behavioral disturbances in children as a direct result of impaired neurological development in infants, and reduced academic performance in children of school age. Restless legs syndrome is more common in people with iron-deficiency anemia than in the general population."

# Preprocess the input health content (assuming you have a function for preprocessing)
health_content = remove_punc(health_content)
health_content = remove_stopwords(health_content)
health_content.lower()
# Transform the preprocessed content into a Bag-of-Words representation
content_bow = cv.transform([health_content]).toarray()

# Use the trained Gaussian Naive Bayes model to make predictions
predicted_label = rf.predict(content_bow)

# Print the predicted label
print("Predicted Label:", predicted_label)

Predicted Label: [1]


# Saving model RF

In [55]:
dump(rf,filename="randomForest.joblib")

['randomForest.joblib']

# Classification report for RF

In [78]:
from sklearn.metrics import classification_report
y_pred = rf.predict(X_test_bow)
# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1030
           1       1.00      0.98      0.99      1022

    accuracy                           0.99      2052
   macro avg       0.99      0.99      0.99      2052
weighted avg       0.99      0.99      0.99      2052



# Confusion Matrix

In [79]:
confusion_matrix(y_test,y_pred)

array([[1027,    3],
       [  18, 1004]], dtype=int64)

# Fitting Multinomial Naive Bayes 

In [73]:
# Fitting Multinomial Naive Bayes 
from sklearn.naive_bayes import MultinomialNB
mnb_clf = MultinomialNB()
mnb_clf.fit(X_train_bow,y_train)
y_pred = mnb_clf.predict(X_test_bow)
accuracy_score(y_test,y_pred)


0.9897660818713451

# Classification report MNB

In [74]:
from sklearn.metrics import classification_report
# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1030
           1       1.00      0.98      0.99      1022

    accuracy                           0.99      2052
   macro avg       0.99      0.99      0.99      2052
weighted avg       0.99      0.99      0.99      2052



# Saving the model MNB

In [76]:
from joblib import dump
dump(mnb_clf,filename="maivebayes.joblib")

['maivebayes.joblib']

# Confusion matrix

In [77]:
confusion_matrix(y_test,y_pred)

array([[1026,    4],
       [  17, 1005]], dtype=int64)

# Testing MNB

In [93]:
health_content = "To alleviate headaches, try staying hydrated, resting in a quiet environment, managing stress, avoiding triggers like bright lights and loud noises, taking over-the-counter pain relievers as needed, applying heat or cold packs, gentle massage, keeping a headache diary to identify triggers, staying active with regular exercise, and seeking professional help if needed."
# Preprocess the input health content (assuming you have a function for preprocessing)
health_content = remove_punc(health_content)
health_content = remove_stopwords(health_content)
health_content.lower()
# Transform the preprocessed content into a Bag-of-Words representation
content_bow = cv.transform([health_content]).toarray()

# Use the trained Gaussian Naive Bayes model to make predictions
predicted_label = mnb_clf.predict(content_bow)

# Print the predicted label
print("Predicted Label:", predicted_label)

Predicted Label: [1]


## Using TfIdf RF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
tfidf = TfidfVectorizer()

In [26]:
X_train_tfidf = tfidf.fit_transform(X_train['content']).toarray()
X_test_tfidf = tfidf.transform(X_test['content'])

In [28]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)


NameError: name 'accuracy_score' is not defined

In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9907407407407407

# Saving the RF model

In [30]:
from joblib import dump,load

#Save model to the file
dump(rf,filename="randomForestTFIDF.joblib")

['randomForestTFIDF.joblib']

# Classification report RF

In [31]:
from sklearn.metrics import classification_report
# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1030
           1       0.99      0.99      0.99      1022

    accuracy                           0.99      2052
   macro avg       0.99      0.99      0.99      2052
weighted avg       0.99      0.99      0.99      2052



In [46]:
from joblib import load

# Load the model
rf_tdif = load("randomForestTFIDF.joblib")

# Testing RF

In [69]:
health_content = "Symptoms of anemia can come on quickly or slowly.[21] Early on there may be few or no symptoms.[21] If the anemia continues slowly (chronic), the body may adapt and compensate for this change. In this case, no symptoms may appear until the anemia becomes more severe."
# Preprocess the input health content (assuming you have a function for preprocessing)
health_content = remove_punc(health_content)
health_content = remove_stopwords(health_content)
health_content.lower()
# Transform the preprocessed content into a Bag-of-Words representation
content_bow = tfidf.transform([health_content]).toarray()

# Use the trained Gaussian Naive Bayes model to make predictions
predicted_label = rf_tdif.predict(content_bow)

# Print the predicted label
print("Predicted Label:", predicted_label)

Predicted Label: [0]
