In [1]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('FullDataset/final_dataset.csv')

In [3]:
df.head()

Unnamed: 0,id,content,label
0,1,Electromagnetic fields (EMFs) are invisible li...,1
1,2,Cryptosporidium infection in acquired immunode...,0
2,3,"In today's world, it is not uncommon to see p...",1
3,4,Tension pneumopericardium as a complication of...,0
4,5,Urgent care center pediatric telephone advice....,0


In [4]:
df['content'][1]

'Cryptosporidium infection in acquired immunodeficiency syndrome: not always a poor prognosis. Chronic diarrhea and malabsorption accompanied by simultaneous infection with the protozoa Giardia lamblia and Cryptosporidium occurred in a 22-year-old homosexual man with antibody to human immunodeficiency virus (HIV). Small bowel biopsy demonstrated total villous atrophy and marked mononuclear infiltration in the lamina propria simulating celiac disease. Treatment with metronidazole resulted in resolution of diarrhea, clearance of parasites, and marked improvement in small bowel histology. Although diarrhea and malabsorption in immunocompromised patients with cryptosporidiosis are regarded as ominous, our patient remained disease free for the next 3 years. Thus, infection with Cryptosporidium in patients with HIV does not always lead to intractable diarrhea or death. '

In [5]:
df['label'].value_counts()

0    11550
1    11497
Name: label, dtype: int64

In [6]:
df.isnull().sum()

id         0
content    0
label      0
dtype: int64

In [7]:
df.duplicated().sum()

0

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.duplicated().sum()

0

## Converting to lower case

In [10]:
df.head()
df['content'] = df['content'].str.lower()
df['content']

0        electromagnetic fields (emfs) are invisible li...
1        cryptosporidium infection in acquired immunode...
2         in today's world, it is not uncommon to see p...
3        tension pneumopericardium as a complication of...
4        urgent care center pediatric telephone advice....
                               ...                        
23042    the trauma triage rule: a new, resource-based ...
23043    penetrating keratoplasty for keratoconus: comp...
23044    identification of a melanoma progression antig...
23045    monoclonal antibody-purged autologous bone mar...
23046    warm heart surgery. hypothermia is widely ackn...
Name: content, Length: 23047, dtype: object

## Removing URLs

In [11]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

df['content'] = df['content'].apply(lambda x: remove_url(x))
df['content']

0        electromagnetic fields (emfs) are invisible li...
1        cryptosporidium infection in acquired immunode...
2         in today's world, it is not uncommon to see p...
3        tension pneumopericardium as a complication of...
4        urgent care center pediatric telephone advice....
                               ...                        
23042    the trauma triage rule: a new, resource-based ...
23043    penetrating keratoplasty for keratoconus: comp...
23044    identification of a melanoma progression antig...
23045    monoclonal antibody-purged autologous bone mar...
23046    warm heart surgery. hypothermia is widely ackn...
Name: content, Length: 23047, dtype: object

## Remove Punctuation

In [12]:
def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))

exclude = string.punctuation
df['content'] = df['content'].apply(lambda x: remove_punc(x))
df['content']

0        electromagnetic fields emfs are invisible line...
1        cryptosporidium infection in acquired immunode...
2         in todays world it is not uncommon to see peo...
3        tension pneumopericardium as a complication of...
4        urgent care center pediatric telephone advice ...
                               ...                        
23042    the trauma triage rule a new resourcebased app...
23043    penetrating keratoplasty for keratoconus compl...
23044    identification of a melanoma progression antig...
23045    monoclonal antibodypurged autologous bone marr...
23046    warm heart surgery hypothermia is widely ackno...
Name: content, Length: 23047, dtype: object

## Removing stop words

In [13]:
nltk.download('stopwords')

from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')

def remove_stopwords(text):
    new_text=[]
    for word in text.split():
        if word in english_stopwords:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)
   
df['content'] = df['content'].apply(lambda x: remove_stopwords(x))
df['content']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ajinkya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0        electromagnetic fields emfs  invisible lines  ...
1        cryptosporidium infection  acquired immunodefi...
2         todays world    uncommon  see people making p...
3        tension pneumopericardium   complication  sing...
4        urgent care center pediatric telephone advice ...
                               ...                        
23042     trauma triage rule  new resourcebased approac...
23043    penetrating keratoplasty  keratoconus complica...
23044    identification   melanoma progression antigen ...
23045    monoclonal antibodypurged autologous bone marr...
23046    warm heart surgery hypothermia  widely acknowl...
Name: content, Length: 23047, dtype: object

## Remove digits

In [14]:
df['content'] = df['content'].apply(lambda text: re.sub(r'\d+', '', text))
df.head()

Unnamed: 0,id,content,label
0,1,electromagnetic fields emfs invisible lines ...,1
1,2,cryptosporidium infection acquired immunodefi...,0
2,3,todays world uncommon see people making p...,1
3,4,tension pneumopericardium complication sing...,0
4,5,urgent care center pediatric telephone advice ...,0


In [15]:
X = df.iloc[:,1:2]
y = df['label']

In [16]:
X

Unnamed: 0,content
0,electromagnetic fields emfs invisible lines ...
1,cryptosporidium infection acquired immunodefi...
2,todays world uncommon see people making p...
3,tension pneumopericardium complication sing...
4,urgent care center pediatric telephone advice ...
...,...
23042,trauma triage rule new resourcebased approac...
23043,penetrating keratoplasty keratoconus complica...
23044,identification melanoma progression antigen ...
23045,monoclonal antibodypurged autologous bone marr...


In [17]:
y

0        1
1        0
2        1
3        0
4        0
        ..
23042    0
23043    0
23044    0
23045    0
23046    0
Name: label, Length: 23047, dtype: int64

In [18]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

y = encoder.fit_transform(y)

In [19]:
y

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

In [20]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [21]:
X_train.shape

(18437, 1)

In [46]:
# Applying BoW
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
cv = CountVectorizer()

In [48]:
X_train_bow = cv.fit_transform(X_train['content']).toarray()
X_test_bow = cv.transform(X_test['content']).toarray()

MemoryError: Unable to allocate 6.59 GiB for an array with shape (18437, 47986) and data type int64

In [25]:
X_train_bow.shape

(18437, 47986)

# Fitting GNB

In [36]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_bow,y_train)

In [37]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.9692982456140351

# Saving the model GNB

In [43]:
from joblib import dump,load

#Save model to the file
dump(gnb,filename="gaussianNB.joblib")

['gaussianNB.joblib']

In [38]:
confusion_matrix(y_test,y_pred)

array([[1007,   23],
       [  40,  982]], dtype=int64)

# Loading the model GNB

In [65]:
from joblib import load

# Load the model
gnb = load("gaussianNB.joblib")

# Testing GNB

In [95]:
health_content = "To alleviate headaches, try staying hydrated, resting in a quiet environment, managing stress, avoiding triggers like bright lights and loud noises, taking over-the-counter pain relievers as needed, applying heat or cold packs, gentle massage, keeping a headache diary to identify triggers, staying active with regular exercise, and seeking professional help if needed."

# Preprocess the input health content (assuming you have a function for preprocessing)
health_content = remove_punc(health_content)
health_content = remove_stopwords(health_content)
health_content.lower()
# Transform the preprocessed content into a Bag-of-Words representation
content_bow = cv.transform([health_content]).toarray()

# Use the trained Gaussian Naive Bayes model to make predictions
predicted_label = gnb.predict(content_bow)

# Print the predicted label
print("Predicted Label:", predicted_label)

Predicted Label: [1]


# Classification report GNB

In [67]:
from sklearn.metrics import classification_report
y_pred = gnb.predict(X_test_bow)
# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1030
           1       0.98      0.96      0.97      1022

    accuracy                           0.97      2052
   macro avg       0.97      0.97      0.97      2052
weighted avg       0.97      0.97      0.97      2052



# Fitting RF

In [27]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.9917570498915401

# Loading the model RF

In [44]:
import pickle

# Load the model
file = open("randomForestBoW",'rb')
rf = pickle.load(file)

# Testing model RF

In [49]:
health_content = "Globally, corona virus (COVID19) pandemic has become the most significant crisis to challenge the health, economy and the wellbeing of the humans affecting nearly all the countries."

# Preprocess the input health content (assuming you have a function for preprocessing)
health_content = remove_punc(health_content)
health_content = remove_stopwords(health_content)
health_content.lower()
# Transform the preprocessed content into a Bag-of-Words representation
content_bow = cv.transform([health_content]).toarray()

# Use the trained Gaussian Naive Bayes model to make predictions
predicted_label = rf.predict(content_bow)

# Print the predicted label
print("Predicted Label:", predicted_label)

Predicted Label: [0]


# Saving model RF

In [29]:
import pickle
pickle.dump(rf,open('randomForestBoW','wb'))

# Classification report for RF

In [45]:
y_pred = rf.predict(X_test_bow)
# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(report)


NameError: name 'X_test_bow' is not defined

# Recall, percision, F1-Score

In [None]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score
)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1score = f1_score(y_test, y_pred)
 
print(f"Precision = {precision.round(4)}")
print(f"Recall = {recall.round(4)}")
print(f"F1 Score = {f1score.round(4)}")

# Confusion Matrix

In [50]:
confusion_matrix(y_test,y_pred)

array([[2259,   11],
       [  27, 2313]], dtype=int64)

# Fitting Multinomial Naive Bayes 

In [73]:
# Fitting Multinomial Naive Bayes 
from sklearn.naive_bayes import MultinomialNB
mnb_clf = MultinomialNB()
mnb_clf.fit(X_train_bow,y_train)
y_pred = mnb_clf.predict(X_test_bow)
accuracy_score(y_test,y_pred)


0.9897660818713451

# Classification report MNB

In [74]:
from sklearn.metrics import classification_report
# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1030
           1       1.00      0.98      0.99      1022

    accuracy                           0.99      2052
   macro avg       0.99      0.99      0.99      2052
weighted avg       0.99      0.99      0.99      2052



# Saving the model MNB

In [76]:
from joblib import dump
dump(mnb_clf,filename="maivebayes.joblib")

['maivebayes.joblib']

# Confusion matrix

In [77]:
confusion_matrix(y_test,y_pred)

array([[1026,    4],
       [  17, 1005]], dtype=int64)

# Testing MNB

In [93]:
health_content = "To alleviate headaches, try staying hydrated, resting in a quiet environment, managing stress, avoiding triggers like bright lights and loud noises, taking over-the-counter pain relievers as needed, applying heat or cold packs, gentle massage, keeping a headache diary to identify triggers, staying active with regular exercise, and seeking professional help if needed."
# Preprocess the input health content (assuming you have a function for preprocessing)
health_content = remove_punc(health_content)
health_content = remove_stopwords(health_content)
health_content.lower()
# Transform the preprocessed content into a Bag-of-Words representation
content_bow = cv.transform([health_content]).toarray()

# Use the trained Gaussian Naive Bayes model to make predictions
predicted_label = mnb_clf.predict(content_bow)

# Print the predicted label
print("Predicted Label:", predicted_label)

Predicted Label: [1]


## Using TfIdf RF

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
tfidf = TfidfVectorizer()

In [24]:
X_train_tfidf = tfidf.fit_transform(X_train['content']).toarray()
X_test_tfidf = tfidf.transform(X_test['content'])

In [25]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)


In [26]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9950108459869849

# Saving the RF model

In [27]:
import pickle
pickle.dump(rf,open('randomForestTFIDF','wb'))

# Classification report RF

In [28]:
from sklearn.metrics import classification_report
# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2270
           1       1.00      0.99      1.00      2340

    accuracy                           1.00      4610
   macro avg       0.99      1.00      1.00      4610
weighted avg       1.00      1.00      1.00      4610



# Recall, precision, F1-score

In [30]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score
)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1score = f1_score(y_test, y_pred)
 
print(f"Precision = {precision.round(4)}")
print(f"Recall = {recall.round(4)}")
print(f"F1 Score = {f1score.round(4)}")

Precision = 0.9966
Recall = 0.9936
F1 Score = 0.9951


# Load RF-TFIDF model

In [46]:
from joblib import load

# Load the model
rf_tdif = load("randomForestTFIDF.joblib")

# Conf Matrix

In [31]:
confusion_matrix(y_test,y_pred)

array([[2262,    8],
       [  15, 2325]], dtype=int64)

# Testing RF

In [41]:
health_content ='''If you are concerned about a fever, especially a high fever (over 103°F for adults or 100.4°F foThere is no cure for the common cold or flu, but there are ways to manage the symptoms and feel better while your body recovers. Here's the gist:
Get plenty of rest.
Stay hydrated with fluids like water, broth, or tea.
Over-the-counter medications can help with specific symptoms like congestion, fever, or sore throat.
Home remedies like salt water gargles, a humidifier, or warm compresses may offer relief.
If your symptoms are severe or don't improve, consult a doctor.r infants and toddlers), or if it is accompanied by other concerning symptoms,  it is best to see a doctor.'''
# Preprocess the input health content (assuming you have a function for preprocessing)
health_content = remove_punc(health_content)
health_content = remove_stopwords(health_content)
health_content.lower()
# Transform the preprocessed content into a Bag-of-Words representation
content_bow = tfidf.transform([health_content]).toarray()

# Use the trained Gaussian Naive Bayes model to make predictions
predicted_label = rf.predict(content_bow)

# Print the predicted label
print("Predicted Label:", predicted_label)

Predicted Label: [1]
