In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
nltk.download('stopwords')
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import warnings, gc

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
df = pd.read_csv('/content/Symptom2Disease.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [3]:
df = df.drop(columns=['Unnamed: 0'])

In [4]:
def clean_html(text):
  html = re.compile('<.*?>')
  return html.sub(r'',text)
def email_address(text):
  email = re.compile(r'[\w\.-]+@[\w\.-]+')
  return email.sub(r'',text)
def remove_(tweet):
  tweet = re.sub('([_]+)', "", tweet)
  return tweet
def remove_digits(text):
    pattern = r'[^a-zA-z.,!?/:;\"\'\s]'
    return re.sub(pattern, '', text)
def remove_links(tweet):
  tweet = re.sub(r'http\S+', '', tweet)
  tweet = re.sub(r'bit.ly/\S+', '', tweet)
  tweet = tweet.strip('[link]')
  return tweet
def clean_html(text):
  html = re.compile('<.*?>')
  return html.sub(r'',text)
def remove_special_characters(text):
    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)
def removeStopWords(str):
  cachedStopWords = set(stopwords.words("english"))
  cachedStopWords.update(('and','I','A','http','And','So','arnt','This','When','It','many','Many','so','cant','Yes','yes','No','no','These','these','mailto','regards','ayanna','like','email'))
  new_str = ' '.join([word for word in str.split() if word not in cachedStopWords])
  return new_str
def non_ascii(s):
  return "".join(i for i in s if ord(i)<128)
def punct(text):
  token=RegexpTokenizer(r'\w+')#regex
  text = token.tokenize(text)
  text= " ".join(text)
  return text
def non_ascii(s):
  return "".join(i for i in s if ord(i)<128)
def lower(text):
  return text.lower()

In [5]:
df = df.sample(frac=0.5).reset_index(drop=True)
df.head()

Unnamed: 0,label,text
0,Bronchial Asthma,I've been struggling with difficulty breathing...
1,Cervical spondylosis,"Back pain, a coughing cough, and numbness in m..."
2,Fungal infection,"Doctor, My skin is covered in an uncomfortable..."
3,peptic ulcer disease,My bowel motions have changed; they've been co...
4,Cervical spondylosis,"I have been suffering from back pain, a hackin..."


In [6]:
df.duplicated().sum()

10

In [7]:
df = df.drop_duplicates()

In [9]:
def clean_text(df,col):
    df[col] = df[col].apply(func=clean_html)
    df[col] = df[col].apply(func=email_address)
    df[col] = df[col].apply(func=remove_)
    df[col] = df[col].apply(func=remove_digits)
    df[col] = df[col].apply(func=remove_links)
    df[col] = df[col].apply(func=remove_special_characters)
    df[col] = df[col].apply(func=removeStopWords)
    df[col] = df[col].apply(func=non_ascii)
    df[col] = df[col].apply(func=punct)
    df[col] = df[col].apply(func=lower)
    return df

In [10]:
preprocessed_df = clean_text(df,'text')
preprocessed_df.head()

Unnamed: 0,label,text
0,Bronchial Asthma,i ve struggling difficulty breathing constant ...
1,Cervical spondylosis,back pain coughing cough numbness arms legs pl...
2,Fungal infection,doctor my skin covered uncomfortable rash odd ...
3,peptic ulcer disease,my bowel motions changed they ve constipated d...
4,Cervical spondylosis,suffering back pain hacking cough weakness arm...


In [11]:
from collections import  Counter
corpus = []
for x in df['text'].str.split():
    corpus.extend(x)
counter=Counter(corpus)
most=counter.most_common()
print(most[0:10])

[('i', 419), ('my', 313), ('ve', 294), ('skin', 194), ('also', 190), ('lot', 175), ('really', 161), ('pain', 135), ('m', 125), ('feel', 112)]


In [13]:
first_n = 25
x, y= [], []
for word,count in most[:first_n]:
        x.append(word)
        y.append(count)
print(f"{first_n} most frequently occurring words in symptom descriptions")

25 most frequently occurring words in symptom descriptions


In [14]:
stop=set(stopwords.words('english'))

In [15]:
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Tokenizacja
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

preprocessed_symptoms = preprocessed_df['text'].apply(preprocess_text)
preprocessed_symptoms.head()

0    struggling difficulty breathing constant cough...
1    back pain coughing cough numbness arms legs pl...
2    doctor skin covered uncomfortable rash odd pat...
3    bowel motions changed constipated diarrhoeal l...
4    suffering back pain hacking cough weakness arm...
Name: text, dtype: object

In [16]:
tfidf_vectorizer = TfidfVectorizer(max_features=1500)
tfidf_features = tfidf_vectorizer.fit_transform(preprocessed_symptoms).toarray()

X_train, X_test, y_train, y_test = train_test_split(tfidf_features, df['label'] , test_size=0.2, random_state=42)

knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

In [17]:
predictions = knn_classifier.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, predictions))

Accuracy: 0.85
                                 precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00         9
                      Arthritis       1.00      1.00      1.00         9
               Bronchial Asthma       0.88      1.00      0.93         7
           Cervical spondylosis       0.75      1.00      0.86         3
                    Chicken pox       0.40      0.50      0.44         4
                    Common Cold       0.57      1.00      0.73         4
                         Dengue       0.50      0.50      0.50         2
          Dimorphic Hemorrhoids       1.00      1.00      1.00         5
               Fungal infection       1.00      1.00      1.00         4
                   Hypertension       0.86      1.00      0.92         6
                       Impetigo       1.00      0.83      0.91         6
                       Jaundice       0.50      1.00      0.67         1
                        Malaria    

In [18]:
import joblib

joblib.dump(knn_classifier, 'knn_classifier.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']