# **Import Libraries**

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import sklearn

# Necessary Libraries for Data Preparation
import string
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Necessary Libraries for ML Models
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Necessary Libraries for Accuracy Measures
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Necessary Libraries for Deployment
import joblib

**Download NLTK resources**

In [2]:
# download the Punkt tokenizer models
nltk.download('punkt')

# download a list of common stopwords
nltk.download('stopwords')

# download the WordNet lexical database
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# **Read the Data**

In [3]:
data = pd.read_csv('Symptom2Disease.csv')

# **Understand and Clean the Data**

In [4]:
data

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been encountering a skin hasty on my ar...
1,1,Psoriasis,"My skin has been peeling, particularly on my k..."
2,2,Psoriasis,I have been encountering joint torment in my f...
3,3,Psoriasis,"There's a silver like cleaning on my skin, par..."
4,4,Psoriasis,"My nails have little marks or pits in them, an..."
...,...,...,...
3595,3595,diabetes,These strong desires and the need to urinate o...
3596,3596,diabetes,"I have trouble breathing, especially outside. ..."
3597,3597,diabetes,"I find it difficult to breathe, especially out..."
3598,3598,diabetes,I constantly sneeze and have a dry cough. My i...


In [5]:
# Drop the 'Unnamed: 0' column (it duplicates the row index).
# Rebinding `data` instead of mutating it in place, together with errors="ignore",
# keeps this cell safe to re-run: a second execution is a no-op, not a KeyError.
data = data.drop(columns = ["Unnamed: 0"], errors = "ignore")
data

Unnamed: 0,label,text
0,Psoriasis,I have been encountering a skin hasty on my ar...
1,Psoriasis,"My skin has been peeling, particularly on my k..."
2,Psoriasis,I have been encountering joint torment in my f...
3,Psoriasis,"There's a silver like cleaning on my skin, par..."
4,Psoriasis,"My nails have little marks or pits in them, an..."
...,...,...
3595,diabetes,These strong desires and the need to urinate o...
3596,diabetes,"I have trouble breathing, especially outside. ..."
3597,diabetes,"I find it difficult to breathe, especially out..."
3598,diabetes,I constantly sneeze and have a dry cough. My i...


In [6]:
# Concise summary of the DataFrame's structure and content
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   3600 non-null   object
 1   text    3600 non-null   object
dtypes: object(2)
memory usage: 56.4+ KB


In [7]:
data.columns

Index(['label', 'text'], dtype='object')

In [8]:
data.shape

(3600, 2)

In [9]:
# Count the number of unique values in each column
data.nunique()

label      24
text     3510
dtype: int64

In [10]:
data.value_counts().sum()

3600

In [11]:
# Check and Count null values
data.isnull().sum()

label    0
text     0
dtype: int64

In [12]:
# Check and Count duplicated values
data.duplicated().sum()

90

In [13]:
# Drop duplicated values
data.drop_duplicates(inplace = True)
data

Unnamed: 0,label,text
0,Psoriasis,I have been encountering a skin hasty on my ar...
1,Psoriasis,"My skin has been peeling, particularly on my k..."
2,Psoriasis,I have been encountering joint torment in my f...
3,Psoriasis,"There's a silver like cleaning on my skin, par..."
4,Psoriasis,"My nails have little marks or pits in them, an..."
...,...,...
3595,diabetes,These strong desires and the need to urinate o...
3596,diabetes,"I have trouble breathing, especially outside. ..."
3597,diabetes,"I find it difficult to breathe, especially out..."
3598,diabetes,I constantly sneeze and have a dry cough. My i...


# **Text Preprocessing--->(NLP)**

In [14]:
def lowercase_text(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_punctuation(text):
    """Delete every character in ``string.punctuation`` from ``text``.

    Punctuation is removed outright (not replaced by a space), so "it's"
    becomes "its". Leading and trailing whitespace is trimmed afterwards.
    """
    return text.translate(str.maketrans('', '', string.punctuation)).strip()

def tokenize_text(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the tokens."""
    return word_tokenize(text)

def remove_stopwords(tokens):
    """Keep only purely alphabetic tokens that are not English stop words.

    The stop-word set is built from NLTK's ``stopwords`` corpus on every call.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for tok in tokens:
        # Discard anything containing digits or symbols.
        if not tok.isalpha():
            continue
        # Discard stop words.
        if tok in english_stops:
            continue
        kept.append(tok)
    return kept

def lemmatize_text(tokens):
    """Lemmatize each token with a ``WordNetLemmatizer``.

    ``lemmatize`` is called with the token only (no part-of-speech argument).
    Returns a new list in the same order as ``tokens``.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

In [15]:
# Preprocessing Container function

def preprocess_text(text):
    """Run the full cleaning pipeline on one raw text string.

    Steps, in order: lower-case, strip punctuation, tokenize, drop stop words
    and non-alphabetic tokens, lemmatize, then re-join the tokens with single
    spaces.
    """
    cleaned = remove_punctuation(lowercase_text(text))
    tokens = lemmatize_text(remove_stopwords(tokenize_text(cleaned)))
    return ' '.join(tokens)

In [16]:
# Apply Preprocessing Container function to symptoms

data['text'] = data['text'].apply(preprocess_text)

In [17]:
# Extract and Count unique dictionary vocabs

def count_unique_vocab(count):
    """Return the number of distinct whitespace-separated words in ``count``.

    Despite its name, ``count`` is the collection of text strings to scan,
    not a number.
    """
    vocabulary = {word for text in count for word in text.split()}
    return len(vocabulary)

# Count unique dictionary vocabs
num_unique_vocabs = count_unique_vocab(data['text'])

print("Number of unique dictionary vocabs:", num_unique_vocabs)

Number of unique dictionary vocabs: 2275


# **Select the features (X) as 'text' column and target (y) as 'label' column**

In [18]:
X = data['text']
y = data['label']

In [19]:
X

0       encountering skin hasty arm leg middle past we...
1       skin peeling particularly knee elbow scalp pee...
2       encountering joint torment finger wrist knee t...
3       there silver like cleaning skin particularly l...
4       nail little mark pit frequently feel incendiar...
                              ...                        
3595    strong desire need urinate occur daily basis o...
3596    trouble breathing especially outside start fee...
3597    find difficult breathe especially outside heat...
3598    constantly sneeze dry cough infection dont see...
3599    dry cough sneeze lot palpitation infection don...
Name: text, Length: 3510, dtype: object

In [20]:
y

0       Psoriasis
1       Psoriasis
2       Psoriasis
3       Psoriasis
4       Psoriasis
          ...    
3595     diabetes
3596     diabetes
3597     diabetes
3598     diabetes
3599     diabetes
Name: label, Length: 3510, dtype: object

In [21]:
# Randomly reorder the rows of `data` (seeded so the order is reproducible).
# `shuffle` is already imported in the imports cell at the top of the notebook,
# so it is not re-imported here.
# NOTE(review): X and y were taken from `data` before this cell, so this shuffle
# does not change their order — confirm whether X / y should be derived after it.
data = shuffle(data, random_state = 42)
data

Unnamed: 0,label,text
3282,drug reaction,monthly cycle changed ive unexpected vaginal d...
315,Typhoid,ive dealing substantial bloating constipation ...
2756,urinary tract infection,getting blood pee sometimes get nauseous peein...
603,Impetigo,rash around nose expansive ruddy bruise taken ...
879,Dengue,ive headache muscular ache aching muscle get w...
...,...,...
1136,Common Cold,sinus feel stuffy eye quite red simply lack en...
1302,Pneumonia,im trouble breathing quite uneasy throat fille...
865,Dengue,day quite challenging due significant joint pa...
3597,diabetes,find difficult breathe especially outside heat...


In [22]:
# Characteristics of the data
info = data.describe().round()
info

Unnamed: 0,label,text
count,3510,3510
unique,24,3508
top,drug reaction,awoke morning see horrible rash skin several b...
freq,150,2


# **Splitting the Dataset into Train set and Test set**

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

# **Text Feature Extraction**

In [24]:
# Text feature extraction using TF-IDF vectorizer to transform text data
# (max_features=2400 caps the vocabulary size)
tfidf_vectorizer = TfidfVectorizer(max_features=2400)

# Transforming training and testing data
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
# NOTE: X_train / X_test are rebound here from text to dense arrays, so
# re-running this cell without first re-running the split cell feeds the
# arrays back into the vectorizer instead of the text.
X_train = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test = tfidf_vectorizer.transform(X_test).toarray()

In [25]:
def tfidf_vectorize_text(text_data, max_features=2400):
    """Fit a new ``TfidfVectorizer`` on ``text_data`` and return its matrix.

    Only the result of ``fit_transform`` is returned; the fitted vectorizer
    is local to this function and is not handed back to the caller.
    """
    return TfidfVectorizer(max_features=max_features).fit_transform(text_data)



---



###############################################
# ***Machine Learning Models***

###############################################

# **Decision Tree Classifier**


## **Applying Grid Search to find the best model version and the best hyperparameters**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Create a Decision Tree Classifier object
dt_classifier = DecisionTreeClassifier(random_state = 42)

# Define the hyperparameters and their possible values to search
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features' : ['sqrt', 'log2', None]
}

# Create the Grid Search object
grid_search = GridSearchCV(estimator = dt_classifier,
                           param_grid = parameters,
                           cv = 5,
                           scoring = 'accuracy',
                           n_jobs = -1)

# Fit the Grid Search to the train data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters found
best_hyperparameters = grid_search.best_params_
print("Best Hyperparameters:", best_hyperparameters)

# Get the best model version
best_dt_classifier = grid_search.best_estimator_
print(best_dt_classifier)

# Print the best accuracy found
best_accuracy = grid_search.best_score_
print(f'Best Accuracy: {best_accuracy*100:.2f} %')


Best Hyperparameters: {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
DecisionTreeClassifier(min_samples_split=10, random_state=42)
Best Accuracy: 87.39 %


## **Model Evaluation**

In [None]:
# Calculate and Compare the Score of train data and test data

train_score = best_dt_classifier.score(X_train, y_train)
test_score = best_dt_classifier.score(X_test, y_test)

# Print the scores
print(f'Training Score: {train_score*100:.2f} %')
print(f'Testing Score: {test_score*100:.2f} %')


Training Score: 0.98 %
Testing Score: 0.89 %


In [None]:
# Make Predictions on the train data and test data

train_predictions = best_dt_classifier.predict(X_train)
test_predictions = best_dt_classifier.predict(X_test)

In [None]:
# Calculate and Compare the Accuracy for training and testing data
from sklearn.metrics import accuracy_score

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

# Print the accuracies
print(f'Training Accuracy: {train_accuracy*100:.2f} %')
print(f'Testing Accuracy: {test_accuracy*100:.2f} %')


Training Accuracy: 0.98 %
Testing Accuracy: 0.89 %


In [None]:
# Make the Confusion Matrix
from sklearn.metrics import confusion_matrix

cm_1 = confusion_matrix(y_test, test_predictions)

# Print the Confusion Matrix
print(cm_1)

[[36  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 39  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 29  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0 34  1  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 25  0  6  0  1  0  2  0  0  0  0  2  1  0  0  4  0  0  0  0]
 [ 0  0  0  0  0 33  0  0  0  0  0  0  0  0  3  1  0  0  1  0  0  0  0  0]
 [ 0  1  0  2  8  0 26  0  0  0  0  0  1  1  0  0  0  0  2  1  0  1  0  0]
 [ 0  0  0  0  0  0  1 35  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2]
 [ 0  0  0  0  0  0  1  0 38  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 29  0  0  0  0  0  0  1  0  0  0  2  0  0  0]
 [ 0  0  0  0  2  0  0  0  0  0 32  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 33  0  0  0  0  0  0  0  0  0  0  0  1]
 [ 0  0  0  0  0  0  2  0  0  0  0  0 29  0  0  0  0  0  0  0  0  0  0  1]
 [ 0  0  0  0  2  0  0  0

## **Model Validation**

In [None]:
# Validation Test #
# Run one hand-written symptom description through the same pipeline used for
# training: clean -> TF-IDF transform -> predict.

# text_before = "The skin around my mouth, nose, and eyes is ruddy and kindled. It is regularly bothersome and awkward. There's a recognizable aggravation in my nails."

text_before = "The abdominal pain has been coming and going, and it's been really unpleasant. It's been accompanied by constipation and vomiting. I feel really concerned about my health."

# Cleaning
text_after = preprocess_text(text_before)

print(text_before)
print(text_after)

# Vectorization (reuses the vectorizer fitted on the training split).
# Kept under its own name so `text_after` stays the cleaned string.
sample_features = tfidf_vectorizer.transform([text_after]).toarray()

print(sample_features.reshape(-1,1))

# Prediction
# Stored separately so the test-set `test_predictions` used by the evaluation
# cells is not overwritten by this single-sample result.
sample_prediction = best_dt_classifier.predict(sample_features)

print(sample_prediction)

The abdominal pain has been coming and going, and it's been really unpleasant. It's been accompanied by constipation and vomiting. I feel really concerned about my health.
abdominal pain coming going really unpleasant accompanied constipation vomiting feel really concerned health
[[0.        ]
 [0.30318867]
 [0.        ]
 ...
 [0.        ]
 [0.        ]
 [0.        ]]
['Typhoid']


# **Random Forest Classifier**

Random Forest is an ensemble learning algorithm that works by constructing
multiple decision trees and combining their predictions.

## **Applying Grid Search to find the best model version and the best hyperparameters**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Create a Random Forest Classifier object
rf_classifier = RandomForestClassifier(random_state = 42)

# Define the hyperparameters and their possible values to search
parameters = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features' : ['sqrt', 'log2', None]
}

# Create the Grid Search object
grid_search = GridSearchCV(estimator = rf_classifier,
                           param_grid = parameters,
                           cv = 5,
                           scoring = 'accuracy',
                           n_jobs = -1)

# Fit the Grid Search to the train data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters found
best_hyperparameters = grid_search.best_params_
print("Best Hyperparameters:", best_hyperparameters)

# Get the best model version
best_rf_classifier = grid_search.best_estimator_
print(best_rf_classifier)

# Get the best accuracy found
best_accuracy = grid_search.best_score_
print(f'Best Accuracy: {best_accuracy*100:.2f} %')


Best Hyperparameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
RandomForestClassifier(criterion='entropy', max_features='log2',
                       min_samples_split=5, random_state=42)
Best Accuracy: 97.99 %


## **Model Evaluation**

In [None]:
# Calculate and Compare the Score of train data and test data

train_score = best_rf_classifier.score(X_train, y_train)
test_score = best_rf_classifier.score(X_test, y_test)

# Print the scores
print(f'Training Score: {train_score*100:.2f} %')
print(f'Testing Score: {test_score*100:.2f} %')


Training Score: 1.00 %
Testing Score: 0.98 %


In [None]:
# Make Predictions on the train data and test data

train_predictions = best_rf_classifier.predict(X_train)
test_predictions = best_rf_classifier.predict(X_test)

In [None]:
# Calculate and Compare the Accuracy for training and testing data
from sklearn.metrics import accuracy_score

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

# Print the accuracies
print(f'Training Accuracy: {train_accuracy*100:.2f} %')
print(f'Testing Accuracy: {test_accuracy*100:.2f} %')


Training Accuracy: 1.00 %
Testing Accuracy: 0.98 %


In [None]:
# Make the Confusion Matrix
from sklearn.metrics import confusion_matrix

cm_2 = confusion_matrix(y_test, test_predictions)

# Print the Confusion Matrix
print(cm_2)

[[36  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 39  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 31  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 36  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 33  0  3  0  0  0  2  0  0  0  0  2  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 38  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  2  0 40  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 38  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 40  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 32  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 34  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 34  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 32  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0

## **Model Validation**

In [None]:
# Validation Test #
# Run one hand-written symptom description through the same pipeline used for
# training: clean -> TF-IDF transform -> predict.

# text_before = "The skin around my mouth, nose, and eyes is ruddy and kindled. It is regularly bothersome and awkward. There's a recognizable aggravation in my nails."

text_before = "The abdominal pain has been coming and going, and it's been really unpleasant. It's been accompanied by constipation and vomiting. I feel really concerned about my health."

# Cleaning
text_after = preprocess_text(text_before)

print(text_before)
print(text_after)

# Vectorization (reuses the vectorizer fitted on the training split).
# Kept under its own name so `text_after` stays the cleaned string.
sample_features = tfidf_vectorizer.transform([text_after]).toarray()

print(sample_features.reshape(-1,1))

# Prediction
# Stored separately so the test-set `test_predictions` used by the evaluation
# cells is not overwritten by this single-sample result.
sample_prediction = best_rf_classifier.predict(sample_features)

print(sample_prediction)

The abdominal pain has been coming and going, and it's been really unpleasant. It's been accompanied by constipation and vomiting. I feel really concerned about my health.
abdominal pain coming going really unpleasant accompanied constipation vomiting feel really concerned health
[[0.        ]
 [0.30318867]
 [0.        ]
 ...
 [0.        ]
 [0.        ]
 [0.        ]]
['Typhoid']


# **Support Vector Machine (SVM)**

The primary purpose of an SVM is to find the hyperplane that best separates data points into different classes.

## **Applying Grid Search to find the best model version and the best hyperparameters**

In [3]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Create a Support Vector Machine Classifier object
svm_classifier = SVC(random_state = 42)

# Define the hyperparameters and their possible values to search
parameters = [{'C': [0.25, 0.5, 0.75, 1], 'kernel': ['linear']},
              {'C': [0.25, 0.5, 0.75, 1],'kernel': ['rbf'], 'gamma': ['scale', 'auto', 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}]

# Create the Grid Search object
grid_search = GridSearchCV(estimator = svm_classifier,
                           param_grid = parameters,
                           cv = 5,
                           scoring = 'accuracy',
                           n_jobs = -1)

# Fit the Grid Search to the train data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters found
best_hyperparameters = grid_search.best_params_
print("Best Hyperparameters:", best_hyperparameters)

# Get the best model version
best_svm_classifier = grid_search.best_estimator_
print(best_svm_classifier)

# Print the best accuracy found
best_accuracy = grid_search.best_score_
print(f'Best Accuracy: {best_accuracy*100:.2f} %')


NameError: name 'X_train' is not defined

In [4]:
# Save the tuned SVM to disk for deployment.
# `joblib` is already imported in the imports cell at the top of the notebook.
# The fitted TF-IDF vectorizer is saved alongside the model: the classifier was
# trained on the vectorizer's output, so new raw text has to go through that
# same fitted vectorizer before the reloaded model can score it.
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(best_svm_classifier, "model_SVM.pkl")

NameError: name 'best_svm_classifier' is not defined

In [6]:
# Load pretrained Model
# joblib.load unpickles the file, so only load model files from a trusted source.
# NOTE(review): `loaded_model` is not referenced by the later cells visible here,
# which keep using `best_svm_classifier` — confirm which one is intended.
loaded_model = joblib.load("model_SVM.pkl")
loaded_model

## **Model Evaluation**

In [27]:
# Calculate and Compare the Score of train data and test data

train_score = best_svm_classifier.score(X_train, y_train)
test_score = best_svm_classifier.score(X_test, y_test)

# Print the scores
print(f'Training Score: {train_score*100:.2f} %')
print(f'Testing Score: {test_score*100:.2f} %')


Training Score: 100.00 %
Testing Score: 99.43 %


In [28]:
# Make Predictions on the train data and test data

train_predictions = best_svm_classifier.predict(X_train)
test_predictions = best_svm_classifier.predict(X_test)

In [29]:
# Calculate and Compare the Accuracy for training and testing data
from sklearn.metrics import accuracy_score

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

# Print the accuracies
print(f'Training Accuracy: {train_accuracy*100:.2f} %')
print(f'Testing Accuracy: {test_accuracy*100:.2f} %')


Training Accuracy: 100.00 %
Testing Accuracy: 99.43 %


In [30]:
# Make the Confusion Matrix
from sklearn.metrics import confusion_matrix

cm_3 = confusion_matrix(y_test, test_predictions)

# Print the Confusion Matrix
print(cm_3)

[[36  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 39  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 31  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 36  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 38  0  1  0  0  0  0  0  0  0  0  1  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 38  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0 42  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 38  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 40  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 32  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 34  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 34  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 32  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0

## **Model Validation**

In [7]:
# Validation Test #
# Run one hand-written input through the same pipeline used for training:
# clean -> TF-IDF transform -> predict.
# The active input is gibberish, presumably to see what the model returns for
# text outside the training vocabulary.

# text_before = "The skin around my mouth, nose, and eyes is ruddy and kindled. It is regularly bothersome and awkward. There's a recognizable aggravation in my nails."

text_before = "wdwwdsvwsdfjwem;knl,,cod"

# Cleaning
text_after = preprocess_text(text_before)

print(text_before)
print(text_after)

# Vectorization (reuses the vectorizer fitted on the training split).
# Kept under its own name so `text_after` stays the cleaned string.
sample_features = tfidf_vectorizer.transform([text_after]).toarray()

print(sample_features.reshape(-1,1))

# Prediction
# Stored separately so the test-set `test_predictions` used by the evaluation
# cells is not overwritten by this single-sample result.
sample_prediction = best_svm_classifier.predict(sample_features)

print(sample_prediction)

NameError: name 'preprocess_text' is not defined

# **KNeighborsClassifier**
In the k-NN algorithm, the "k" represents the number of nearest neighbors considered for making predictions.


## **Applying Grid Search to find the best model version and the best hyperparameters**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Create a K-Neighbors Classifier object
knn_classifier = KNeighborsClassifier()

# Define the hyperparameters and their possible values to search
parameters = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]
}

# Create the Grid Search object
grid_search = GridSearchCV(estimator = knn_classifier,
                           param_grid = parameters,
                           cv = 5,
                           scoring = 'accuracy',
                           n_jobs = -1)

# Fit the Grid Search to the train data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters found
best_hyperparameters = grid_search.best_params_
print("Best Hyperparameters:", best_hyperparameters)

# Get the best model version
best_knn_classifier = grid_search.best_estimator_
print(best_knn_classifier)

# Print the best accuracy found
best_accuracy = grid_search.best_score_
print(f'Best Accuracy: {best_accuracy*100:.2f} %')


Best Hyperparameters: {'algorithm': 'auto', 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
KNeighborsClassifier(n_neighbors=3, weights='distance')
Best Accuracy: 97.83 %


 0.96542948 0.9737873         nan 0.96884578 0.96124554 0.96960479
        nan 0.96960623 0.95630833 0.96542228 0.96010917 0.96846843
 0.97074404 0.9783414  0.96922745 0.97188616 0.96542948 0.9737873
 0.96314668 0.96884578 0.96124554 0.96960479 0.96086531 0.96960623
 0.95630833 0.96542228 0.96010917 0.96846843 0.97074404 0.9783414
 0.96922745 0.97188616 0.96542948 0.9737873  0.96314668 0.96884578
 0.96124554 0.96960479 0.96086531 0.96960623 0.95630833 0.96542228
        nan 0.96846843 0.97074404 0.9783414         nan 0.97188616
 0.96542948 0.9737873         nan 0.96884578 0.96124554 0.96960479
        nan 0.96960623 0.95630833 0.96542228]


## **Model Evaluation**

In [None]:
# Calculate and Compare the Score of train data and test data

train_score = best_knn_classifier.score(X_train, y_train)
test_score = best_knn_classifier.score(X_test, y_test)

# Print the scores
print(f'Training Score: {train_score*100:.2f} %')
print(f'Testing Score: {test_score*100:.2f} %')


Training Score: 1.00 %
Testing Score: 0.98 %


In [None]:
# Make Predictions on the train data and test data

train_predictions = best_knn_classifier.predict(X_train)
test_predictions = best_knn_classifier.predict(X_test)

In [None]:
# Calculate and Compare the Accuracy for training and testing data
from sklearn.metrics import accuracy_score

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

# Print the accuracies
print(f'Training Accuracy: {train_accuracy*100:.2f} %')
print(f'Testing Accuracy: {test_accuracy*100:.2f} %')


Training Accuracy: 1.00 %
Testing Accuracy: 0.98 %


In [None]:
# Make the Confusion Matrix
from sklearn.metrics import confusion_matrix

cm_5 = confusion_matrix(y_test, test_predictions)

# Print the Confusion Matrix
print(cm_5)

[[36  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 39  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 31  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 36  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 35  0  2  0  1  0  1  0  0  0  0  1  1  0  0  0  0  0  0  0]
 [ 0  0  1  0  0 37  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  2  0 38  0  0  0  0  0  0  0  0  0  0  3  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 38  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 40  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  0  0 31  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 34  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 34  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 32  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0

## **Model Validation**

In [None]:
# Validation Test #
# Run one hand-written symptom description through the same pipeline used for
# training: clean -> TF-IDF transform -> predict.

# text_before = "The skin around my mouth, nose, and eyes is ruddy and kindled. It is regularly bothersome and awkward. There's a recognizable aggravation in my nails."

text_before = "The abdominal pain has been coming and going, and it's been really unpleasant. It's been accompanied by constipation and vomiting. I feel really concerned about my health."

# Cleaning
text_after = preprocess_text(text_before)

print(text_before)
print(text_after)

# Vectorization (reuses the vectorizer fitted on the training split).
# Kept under its own name so `text_after` stays the cleaned string.
sample_features = tfidf_vectorizer.transform([text_after]).toarray()

print(sample_features.reshape(-1,1))

# Prediction
# Stored separately so the test-set `test_predictions` used by the evaluation
# cells is not overwritten by this single-sample result.
sample_prediction = best_knn_classifier.predict(sample_features)

print(sample_prediction)

The abdominal pain has been coming and going, and it's been really unpleasant. It's been accompanied by constipation and vomiting. I feel really concerned about my health.
abdominal pain coming going really unpleasant accompanied constipation vomiting feel really concerned health
[[0.        ]
 [0.30318867]
 [0.        ]
 ...
 [0.        ]
 [0.        ]
 [0.        ]]
['Typhoid']
