# 1. Installing & Importing Python Libraries

In [None]:
!pip3 install ktrain

In [None]:
#Data preprocessing libraries
import re
import string
import pandas
import os
import numpy as np
import pandas as pd
from collections import Counter
from sklearn import preprocessing

#Data visualization libraries
import seaborn as sns
import matplotlib.cm as cm
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from matplotlib import rcParams

#Natural language libraries for text
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import ktrain
from ktrain import text

#Suppress warnings to keep the notebook output readable
import warnings
warnings.filterwarnings("ignore")

# 2. Loading data

In [None]:
# Placeholder location of the Excel workbook; replace it with the real file path.
DATA_PATH = 'Copy Path of Binay-class or Multi-class data.xlsx'
data = pd.read_excel(DATA_PATH)

# 3. Exploratory data analysis

### 3.1. First five records of data

In [None]:
# Preview the first five rows.
data.head(n=5)

### 3.2. Last five records of data

In [None]:
# Preview the last five rows.
data.tail(n=5)

### 3.3. Columns/features in data

In [None]:
# Display the column labels of the loaded frame.
column_labels = data.columns
column_labels

### 3.4. Length of data

In [None]:
# Report the number of rows in the dataset.
print('length of data is', len(data))

### 3.5. Shape of data

In [None]:
# Display (rows, columns) of the frame.
row_count, column_count = data.shape
(row_count, column_count)

### 3.6. Data information

In [None]:
# Print the frame summary (index, columns, non-null counts, dtypes, memory use).
data.info(verbose=None)

### 3.7. Data types of all columns

In [None]:
# Display the dtype of every column.
column_types = data.dtypes
column_types

### 3.8. Checking Null Values

In [None]:
# Flag rows with at least one missing value, then preview the first few of them.
null_row_mask = data.isnull().any(axis=1)
data[null_row_mask].head()

### 3.9. Count of Null values

In [None]:
# Count the rows that contain at least one missing value.
data.isnull().any(axis=1).sum()

### 3.10. Count of all values in Label

In [None]:
# Keep the label column in `y` and show how often each label occurs.
y = data.loc[:, 'condition']
y.value_counts()

### 3.11. Class  distribution

In [None]:
# Pie chart of the (up to) twelve most frequent labels, with percentage annotations.
top_label_counts = data["condition"].value_counts().head(12)
pie_axes = top_label_counts.plot(kind='pie', autopct='%1.1f%%', figsize=(8, 8))
pie_axes.legend()

In [None]:
# Bar chart with one bar per label in the "condition" column.
sns.countplot(x="condition", data=data)
plt.show()

# 4. Tokenization

In [None]:
# Split each entry of "Review" on runs of word characters and keep the
# resulting token lists in a separate lowercase "review" column.
tokenizer = RegexpTokenizer(pattern=r'\w+')
data["review"] = data["Review"].map(tokenizer.tokenize)

In [None]:
# Preview the first five tokenized entries.
data["review"].head(n=5)

In [None]:
# Copy the tokenized text into 'Review'. The later cells iterate over
# data["Review"] token by token and re-join each entry with spaces, so it must
# hold token lists; assigning in the opposite direction would discard the tokens
# and leave raw strings, which are then processed character by character.
data['Review'] = data['review']
data

In [None]:
# Remove the helper "review" column and display the resulting frame.
data = data.drop("review", axis="columns")
data

# 5. Description of text information

In [None]:
# Flatten every entry of data["Review"] into one list and record each entry's length.
all_words = []
sentence_lengths = []
for review_tokens in data["Review"]:
    all_words.extend(review_tokens)
    sentence_lengths.append(len(review_tokens))

# Distinct items, sorted.
VOCAB = sorted(set(all_words))

print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
print("Max sentence length is %s" % max(sentence_lengths))

# 6. Class Labels Encoding

In [None]:
# Rename the label column from 'condition' to 'Class'.
data = data.rename({'condition': 'Class'}, axis='columns')

### 6.1. Encoding **Birth Control** as 0

In [None]:
# Replace the "Birth Control" label with the integer code 0.
class_column = data['Class']
data['Class'] = class_column.replace(to_replace={"Birth Control": 0})

### 6.2. Encoding **Depression** as 1

In [None]:
# Replace the "Depression" label with the integer code 1.
class_column = data['Class']
data['Class'] = class_column.replace(to_replace={"Depression": 1})

### 6.3. Encoding **Pain** as 2

In [None]:
# Replace the "Pain" label with the integer code 2.
class_column = data['Class']
data['Class'] = class_column.replace(to_replace={"Pain": 2})

### 6.4. Encoding **Anxiety** as 3

In [None]:
# Replace the "Anxiety" label with the integer code 3.
class_column = data['Class']
data['Class'] = class_column.replace(to_replace={"Anxiety": 3})

### 6.5. Encoding **Acne** as 4

In [None]:
# Replace the "Acne" label with the integer code 4.
class_column = data['Class']
data['Class'] = class_column.replace(to_replace={"Acne": 4})

### 6.6. Encoding **Bipolar Disorder** as 5

In [None]:
# Replace the "Bipolar Disorder" label with the integer code 5.
class_column = data['Class']
data['Class'] = class_column.replace(to_replace={"Bipolar Disorder": 5})

### 6.7. Encoding **Insomnia** as 6

In [None]:
# Replace the "Insomnia" label with the integer code 6.
class_column = data['Class']
data['Class'] = class_column.replace(to_replace={"Insomnia": 6})

### 6.8. Encoding **Weight Loss** as 7

In [None]:
# Replace the "Weight Loss" label with the integer code 7.
class_column = data['Class']
data['Class'] = class_column.replace(to_replace={"Weight Loss": 7})

### 6.9. Encoding **Obesity** as 8

In [None]:
# Replace the "Obesity" label with the integer code 8.
class_column = data['Class']
data['Class'] = class_column.replace(to_replace={"Obesity": 8})

### 6.10. Encoding **ADHD** as 9

In [None]:
# Replace the "ADHD" label with the integer code 9.
class_column = data['Class']
data['Class'] = class_column.replace(to_replace={"ADHD": 9})

### 6.11. Encoding **Bowel Preparation** as 10

In [None]:
# Replace the "Bowel Preparation" label with the integer code 10.
class_column = data['Class']
data['Class'] = class_column.replace(to_replace={"Bowel Preparation": 10})

### 6.12. Encoding **Emergency Contraception** as 11


In [None]:
# Replace the "Emergency Contraception" label with the integer code 11.
class_column = data['Class']
data['Class'] = class_column.replace(to_replace={"Emergency Contraception": 11})

### 6.13. Encoded Classes Count

In [None]:
# Show how many rows carry each encoded label.
encoded_labels = data['Class']
encoded_labels.value_counts()

# 7. Training BERT model

In [None]:
# Join the items of every "Review" entry with single spaces.
data['Review'] = data['Review'].apply(" ".join)

In [None]:
# Let ktrain split the frame into train/validation parts and apply BERT
# preprocessing, using 'Review' as the text column and 'Class' as the label.
train_split, test_split, preproc = text.texts_from_df(
    data,
    'Review',
    label_columns=['Class'],
    preprocess_mode='bert',
)
x_train, y_train = train_split
x_test, y_test = test_split

In [None]:
# Build the BERT text classifier from the preprocessed training data.
training_pair = (x_train, y_train)
model = text.text_classifier('bert', training_pair, preproc=preproc)

In [None]:
# Wrap the model together with its training/validation data in a ktrain learner.
learner_options = {
    "train_data": (x_train, y_train),
    "val_data": (x_test, y_test),
    "batch_size": 4,
}
Bert_Model = ktrain.get_learner(model=model, **learner_options)

In [None]:
# Train with the one-cycle policy using the hyper-parameters below.
LEARNING_RATE = 2e-5
NUM_EPOCHS = 4
Bert_Model.fit_onecycle(lr=LEARNING_RATE, epochs=NUM_EPOCHS)

# 8. Evaluation metrics (Accuracy, Precision, Recall and F1 Measures)

### 8.1. Binary Class

In [None]:
# Evaluate on the validation data, naming the two classes 0 and 1.
binary_class_ids = list(range(2))
Bert_Model.validate(class_names=binary_class_ids)

### 8.2. Multi Class

In [None]:
# Evaluate on the validation data, naming the twelve classes 0 through 11.
multi_class_ids = list(range(12))
Bert_Model.validate(class_names=multi_class_ids)