In [1]:
import torch

In [28]:
from torch.utils.data import Dataset

In [2]:
import json
import numpy as np
import pandas as pd
import random
from wordcloud import WordCloud,STOPWORDS
import missingno as msno

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [4]:
from keras.preprocessing import text
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.callbacks import ReduceLROnPlateau

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer

In [6]:

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline
from transformers import DistilBertTokenizerFast
from transformers import BertForSequenceClassification, BertTokenizerFast
from transformers import TFDistilBertForSequenceClassification, TFTrainingArguments
from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig
from transformers import TrainingArguments, Trainer


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def load_json_file(filename):
    with open(filename) as f:
        file = json.load(f)
    return file

filename = 'intents.json'

intents = load_json_file(filename)

In [8]:
def create_df():
    df = pd.DataFrame({
        'Pattern' : [],
        'Tag' : []
    })
    
    return df

df = create_df()
df

Unnamed: 0,Pattern,Tag


In [9]:
def extract_json_info(json_file, df):
    
    for intent in json_file['intents']:
        
        for pattern in intent['patterns']:
            
            sentence_tag = [pattern, intent['tag']]
            df.loc[len(df.index)] = sentence_tag
                
    return df

df = extract_json_info(intents, df)
df.head()

Unnamed: 0,Pattern,Tag
0,Hello,greeting
1,Hi,greeting
2,Hey,greeting
3,Good morning,greeting
4,Good evening,greeting


In [10]:
df2 = df.copy()
df2.head()

Unnamed: 0,Pattern,Tag
0,Hello,greeting
1,Hi,greeting
2,Hey,greeting
3,Good morning,greeting
4,Good evening,greeting


In [11]:
def print_shape_df(df, ds_name="df"):
    print(f"{ds_name} dataset has {df.shape[0]} rows and {df.shape[1]} columns")
    
print_shape_df(df, "Chatbot")

Chatbot dataset has 81 rows and 2 columns


In [12]:
def print_dfInfo(df, ds_name="df"):
    print(f"The info of {ds_name} dataset\n")
    print(df.info())
    
print_dfInfo(df, "Chatbot")

The info of Chatbot dataset

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, 0 to 80
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Pattern  81 non-null     object
 1   Tag      81 non-null     object
dtypes: object(2)
memory usage: 1.9+ KB
None


In [13]:
def num_classes(df, target_col, ds_name="df"):
    print(f"The {ds_name} dataset has {len(df[target_col].unique())} classes")
    
num_classes(df, 'Tag', "Chatbot")

The Chatbot dataset has 21 classes


In [14]:
def check_null(df, ds_name='df'):
    print(f"Null Values in each col in the {ds_name} dataset:\n")
    print(df.isnull().sum())
    
check_null(df, "Chatbot")

Null Values in each col in the Chatbot dataset:

Pattern    0
Tag        0
dtype: int64


In [15]:
df2.head()

Unnamed: 0,Pattern,Tag
0,Hello,greeting
1,Hi,greeting
2,Hey,greeting
3,Good morning,greeting
4,Good evening,greeting


In [16]:
labels = df2['Tag'].unique().tolist()
labels = [s.strip() for s in labels]
labels

['greeting',
 'goodbye',
 'thanks',
 'help',
 'admission_info',
 'academic_calendar',
 'contact_info',
 'important_dates',
 'facility_info',
 'latest_news',
 'locate_results',
 'locate_admissions',
 'locate_academic_calendar',
 'locate_notices',
 'locate_contact_info',
 'notice_board',
 'job_openings',
 'tender_notices',
 'events',
 'student_welfare',
 'vision_mission']

In [17]:
num_labels = len(labels)
id2label = {id:label for id, label in enumerate(labels)}
label2id = {label:id for id, label in enumerate(labels)}

In [18]:
id2label

{0: 'greeting',
 1: 'goodbye',
 2: 'thanks',
 3: 'help',
 4: 'admission_info',
 5: 'academic_calendar',
 6: 'contact_info',
 7: 'important_dates',
 8: 'facility_info',
 9: 'latest_news',
 10: 'locate_results',
 11: 'locate_admissions',
 12: 'locate_academic_calendar',
 13: 'locate_notices',
 14: 'locate_contact_info',
 15: 'notice_board',
 16: 'job_openings',
 17: 'tender_notices',
 18: 'events',
 19: 'student_welfare',
 20: 'vision_mission'}

In [19]:
label2id

{'greeting': 0,
 'goodbye': 1,
 'thanks': 2,
 'help': 3,
 'admission_info': 4,
 'academic_calendar': 5,
 'contact_info': 6,
 'important_dates': 7,
 'facility_info': 8,
 'latest_news': 9,
 'locate_results': 10,
 'locate_admissions': 11,
 'locate_academic_calendar': 12,
 'locate_notices': 13,
 'locate_contact_info': 14,
 'notice_board': 15,
 'job_openings': 16,
 'tender_notices': 17,
 'events': 18,
 'student_welfare': 19,
 'vision_mission': 20}

In [20]:
df2['labels'] = df2['Tag'].map(lambda x: label2id[x.strip()])
df2.head()

Unnamed: 0,Pattern,Tag,labels
0,Hello,greeting,0
1,Hi,greeting,0
2,Hey,greeting,0
3,Good morning,greeting,0
4,Good evening,greeting,0


In [21]:
X = list(df2['Pattern'])
X[:5]

['Hello', 'Hi', 'Hey', 'Good morning', 'Good evening']

In [22]:
y = list(df2['labels'])
y[:5]

[0, 0, 0, 0, 0]

In [23]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 123)

In [24]:
model_name = "bert-base-uncased"
max_len = 256

tokenizer = BertTokenizer.from_pretrained(model_name, 
                                          max_length=max_len)

model = BertForSequenceClassification.from_pretrained(model_name, 
                                                      num_labels=num_labels, 
                                                      id2label=id2label, 
                                                      label2id = label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
train_encoding = tokenizer(X_train, truncation=True, padding=True)
test_encoding = tokenizer(X_test, truncation=True, padding=True)

In [26]:
full_data = tokenizer(X, truncation=True, padding=True)

In [29]:
class DataLoader(Dataset):
    
    def __init__(self, encodings, labels):
        
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
               
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):

        return len(self.labels)

In [30]:
train_dataloader = DataLoader(train_encoding, y_train)
test_dataloader = DataLoader(test_encoding, y_test)

In [31]:
fullDataLoader = DataLoader(full_data, y_test)

In [32]:
def compute_metrics(pred):

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [33]:
training_args = TrainingArguments(
    output_dir='./output', 
    do_train=True,
    do_eval=True,
    num_train_epochs=100,              
    per_device_train_batch_size=32,  
    per_device_eval_batch_size=16,
    warmup_steps=100,                
    weight_decay=0.05,
    logging_strategy='steps',
    logging_dir='./multi-class-logs',            
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps", 
    load_best_model_at_end=True
)

In [34]:
trainer = Trainer(
    model=model,
    args=training_args,                 
    train_dataset=train_dataloader,         
    eval_dataset=test_dataloader,            
    compute_metrics= compute_metrics
)

In [35]:
trainer.train()

 25%|██▌       | 50/200 [00:05<00:16,  9.06it/s]

{'loss': 2.8276, 'learning_rate': 2.5e-05, 'epoch': 25.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
                                                
 26%|██▌       | 51/200 [00:05<00:18,  7.94it/s]

{'eval_loss': 2.952202320098877, 'eval_Accuracy': 0.23809523809523808, 'eval_F1': 0.18055555555555555, 'eval_Precision': 0.17592592592592593, 'eval_Recall': 0.19444444444444445, 'eval_runtime': 0.034, 'eval_samples_per_second': 618.364, 'eval_steps_per_second': 58.892, 'epoch': 25.0}


 50%|█████     | 100/200 [00:11<00:11,  8.88it/s]

{'loss': 1.3507, 'learning_rate': 5e-05, 'epoch': 50.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
                                                 
 51%|█████     | 102/200 [00:11<00:11,  8.30it/s]

{'eval_loss': 2.2029531002044678, 'eval_Accuracy': 0.42857142857142855, 'eval_F1': 0.3407407407407408, 'eval_Precision': 0.34259259259259256, 'eval_Recall': 0.3611111111111111, 'eval_runtime': 0.0347, 'eval_samples_per_second': 604.503, 'eval_steps_per_second': 57.572, 'epoch': 50.0}


 75%|███████▌  | 150/200 [00:17<00:05,  9.18it/s]

{'loss': 0.1381, 'learning_rate': 2.5e-05, 'epoch': 75.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
                                                 
 76%|███████▌  | 151/200 [00:17<00:06,  8.16it/s]

{'eval_loss': 2.567797899246216, 'eval_Accuracy': 0.38095238095238093, 'eval_F1': 0.3333333333333333, 'eval_Precision': 0.3611111111111111, 'eval_Recall': 0.3333333333333333, 'eval_runtime': 0.0335, 'eval_samples_per_second': 627.581, 'eval_steps_per_second': 59.77, 'epoch': 75.0}


100%|██████████| 200/200 [00:22<00:00,  9.02it/s]

{'loss': 0.0402, 'learning_rate': 0.0, 'epoch': 100.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
                                                 
100%|██████████| 200/200 [00:22<00:00,  8.82it/s]

{'eval_loss': 2.666045665740967, 'eval_Accuracy': 0.38095238095238093, 'eval_F1': 0.3333333333333333, 'eval_Precision': 0.3611111111111111, 'eval_Recall': 0.3333333333333333, 'eval_runtime': 0.0418, 'eval_samples_per_second': 502.564, 'eval_steps_per_second': 47.863, 'epoch': 100.0}
{'train_runtime': 22.6882, 'train_samples_per_second': 264.454, 'train_steps_per_second': 8.815, 'train_loss': 1.0891263234615325, 'epoch': 100.0}





TrainOutput(global_step=200, training_loss=1.0891263234615325, metrics={'train_runtime': 22.6882, 'train_samples_per_second': 264.454, 'train_steps_per_second': 8.815, 'train_loss': 1.0891263234615325, 'epoch': 100.0})

In [36]:
q=[trainer.evaluate(eval_dataset=df2) for df2 in [train_dataloader, test_dataloader]]

pd.DataFrame(q, index=["train","test"]).iloc[:,:5]

100%|██████████| 4/4 [00:00<00:00, 43.49it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 2/2 [00:00<00:00, 164.06it/s]


Unnamed: 0,eval_loss,eval_Accuracy,eval_F1,eval_Precision,eval_Recall
train,0.026907,1.0,1.0,1.0,1.0
test,2.666046,0.380952,0.333333,0.361111,0.333333


In [37]:
def predict(text):
    
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to("cuda")
    outputs = model(**inputs)

    probs = outputs[0].softmax(1)
    pred_label_idx = probs.argmax()
    pred_label = model.config.id2label[pred_label_idx.item()]

    return probs, pred_label_idx, pred_label

In [38]:
import torch
print(torch.cuda.is_available())  # Should return True if CUDA is available
print(torch.__version__)  # To confirm the installed version


True
2.3.1+cu118


In [39]:
text = "Hello"
predict(text)

(tensor([[9.7125e-01, 3.1574e-03, 1.2318e-03, 2.0892e-03, 6.8326e-04, 6.0868e-04,
          1.1441e-03, 1.3019e-04, 1.2453e-03, 1.1416e-03, 2.2186e-03, 1.7078e-03,
          3.4948e-04, 2.3039e-03, 1.2067e-03, 2.6736e-03, 6.2429e-04, 1.3397e-03,
          2.6403e-03, 1.1542e-03, 1.1025e-03]], device='cuda:0',
        grad_fn=<SoftmaxBackward0>),
 tensor(0, device='cuda:0'),
 'greeting')

In [40]:
model_path = "chatbot"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('chatbot\\tokenizer_config.json',
 'chatbot\\special_tokens_map.json',
 'chatbot\\vocab.txt',
 'chatbot\\added_tokens.json')

In [42]:
model_path = "chatbot"


model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer= BertTokenizerFast.from_pretrained(model_path)
chatbot= pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [43]:
chatbot("Hello")

[{'label': 'greeting', 'score': 0.9712475538253784}]

In [44]:
def chat(chatbot):
    
    print("Chatbot: Hi! I am your virtual assistance,Feel free to ask, and I'll do my best to provide you with answers and assistance..")
    print("Type 'quit' to exit the chat\n\n")
    
    text = input("User: ").strip().lower()
    
    while(text != 'quit'):

        score = chatbot(text)[0]['score']
        
        if score < 0.8:
            print("Chatbot: Sorry I can't answer that\n\n")
            text = input("User: ").strip().lower()
            continue
        
        label = label2id[chatbot(text)[0]['label']]
        response = random.choice(intents['intents'][label]['responses'])
        
        print(f"Chatbot: {response}\n\n")
            
        text = input("User: ").strip().lower()

In [45]:
chat(chatbot)


Chatbot: Hi! I am your virtual assistance,Feel free to ask, and I'll do my best to provide you with answers and assistance..
Type 'quit' to exit the chat


