In [58]:
# Import dependencies
from pymongo import MongoClient
from pprint import pprint

In [59]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoTokenizer

# NOTE: this BERT tokenizer is only a placeholder; `tokenizer` is rebound to a
# DistilBertTokenizer in the PREPROCESSING section before it is first used.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # Example using BERT


import json 

In [60]:
# Create a MongoClient for the MongoDB server listening on port 27017
# (no host is given, so pymongo's default host is used)
mongo = MongoClient(port=27017)

In [61]:
# Confirm that the "FAQ" database (the one used below) is present in MongoDB
print(mongo.list_database_names())

['FAQ', 'admin', 'config', 'covid_db', 'eye_deseases', 'fruits_db', 'local', 'metrolinx', 'petsitly_marketing']


In [62]:
# Keep a handle to the "FAQ" database; all later queries go through `db`
db = mongo['FAQ']

In [63]:
# List the collections in the "FAQ" database (the 'questions' collection is read below)
print(db.list_collection_names())

['questions']


In [64]:
# Access the collection that stores the FAQ documents
questions_collection = db['questions']

# Find all documents in the collection
documents = questions_collection.find()

# Parallel lists kept index-aligned:
#   texts[i]   - the i-th unique question
#   answers[i] - the answer stored with that question
#   labels[i]  - the class id used for training, always equal to i
texts = []
labels = []
answers = []

# Set mirror of `texts`, so duplicate detection is a constant-time lookup
# instead of a linear scan of the list for every question.
seen_questions = set()

# Each document carries a 'questions' array of {'question', 'answer'} entries
for document in documents:
    for question_info in document['questions']:
        question_text = question_info['question']

        # Keep only the first occurrence of each question so that every
        # label maps to exactly one question/answer pair.
        if question_text in seen_questions:
            continue
        seen_questions.add(question_text)

        # The label is the position the question is about to take in `texts`
        labels.append(len(texts))
        texts.append(question_text)
        answers.append(question_info['answer'])

In [94]:
# Spot-check every second entry from index 10 to 18: label, question and answer
for i in range(10,20,2):
    print(f"Label: {labels[i]}, Text: {texts[i]}, Answers:{answers[i]}")

Label: 10, Text: What are the hospital's operating hours for inquiries and appointments?, Answers:The specific operating hours of the hospital for inquiries and appointments may vary. It is advisable to contact the hospital during their working hours, typically from 9:00 am to 3:00 pm (Monday to Friday) and 9:00 am to 1:00 pm (Saturday).
Label: 12, Text: Is the hospital reachable via public transportation, and are there any landmarks nearby?, Answers:Yes, the hospital is reachable via public transportation such as buses, taxis, and other modes of travel. The hospital's location at 157/F Nilgunj Road, Panihati, Sodepur, Kolkata, can serve as a reference point for reaching the hospital.
Label: 14, Text: Can I receive directions to the hospital using GPS or navigation apps?, Answers:Yes, you can use GPS or navigation apps to receive directions to the hospital's address: 157/F Nilgunj Road, Panihati, Sodepur, Kolkata, Pin: 700114, West Bengal.
Label: 16, Text: How can I inquire about speci

# PREPROCESSING

In [66]:
# DistilBertTokenizer is already imported in the imports cell; re-importing is harmless
from transformers import DistilBertTokenizer

# Checkpoint name shared by this tokenizer and the classification model loaded later
model_name = 'distilbert-base-uncased'
# Rebinds `tokenizer`, replacing the BERT placeholder created with the imports
tokenizer = DistilBertTokenizer.from_pretrained(model_name)


In [67]:
!pip install torch torchvision torchaudio



In [68]:
# Tokenize every question in one call: pad to a common length, truncate
# over-long questions, and return PyTorch tensors.
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
# `labels` is left as a plain Python list; SentimentDataset converts each
# label to a tensor when an item is fetched.

In [69]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with integer class labels.

    Args:
        input_ids: Indexable of token-id rows, one row per example.
        attention_mask: Indexable of attention-mask rows aligned with
            ``input_ids``.
        labels: Indexable of class ids, one per example (list or tensor).

    Raises:
        ValueError: If the three inputs do not all have the same length.
    """

    def __init__(self, input_ids, attention_mask, labels):
        # Misaligned inputs would otherwise pair examples with the wrong
        # label or fail much later inside the DataLoader.
        if not (len(input_ids) == len(attention_mask) == len(labels)):
            raise ValueError(
                "input_ids, attention_mask and labels must have the same length, "
                f"got {len(input_ids)}, {len(attention_mask)} and {len(labels)}"
            )
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the model inputs and label of example ``idx`` as a dict."""
        label = self.labels[idx]
        if isinstance(label, torch.Tensor):
            # torch.tensor(tensor) emits a copy-construct warning; clone instead.
            label = label.clone().detach().to(torch.long)
        else:
            # Class ids must be int64 for cross-entropy style losses.
            label = torch.tensor(label, dtype=torch.long)
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': label
        }

# Create the dataset from the tokenized questions and their integer labels
# (the DataLoader is built in the next cell)
dataset = SentimentDataset(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    labels=labels
)

In [70]:
# Serve the dataset in shuffled mini-batches of 16 examples
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

PREPARE MODEL

In [71]:
# One output class per stored question: `labels` holds one id per unique
# question, so len(labels) is the number of classes.
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels = len(labels))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
# AdamW over all model parameters with a learning rate of 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Cross-entropy between the classifier logits and the integer question ids
criterion = torch.nn.CrossEntropyLoss()

TRAIN

In [73]:
from tqdm import tqdm

num_epochs = 12

# NOTE: this notebook has no separate validation split. Every question is its
# own class (one example per label), so the "validation" pass below re-scores
# the training dataloader with the model in eval mode. Its metrics show how
# well the questions have been memorised, not how well the model generalises.

for epoch in range(num_epochs):
    # from_pretrained() hands the model back in eval mode and the validation
    # pass at the end of every epoch calls model.eval(), so training mode
    # (dropout enabled) has to be switched on explicitly each epoch.
    model.train()

    total_loss = 0
    total_correct = 0
    total_samples = 0

    # Training loop
    progress_bar = tqdm(enumerate(dataloader, 1), total=len(dataloader))
    for step, batch in progress_bar:
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = criterion(outputs.logits, batch['labels'])
        loss.backward()
        optimizer.step()

        # Calculate accuracy for this batch
        predictions = torch.argmax(outputs.logits, dim=1)
        correct = (predictions == batch['labels']).sum().item()
        total_correct += correct
        total_samples += len(batch['labels'])

        # Accumulate loss
        total_loss += loss.item()

        progress_bar.set_description(f'Epoch {epoch+1}/{num_epochs}, Step {step}/{len(dataloader)}')
        progress_bar.set_postfix({'Loss': loss.item(), 'Accuracy': correct / len(batch['labels'])})

    # Training statistics for the epoch: mean batch loss and overall accuracy
    epoch_loss = total_loss / len(dataloader)
    epoch_accuracy = total_correct / total_samples

    # Evaluation pass over the same data with dropout disabled and no gradients
    model.eval()
    val_total_loss = 0
    val_total_correct = 0
    val_total_samples = 0

    with torch.no_grad():
        for val_step, val_batch in enumerate(dataloader, 1):
            val_outputs = model(**val_batch)
            val_loss = criterion(val_outputs.logits, val_batch['labels'])

            # Calculate accuracy
            val_predictions = torch.argmax(val_outputs.logits, dim=1)
            val_correct = (val_predictions == val_batch['labels']).sum().item()
            val_total_correct += val_correct
            val_total_samples += len(val_batch['labels'])

            # Accumulate loss
            val_total_loss += val_loss.item()

    # Evaluation statistics for the epoch
    val_epoch_loss = val_total_loss / len(dataloader)
    val_epoch_accuracy = val_total_correct / val_total_samples

    # Print statistics for the epoch
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}, Val Loss: {val_epoch_loss:.4f}, Val Accuracy: {val_epoch_accuracy:.4f}')

Epoch 1/12, Step 27/27: 100%|█| 27/27 [01:40<00:00,  3.73s/it, Loss=6.1, Accurac


Epoch 1/12, Loss: 6.0769, Accuracy: 0.0023, Val Loss: 6.0554, Val Accuracy: 0.0023


Epoch 2/12, Step 27/27: 100%|█| 27/27 [01:46<00:00,  3.94s/it, Loss=6.04, Accura


Epoch 2/12, Loss: 6.0543, Accuracy: 0.0047, Val Loss: 6.0329, Val Accuracy: 0.0116


Epoch 3/12, Step 27/27: 100%|█| 27/27 [01:50<00:00,  4.10s/it, Loss=6.02, Accura


Epoch 3/12, Loss: 6.0278, Accuracy: 0.0116, Val Loss: 5.9813, Val Accuracy: 0.0349


Epoch 4/12, Step 27/27: 100%|█| 27/27 [01:52<00:00,  4.17s/it, Loss=5.99, Accura


Epoch 4/12, Loss: 5.9689, Accuracy: 0.0279, Val Loss: 5.8904, Val Accuracy: 0.0884


Epoch 5/12, Step 27/27: 100%|█| 27/27 [01:51<00:00,  4.12s/it, Loss=5.88, Accura


Epoch 5/12, Loss: 5.8719, Accuracy: 0.0744, Val Loss: 5.7772, Val Accuracy: 0.2186


Epoch 6/12, Step 27/27: 100%|█| 27/27 [01:41<00:00,  3.76s/it, Loss=5.81, Accura


Epoch 6/12, Loss: 5.7586, Accuracy: 0.2070, Val Loss: 5.6544, Val Accuracy: 0.3930


Epoch 7/12, Step 27/27: 100%|█| 27/27 [01:41<00:00,  3.76s/it, Loss=5.61, Accura


Epoch 7/12, Loss: 5.6344, Accuracy: 0.4000, Val Loss: 5.5295, Val Accuracy: 0.5953


Epoch 8/12, Step 27/27: 100%|█| 27/27 [01:41<00:00,  3.77s/it, Loss=5.44, Accura


Epoch 8/12, Loss: 5.5098, Accuracy: 0.5744, Val Loss: 5.4039, Val Accuracy: 0.7535


Epoch 9/12, Step 27/27: 100%|█| 27/27 [01:41<00:00,  3.76s/it, Loss=5.44, Accura


Epoch 9/12, Loss: 5.3857, Accuracy: 0.7256, Val Loss: 5.2781, Val Accuracy: 0.8442


Epoch 10/12, Step 27/27: 100%|█| 27/27 [01:44<00:00,  3.86s/it, Loss=5.26, Accur


Epoch 10/12, Loss: 5.2598, Accuracy: 0.8442, Val Loss: 5.1533, Val Accuracy: 0.9047


Epoch 11/12, Step 27/27: 100%|█| 27/27 [01:39<00:00,  3.70s/it, Loss=5.17, Accur


Epoch 11/12, Loss: 5.1370, Accuracy: 0.9023, Val Loss: 5.0332, Val Accuracy: 0.9326


Epoch 12/12, Step 27/27: 100%|█| 27/27 [01:42<00:00,  3.78s/it, Loss=4.98, Accur


Epoch 12/12, Loss: 5.0175, Accuracy: 0.9279, Val Loss: 4.9147, Val Accuracy: 0.9581


PREDICT

In [95]:
def get_response(input_text):
    """Return the stored answer for the FAQ question that best matches ``input_text``.

    The classifier predicts a label, which is an index into the module-level
    ``answers`` list built when the questions were loaded (one entry per
    unique question, in label order). The answer is looked up there rather
    than in the raw MongoDB documents, whose per-document ``questions``
    arrays are not guaranteed to line up with the de-duplicated label ids.

    Args:
        input_text: The user's question as a string.

    Returns:
        The answer string associated with the predicted question.
    """
    # Truncate so a very long input cannot exceed the model's maximum length
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)

    # Inference only: make sure dropout is off and skip gradient tracking
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Convert logits to probabilities and take the most likely label
    probabilities = torch.softmax(logits, dim=-1)
    predicted_label = torch.argmax(probabilities, dim=-1).item()

    # Labels were assigned as positions in the de-duplicated question list,
    # so the same index selects the matching answer.
    response = answers[predicted_label]

    print("Response:", response)

    return response

In [96]:
# Try a paraphrase of one of the stored questions; get_response prints the
# answer it selects and also returns it.
text_for_pred = "Are the hospital's operating hours for inquiries and appointments the same?"
predict = get_response(text_for_pred)

Response: The specific operating hours of the hospital for inquiries and appointments may vary. It is advisable to contact the hospital during their working hours, typically from 9:00 am to 3:00 pm (Monday to Friday) and 9:00 am to 1:00 pm (Saturday).
