In [10]:
!pip install pandas nltk torch transformers scikit-learn requests

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [11]:
import pandas as pd
import numpy as np
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import requests
import json

nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [13]:
# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/intern_screening_dataset.csv')

In [14]:
# Clean and preprocess the data
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)
data['question'] = data['question'].str.lower().str.replace('[^\w\s]', '', regex=True)
data['answer'] = data['answer'].str.lower().str.replace('[^\w\s]', '', regex=True)

In [15]:
# Tokenization and Lemmatization
lemmatizer = WordNetLemmatizer()

In [16]:
def preprocess(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

In [17]:
data['question'] = data['question'].apply(preprocess)
data['answer'] = data['answer'].apply(preprocess)

In [19]:
# Tokenize and build vocabulary
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
vocab = set()
for question in data['question']:
    vocab.update(tokenizer.tokenize(question))
word2idx = {word: idx for idx, word in enumerate(vocab, 1)}
word2idx['<PAD>'] = 0

In [20]:
# Encode questions and answers
def encode(text):
    return [word2idx[word] for word in tokenizer.tokenize(text)]

In [21]:
data['encoded_questions'] = data['question'].apply(encode)
label_dict = {label: idx for idx, label in enumerate(data['answer'].unique())}
data['labels'] = data['answer'].map(label_dict)

In [22]:
# Padding
max_len = max(data['encoded_questions'].apply(len))

In [23]:
def pad_sequence(seq, max_len):
    return seq + [0] * (max_len - len(seq))

In [24]:
data['padded_questions'] = data['encoded_questions'].apply(lambda x: pad_sequence(x, max_len))

In [25]:
# Split the dataset into training, validation, and testing sets (70-15-15 split)
X_train_val, X_test, y_train_val, y_test = train_test_split(data['padded_questions'], data['labels'], test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1765, random_state=42) # 0.1765 * 0.85 ≈ 0.15

In [26]:
class QADataset(Dataset):
    def __init__(self, questions, labels):
        self.questions = torch.tensor(questions.tolist(), dtype=torch.long)
        self.labels = torch.tensor(labels.values, dtype=torch.long)

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        return self.questions[idx], self.labels[idx]

train_dataset = QADataset(X_train, y_train)
val_dataset = QADataset(X_val, y_val)
test_dataset = QADataset(X_test, y_test)

In [27]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        lstm_out, (hidden, cell) = self.lstm(embedded)
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        return self.fc(hidden)

In [28]:
# Hyperparameters
vocab_size = len(word2idx)
embedding_dim = 100
hidden_dim = 256
output_dim = len(label_dict)
n_layers = 2
bidirectional = True
dropout = 0.5
batch_size = 16
num_epochs = 10

# Create the model
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training function
def train(model, loader, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for questions, labels in loader:
        optimizer.zero_grad()
        predictions = model(questions)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)

# Evaluation function
def evaluate(model, loader, criterion):
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for questions, labels in loader:
            predictions = model(questions)
            loss = criterion(predictions, labels)
            epoch_loss += loss.item()
            all_preds.extend(torch.argmax(predictions, dim=1).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='macro')
    f1 = f1_score(all_labels, all_preds, average='macro')
    return epoch_loss / len(loader), accuracy, precision, recall, f1

# Training loop
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss, val_accuracy, val_precision, val_recall, val_f1 = evaluate(model, val_loader, criterion)
    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'Train Loss: {train_loss:.4f}')
    print(f'Val Loss: {val_loss:.4f} | Val Accuracy: {val_accuracy:.4f} | Val Precision: {val_precision:.4f} | Val Recall: {val_recall:.4f} | Val F1: {val_f1:.4f}')

# Final evaluation on test set
test_loss, test_accuracy, test_precision, test_recall, test_f1 = evaluate(model, test_loader, criterion)
print(f'Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.4f} | Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.4f} | Test F1: {test_f1:.4f}')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Train Loss: 9.4454
Val Loss: 8.8899 | Val Accuracy: 0.0232 | Val Precision: 0.0001 | Val Recall: 0.0004 | Val F1: 0.0002


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/10
Train Loss: 9.4334
Val Loss: 10.0785 | Val Accuracy: 0.0244 | Val Precision: 0.0001 | Val Recall: 0.0008 | Val F1: 0.0002


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3/10
Train Loss: 9.3063
Val Loss: 10.4479 | Val Accuracy: 0.0249 | Val Precision: 0.0001 | Val Recall: 0.0013 | Val F1: 0.0002


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 4/10
Train Loss: 9.1849
Val Loss: 10.8792 | Val Accuracy: 0.0249 | Val Precision: 0.0001 | Val Recall: 0.0013 | Val F1: 0.0002


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 5/10
Train Loss: 9.0554
Val Loss: 11.3220 | Val Accuracy: 0.0249 | Val Precision: 0.0001 | Val Recall: 0.0012 | Val F1: 0.0002


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 6/10
Train Loss: 8.9044
Val Loss: 11.7530 | Val Accuracy: 0.0253 | Val Precision: 0.0001 | Val Recall: 0.0017 | Val F1: 0.0002


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 7/10
Train Loss: 8.7534
Val Loss: 12.1335 | Val Accuracy: 0.0253 | Val Precision: 0.0001 | Val Recall: 0.0016 | Val F1: 0.0002


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 8/10
Train Loss: 8.6289
Val Loss: 12.6056 | Val Accuracy: 0.0253 | Val Precision: 0.0001 | Val Recall: 0.0016 | Val F1: 0.0002


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 9/10
Train Loss: 8.4659
Val Loss: 12.9002 | Val Accuracy: 0.0244 | Val Precision: 0.0001 | Val Recall: 0.0016 | Val F1: 0.0002


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 10/10
Train Loss: 8.3247
Val Loss: 13.2919 | Val Accuracy: 0.0244 | Val Precision: 0.0001 | Val Recall: 0.0016 | Val F1: 0.0002
Test Loss: 13.2304 | Test Accuracy: 0.0306 | Test Precision: 0.0002 | Test Recall: 0.0020 | Test F1: 0.0003


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
# Function to get the chatbot response
def chatbot_response(question):
    model.eval()
    question_preprocessed = preprocess(question)
    question_encoded = encode(question_preprocessed)
    question_padded = pad_sequence(question_encoded, max_len)
    question_tensor = torch.tensor([question_padded], dtype=torch.long)
    with torch.no_grad():
        output = model(question_tensor)
    answer_index = output.argmax(dim=-1).item()
    answer = list(label_dict.keys())[list(label_dict.values()).index(answer_index)]
    return answer

# Example interactions
print(chatbot_response("What is diabetes?"))
print(chatbot_response("How to treat hypertension?"))
print(chatbot_response("Symptoms of asthma?"))

# Save the model
torch.save(model.state_dict(), 'lstm_model.pth')

# Load the model (if needed)
model.load_state_dict(torch.load('lstm_model.pth'))

# Convert the CSV data to JSON format
data.to_json('data.json', orient='records')

# Function to update JSON with new QA pairs
def update_json(question, answer):
    with open('data.json', 'r+') as file:
        data = json.load(file)
        data.append({"question": question, "answer": answer})
        file.seek(0)
        json.dump(data, file, indent=4)

# Similarity Matching
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_question(user_question):
    user_question_preprocessed = preprocess(user_question)
    user_question_encoded = encode(user_question_preprocessed)
    user_question_padded = pad_sequence(user_question_encoded, max_len)
    user_question_vector = torch.tensor([user_question_padded], dtype=torch.float)
    all_questions_vectors = torch.tensor(data['padded_questions'].tolist(), dtype=torch.float)
    similarities = cosine_similarity(user_question_vector.numpy(), all_questions_vectors.numpy())
    max_sim_index = similarities.argmax()
    if similarities[0, max_sim_index] > 0.7:
        return data.iloc[max_sim_index]['answers']
    else:
        return "I don't know the answer to that question."

# Wikipedia API Integration
def compare_with_wikipedia(question, chatbot_answer):
    response = requests.get(f"https://en.wikipedia.org/api/rest_v1/page/summary/{question}")
    if response.status_code == 200:
        wiki_answer = response.json().get('extract', '')
        similarity = cosine_similarity(
            tokenizer.tokenize(chatbot_answer),
            tokenizer.tokenize(wiki_answer)
        )
        return similarity
    return 0

# Example of comparison with Wikipedia
wiki_similarity = compare_with_wikipedia("What is diabetes?", chatbot_response("What is diabetes?"))
print(f'Similarity with Wikipedia: {wiki_similarity}')

KeyError: 'What'