In [10]:
import tensorflow as tf
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from tqdm import tqdm
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.metrics import classification_report
import json
from itertools import islice
import nltk
import spacy
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [12]:
# Select the compute device: the GPU when PyTorch can see a CUDA device,
# otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Report which device was selected
print(f"Model is running on: {device}")

Model is running on: cuda


In [14]:
# JSON text is UTF-8 by specification, so the encoding is passed explicitly.
# Without it, open() decodes with the platform's locale encoding (for example
# cp1252 on Windows), which can fail or garble any non-ASCII character.

# Read train data
with open(r"train-v1.1.json", "r", encoding="utf-8") as read_file:
    train = json.load(read_file)

# Read test data
with open(r"dev-v1.1.json", "r", encoding="utf-8") as read_file:
    test = json.load(read_file)

In [15]:
# Inspect the top-level structure of the loaded training JSON
top_level_keys = train.keys()
print(top_level_keys)

dict_keys(['data', 'version'])


In [18]:
# Running tally of question/answer interactions printed so far
interaction_count = 0
max_interactions = 5  # Stop once this many interactions have been printed

# Walk articles -> paragraphs -> questions. Each `else` below runs only when
# its loop finished without `break` (the limit has not been reached yet), so
# the `break` that follows it propagates the exit one level further out and
# the limit test only has to be written once.
for article in train['data']:
    print("Title:", article['title'])
    for paragraph in article['paragraphs']:
        # Only the first 100 characters of the context are shown
        print("Context:", paragraph['context'][:100], "...")
        for qa in paragraph['qas']:
            print("Question:", qa['question'])
            for answer in qa['answers']:
                print("Answer:", answer['text'])
            print()
            interaction_count += 1

            if interaction_count >= max_interactions:
                break
        else:
            continue
        break
    else:
        continue
    break

Title: University_of_Notre_Dame
Context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden ...
Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer: Saint Bernadette Soubirous

Question: What is in front of the Notre Dame Main Building?
Answer: a copper statue of Christ

Question: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Answer: the Main Building

Question: What is the Grotto at Notre Dame?
Answer: a Marian place of prayer and reflection

Question: What sits on top of the Main Building at Notre Dame?
Answer: a golden statue of the Virgin Mary



In [20]:
# Download necessary NLTK resources
nltk.download('punkt')  # Tokenizer models used by older NLTK releases
# Recent NLTK releases (3.9 and later) load the tokenizer data from the
# 'punkt_tab' package instead of 'punkt'; without it word_tokenize raises
# LookupError. Fetching both keeps the notebook runnable on either version.
nltk.download('punkt_tab')
nltk.download('stopwords')  # Stop words

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

# Initialize NLTK stemmer
stemmer = PorterStemmer()

# NLTK stopwords, stored as a set for fast membership tests
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a piece of text into a space-separated string of stems.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is neither an ASCII letter nor whitespace;
      3. tokenize with NLTK's ``word_tokenize``;
      4. re-join the tokens with single spaces and run them through the
         module-level spaCy pipeline ``nlp``;
      5. drop tokens whose text is in the module-level ``stop_words`` set;
      6. take each remaining token's spaCy lemma and stem it with the
         module-level ``stemmer``.

    Args:
        text: The raw input string.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    lowered = text.lower()

    # Characters are deleted, not replaced by a space, so words that were
    # separated only by punctuation or digits are fused together.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)

    nltk_tokens = word_tokenize(letters_only)
    doc = nlp(' '.join(nltk_tokens))

    # Stop-word filtering uses the token's surface text; the lemma of each
    # surviving token is then stemmed.
    return ' '.join(
        stemmer.stem(tok.lemma_)
        for tok in doc
        if tok.text not in stop_words
    )

# Demonstrate preprocess_text on a single paragraph: islice(..., 1) yields at
# most the first article and, within it, at most the first paragraph.
for article in islice(train['data'], 1):
    for paragraph in islice(article['paragraphs'], 1):
        context = paragraph['context']
        preprocessed_context = preprocess_text(context)
        # Both strings are truncated to their first 100 characters for readability
        print("Original Context:", context[:100], "...")
        print("Preprocessed Context:", preprocessed_context[:100], "...")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sassy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sassy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original Context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden ...
Preprocessed Context: architectur school cathol charact atop main build gold dome golden statu virgin mari immedi front ma ...
