In [1]:
pip install --upgrade datasets


Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install transformers[torch] tokenizers datasets evaluate rouge_score sentencepiece huggingface_hub --upgrade

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install evaluate

Note: you may need to restart the kernel to use updated packages.


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
ds = load_dataset("toughdata/quora-question-answer-dataset")
ds = ds['train'].train_test_split(test_size = 0.4)

In [6]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [7]:
ds

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 33841
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 22561
    })
})

In [8]:
ds['train'][slice(None, 5, None)]

{'question': ['What would happen if Led Zeppelin went on tour and replaced Robert Plant with Eddie Vedder?',
  'As a stay-at-home parent, how much help do you get from your partner when running the home and looking after the kids?',
  'How does a wave function collapse in quantum mechanics/physics?',
  'What do teenagers wish adults understood?',
  "Is the claim that Palestinian ethnicity doesn't exist, and is actually a mix of different Arab nationalities (Jordanian, etc) just a false conspiracy theory?"],
 'answer': ['A2a\n It would be an interesting but failed experiment. I don’t see Ed’s vocals as similar to Plant in any way.\n Plus Ed has clearly aligned himself with the Who as the inspirational band from that era for him.\n Makes no real sense to do this.\n',
  "It depends! My husband and I have 3 kids (6, 3 and 6 months). I'll say this first, he doesn't wash dishes, do bath time with the kids, nor does he do laundry.\n Somedays he will clean up after supper, an odd time he will 

In [9]:
ds['test'][slice(None, 5, None)]

{'question': ["Is attending your youngest daughter's wedding in direct violation of a protective/restraining order a bad idea even if you have a well-rehearsed apology speech at-the-ready?",
  'What is your favorite sci-fi book series, and why?',
  'Was Iran/Persia powerful before? Was it full of barbarians as mentioned in the movie “300”? If it was, then did Persians raise the three empires of Achamedians, Safavid, and Sasanian?',
  'What is the best Bollywood song for caller tune?',
  'Why do some people like to stay in an abusive relationship?'],
 'answer': ['The fact that someone would even consider doing this is a great example of why the restraining order was put in place.\n This is their day. Not yours. You do not have a single right to interrupt it in any way, shape or form.\n I can only presume you put your needs over and above your child’s consistently in the past.\n You appear to have learnt nothing.\n She sees you as a sperm donor not a father because you have clearly never

In [10]:
#cleaning the text
import re
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', '', text).strip()
    return text
    

In [11]:
# remove Stop Words
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in text.split() if word not in stop_words])

In [12]:
def tokenize(text):
    return word_tokenize(text)

In [13]:
def stem(tokens):
    return [stemmer.stem(token) for token in tokens]

In [14]:
def lemmatize(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

In [15]:
#apply the cleaning func
#def preprocess_func(temp):
#    temp['processed_question'] = remove_stopwords(clean_text(temp['question']))
#   temp['processed_question'] = remove_stopwords(clean_text(temp['answer']))
#   return temp
#processed_ds = ds.map(preprocess_func)

In [16]:
def preprocess_func(example):
    # Clean the text
    cleaned_question = clean_text(example['question'])
    cleaned_answer = clean_text(example['answer'])
    
    # Remove stop words
    cleaned_question = remove_stopwords(cleaned_question)
    cleaned_answer = remove_stopwords(cleaned_answer)

    # Tokenize
    question_tokens = tokenize(cleaned_question)
    answer_tokens = tokenize(cleaned_answer)
    
    # Apply stemming or lemmatization (uncomment the desired method)
     #question_tokens = stem(question_tokens)
    # answer_tokens = stem(answer_tokens)
    
    question_tokens = lemmatize(question_tokens)
    answer_tokens = lemmatize(answer_tokens)

    # Join tokens back into strings
    example['processed_question'] = ' '.join(question_tokens)
    example['processed_answer'] = ' '.join(answer_tokens)
    
    return example


In [17]:
for split_name, split_data in ds.items():
    processed_ds = split_data.map(preprocess_func)
    
    # Convert to pandas DataFrame
    cleaned_df = processed_ds.to_pandas()
    
    # Drop original columns
    cleaned_df = cleaned_df.drop(columns=['question', 'answer'])
    
    # Save the cleaned dataset to CSV
    cleaned_df.to_csv(f'cleaned_{split_name}_dataset.csv', index=False)
    
    print(f"\nFirst 5 rows of the cleaned {split_name} dataset:")
    print(cleaned_df.head())

Map:   0%|          | 0/33841 [00:00<?, ? examples/s]


First 5 rows of the cleaned train dataset:
                                  processed_question  \
0  whatwouldhappenifledzeppelinwentontourandrepla...   
1  asastayathomeparenthowmuchhelpdoyougetfromyour...   
2  howdoesawavefunctioncollapseinquantummechanics...   
3                whatdoteenagerswishadultsunderstood   
4  istheclaimthatpalestinianethnicitydoesntexista...   

                                    processed_answer  
0  aaitwouldbeaninterestingbutfailedexperimentido...  
1  itdependsmyhusbandandihavekidsandmonthsillsayt...  
2  itdoesntwavefunctioncollapseisnotaphysicalproc...  
3  godihavesomanybutheresanewonesoyourenotreading...  
4  whilethereisjordanasanationalhomelandofthejord...  


Map:   0%|          | 0/22561 [00:00<?, ? examples/s]


First 5 rows of the cleaned test dataset:
                                  processed_question  \
0  isattendingyouryoungestdaughtersweddingindirec...   
1            whatisyourfavoritescifibookseriesandwhy   
2  wasiranpersiapowerfulbeforewasitfullofbarbaria...   
3            whatisthebestbollywoodsongforcallertune   
4   whydosomepeopleliketostayinanabusiverelationship   

                                    processed_answer  
0  thefactthatsomeonewouldevenconsiderdoingthisis...  
1  therearemanyacclaimedsciencefictionnovelsthath...  
2  thetermbarbariansisequalwithaniranianforpersia...  
3                           mainphirbhitumkochahunga  
4  loveisareasonlilythinksofittherightwayjustbeca...  


In [18]:
pip install transformers datasets torch 

Note: you may need to restart the kernel to use updated packages.


In [19]:
pip install datasets


Note: you may need to restart the kernel to use updated packages.


In [20]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, EncoderDecoderModel, Trainer, TrainingArguments
import torch
from datasets import Dataset



In [21]:
#load the processed data 
train_df = pd.read_csv('C:/Users/Adnan/cleaned_train_dataset.csv')
test_df = pd.read_csv('C:/Users/Adnan/cleaned_test_dataset.csv')


In [22]:
# Prepare the dataset for BERT2BERT
train_df['input_text'] = train_df['processed_question']
train_df['target_text'] = train_df['processed_answer']
test_df['input_text'] = test_df['processed_question']
test_df['target_text'] = test_df['processed_answer']


In [23]:
# Split the dataset into smaller subsets
num_splits = 4  # Define the number of splits
train_subsets = np.array_split(train_df[['input_text', 'target_text']], num_splits)
test_subsets = np.array_split(test_df[['input_text', 'target_text']], num_splits)


In [24]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [25]:
# Define a function to tokenize the datasets
def tokenize_function(examples):
    input_texts = [str(text) for text in examples['input_text']]
    target_texts = [str(text) for text in examples['target_text']]
    
    model_inputs = tokenizer(input_texts, max_length=512, truncation=True, padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target_texts, max_length=512, truncation=True, padding='max_length')
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [None]:
# Train multiple models in parallel on the subsets
models = []

for i, (train_subset, test_subset) in enumerate(zip(train_subsets, test_subsets)):
    train_dataset = Dataset.from_pandas(train_subset.reset_index(drop=True))
    test_dataset = Dataset.from_pandas(test_subset.reset_index(drop=True))
    
    train_tokenized = train_dataset.map(tokenize_function, batched=True)
    test_tokenized = test_dataset.map(tokenize_function, batched=True)


model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

training_args = TrainingArguments(
    
    output_dir=f'./results_{i}',
    num_train_epochs=1,  # Reduced epochs for quicker training
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f'./logs_{i}',
    logging_steps=10,
    fp16=torch.cuda.is_available(),  # Enable fp16 only if GPU is available
    save_steps=1000,
    save_total_limit=2,
    dataloader_num_workers=4  # Use multiple workers for data loading
    
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
)

trainer.train()
eval_results = trainer.evaluate()
print(f"Results for model {i}: {eval_results}")

model.save_pretrained(f'./bert2bert-chatbot-model_{i}')
tokenizer.save_pretrained(f'./bert2bert-chatbot-model_{i}')
models.append(model)


Map:   0%|          | 0/8461 [00:00<?, ? examples/s]



Map:   0%|          | 0/5641 [00:00<?, ? examples/s]

Map:   0%|          | 0/8460 [00:00<?, ? examples/s]

Map:   0%|          | 0/5640 [00:00<?, ? examples/s]

Map:   0%|          | 0/8460 [00:00<?, ? examples/s]

Map:   0%|          | 0/5640 [00:00<?, ? examples/s]

Map:   0%|          | 0/8460 [00:00<?, ? examples/s]

Map:   0%|          | 0/5640 [00:00<?, ? examples/s]



Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Step,Training Loss


In [None]:
def generate_response(question, model, tokenizer):
    input_text = question
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    output = model.generate(input_ids, max_length=512, pad_token_id=tokenizer.pad_token_id)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response
        
            

In [None]:
# Example usage with the first model
best_model = models[0]
best_tokenizer = tokenizer
print(generate_response("What is the capital of France?", best_model, best_tokenizer))