# Collect and Preprocess Data

In [1]:
import os

mp3_file_list = []

for root, dirs, files in os.walk('.'):
    for file in files:
        if file.endswith('.mp3'):
            mp3_file_list.append(os.path.join(root, file))

print(len(mp3_file_list))

for file in mp3_file_list[:10]:
    print(file)

395
./data/Alan Watts HUGE Collection/Zencast92.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 03 - The in defines the out defines the in.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 02 - Escaping the tangle.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 07 - Seeing past the illusion.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 06 - Answering the koan.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 04 - The Japanese Zen monastery.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 05 - Entering the temple.mp3
./data/Alan Watts HUGE Collection/09 Philosophy of the Tao/Alan Wat

In [2]:
import os

smallest_file = min(mp3_file_list, key=os.path.getsize)
print(smallest_file)

./data/Alan Watts HUGE Collection/Alan Watts - Still The Mind - Introduction To Meditation/alan watts - introduction to meditation - 9.mp3


In [3]:
import os

for root, dirs, files in os.walk('.'):
    for file in files:
        # if file.endswith('.ogg'):
        # if file.endswith('.png'):
        if file.endswith('.afpk'):
            os.remove(os.path.join(root, file))

# Transcribe Recordings

In [6]:
import whisper

model = whisper.load_model("medium")
# result = model.transcribe(mp3_file_list[0])
result = model.transcribe("./data/Alan Watts HUGE Collection/Alan Watts - Still The Mind - Introduction To Meditation/alan watts - introduction to meditation - 8.mp3")
print(result["text"].replace('. ', '.\n').replace('? ', '?\n').replace('! ', '!\n'))

100%|█████████████████████████████████████| 1.42G/1.42G [01:49<00:00, 13.9MiB/s]


 Now, can you actually hear anyone who is listening?
Can you hear any difference between all these sounds on the one hand and yourself on the other?
Now, when you were about to absorb into the sound, where were you?
This would be called a state of consciousness, where we have a primitive form of samadhi.
That is to say, we are happily absorbed in what we are doing, and we have forgotten about ourselves.
You can't very well do that and worry, or think anything serious.
And you'll notice that there's a special way of doing it, because, I mean, we can go crazy, and we can do kind of the wild Indian chants.
But in this, you are sort of straining too much, as a rule.
If you keep it down to a soft thing, like this, and get that flating feeling of the voice, if you instantly feel any sound is uncomfortable, avoid it.
Slip down if you're going too high, slip up if you're getting too low.
If your voice tends to change, follow its change.
So that you're just swinging along with it.
This is the p

In [8]:
with open("data/output.txt", "w") as f:
    for mp3_file in mp3_file_list:
        print(mp3_file)

        # transcribe the audio file
        result = model.transcribe(mp3_file)

        # append the transcription to an output file
        f.write("\n\n\n")
        f.write(result["text"].replace('. ', '.\n').replace('? ', '?\n').replace('! ', '!\n'))

./data/Alan Watts HUGE Collection/Zencast92.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 03 - The in defines the out defines the in.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 02 - Escaping the tangle.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 07 - Seeing past the illusion.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 06 - Answering the koan.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 04 - The Japanese Zen monastery.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 05 - Entering the temple.mp3
./data/Alan Watts HUGE Collection/09 Philosophy of the Tao/Alan Watts  

## Reprocess Bad Transcripts

In [2]:
import whisper
from tqdm.auto import tqdm

model = whisper.load_model("large")

In [3]:
# bad_files_list = [
#     "./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD06 - The Inevitable Ecstacy - Part 2/Alan Watts - 06 - This is the game.mp3",
#     "./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD06 - The Inevitable Ecstacy - Part 2/Alan Watts - 04 - The illusion of the ego.mp3",
#     "./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD06 - The Inevitable Ecstacy - Part 2/Alan Watts - 05 - The meaningless life.mp3",
#     "./data/Alan Watts HUGE Collection/Misc Unsorted/Alan Watts  - Tambouras for meditation.mp3",                         # NO USEFUL DATA
#     "./data/Alan Watts HUGE Collection/Misc Unsorted/Alan Watts  - 0 Meditation 2.mp3",
#     "./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD09 - The World As Self - Part 1/Alan Watts - 07 - Rules of the game.mp3",
#     "./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD09 - The World As Self - Part 1/Alan Watts - 06 - The rhythmic dance.mp3",
#     "./data/Alan Watts HUGE Collection/Alan Watts - Strange Prayers/7 the end.mp3"                                        # NO USEFUL DATA
# ]
# bad_files_list = ["./data/Alan Watts HUGE Collection/Misc Unsorted/Alan Watts  - 0 Insight & Ecstacy.mp3"]
# bad_files_list = ["./data/Alan Watts HUGE Collection/Misc Unsorted/Alan Watts  - The Bomb (Ching and Civilization).mp3"]
# bad_files_list = [
#     "./data/Alan Watts HUGE Collection/Unsorted/Alan Watts - We As Organism - pt1.mp3",
#     "./data/Alan Watts HUGE Collection/Alan Watts - Out Of The Trap/[audio book] alan watts - out of the trap (4 of 4).mp3",
#     "./data/Alan Watts HUGE Collection/Alan Watts - The Veil Of/2.mp3",
#     # "./data/Alan Watts HUGE Collection/Alan Watts - Game Theory Of Ethics/07 - game theory of ethics.mp3",
#     "./data/Alan Watts HUGE Collection/Alan Watts - Buddhism - The Religion Of No Religion/alan watts - buddhism, religion of no religion 1#3-012.mp3",
#     "./data/Alan Watts HUGE Collection/Alan Watts - Buddhism - The Religion Of No Religion/alan watts - buddhism, religion of no religion 1#3-010.mp3",
#     "./data/Alan Watts HUGE Collection/Alan Watts - Buddhism - The Religion Of No Religion/alan watts - buddhism, religion of no religion 1#3-002.mp3",
#     "./data/Alan Watts HUGE Collection/Alan Watts - Who Is It That Knows There Is No Ego/alan watts - who is it who knows there is no ego -005.mp3"
#     ]
bad_files_list = [
    # "./data/Alan Watts HUGE Collection/Alan Watts - Buddhism - The Religion Of No Religion/alan watts - buddhism, religion of no religion 1#3-004.mp3"
    # "./data/Alan Watts HUGE Collection/Misc Unsorted/Alan Watts  - Tambouras for meditation.mp3"
    # "./data/Alan Watts HUGE Collection/Alan Watts - Philosophies Of Asia/Taoist Way I.mp3"
    # "./data/Alan Watts HUGE Collection/Alan Watts - Philosophies Of Asia/Intro to Buddhism I.mp3"
    # "./data/Alan Watts HUGE Collection/Misc Unsorted/Alan Watts  - Reflecting Mirror 3(inc).mp3"
    # "./data/Alan Watts HUGE Collection/02 Philosophies of Asia/Alan Watts  - 02 07 Philosophies of Asia - Taoist Way Of Karma.mp3"
    "./data/Alan Watts HUGE Collection/Misc Unsorted/Alan Watts  - The Future - Time in the Future Pt 1 (entire).mp3"
]

with open('data/output_cleaned.txt', 'a') as f:
    for bad_file in tqdm(bad_files_list):
        print(bad_file)
        f.write("\n\n\n")
        result = model.transcribe(bad_file, language="en")
        print(result)
        print('=' * 200)
        f.write(result["text"].replace('. ', '.\n').replace('? ', '?\n').replace('! ', '!\n'))


  0%|          | 0/1 [00:00<?, ?it/s]

./data/Alan Watts HUGE Collection/Misc Unsorted/Alan Watts  - The Future - Time in the Future Pt 1 (entire).mp3


## Get Starting Line Numbers for each Transcription

In [21]:
# Read the contents of output.txt
with open("data/output_final.txt", "r") as f:
    output_contents = f.read()

# Read the filenames from mp3_file_list.txt
with open("data/mp3_file_list.txt", "r") as f:
    filenames = f.readlines()

# Create a new file file_starts.txt
with open("data/file_starts.txt", "w") as f:
    newline_count = 0
    file_count = 0
    for linenum, line in enumerate(output_contents.split('\n')):
        if linenum == 0:
            continue
        if line == '':
            newline_count += 1
            if newline_count == 2:
                # if file_count == 0:
                f.write(f"{file_count + 1}) {linenum + 2} - {filenames[file_count]}")
                # else:
                #     f.write(f"{file_count + 1}) {linenum - 10} - {filenames[file_count]}")
                file_count += 1
                newline_count = 0

        # if line.startswith(' '):
        #     print(linenum)
        #     print(line)
        #     file_count += 1
        #     if file_count == 4:
        #         break
        #     # f.write(f"{file_count + 1}) {linenum + 1} - {filenames[file_count]}")
        #     # file_count += 1
    #     if line == '':
    #         newline_count += 1
    # print(linenum)
    # print(newline_count)
    # print(395 * 2 + 2)

In [1]:
import nltk
import string
nltk.download('words')
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.corpus import words
from nltk.tokenize import word_tokenize

# Get the list of English words
english_words = set(words.words())

def are_all_words_english(text):
    # Split the text into words
    word_list = text.split()

    # Check if each word is in the list of English words
    return all(word.lower() in english_words for word in word_list)

# def contains_non_english_words(line):
#     # Tokenize the line into words, ignoring punctuation
#     tokens = word_tokenize(line)
    
#     # Filter out punctuation
#     words_in_line = [word for word in tokens if word.isalpha()]
    
#     # Check if all words are in the English dictionary
#     return any(word.lower() not in english_words for word in words_in_line)

def contains_non_english_words(line):
    # Tokenize the line into words, ignoring punctuation
    tokens = word_tokenize(line)
    
    # Filter out punctuation
    words_in_line = [word for word in tokens if word.isalpha()]
    
    # Find words not in the English dictionary
    non_english_words = [word for word in words_in_line if word.lower() not in english_words]
    
    if non_english_words:
        print(f"Non-English words in line: {non_english_words}")
        return True
    return False

def filter_non_english_lines(text_lines):
    non_english_lines = []
    for line in text_lines:
        if contains_non_english_words(line):
            non_english_lines.append(line)
    return non_english_lines

[nltk_data] Downloading package words to
[nltk_data]     /home/peacelovephysics/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/peacelovephysics/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/peacelovephysics/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [20]:
# check if all lines after each set of two newlines start with a ' ' character
# def check_file_starts(text_lines):
#     newline_counter = 0
#     for index, line in enumerate(text_lines):
#         if line == '':
#             newline_counter += 1
#         if newline_counter == 1 and line != '':
#             print('a', index + 1, line)
#             newline_counter = 0
#         if newline_counter == 2:
#             newline_counter = 0
#             if not line.startswith(' '):
#                 print('b', index + 1, line)
def check_file_starts(text_lines):
    newline_counter = 0  # Counts consecutive newlines
    for index, line in enumerate(text_lines):
        if line == '':  # Empty line (newline)
            newline_counter += 1
        else:
            if newline_counter == 1:  # One newline
                if not line == '':
                    print('single newline (a)', index + 1, line)
            elif newline_counter == 2:  # Two consecutive newlines
                if not line.startswith(' '):
                    print('double newline (b)', index + 1, line)
            # if newline_counter == 2:  # Two consecutive newlines
            #     if not line.startswith(' '):
            #         print('b', index + 1, line)
            # Reset newline_counter after a non-empty line
            newline_counter = 0

# Read the contents of output_final.txt
with open("data/output_final.txt", "r") as f:
    output_final_contents = f.read()

# Split the contents into lines
output_final_lines = output_final_contents.split('\n')

# Check if all lines after each set of two newlines start with a ' ' character
check_file_starts(output_final_lines)

In [7]:
def get_lines(file):
    with open(file, "r") as f:
        return f.read().split('\n')

### Check for Single Line Transcriptions

In [15]:
# check for single lines that are not empty that are surrounded by empty lines
# def check_single_lines(text_lines):
#     line_counter = 0
#     line_flag = False
#     for index, line in enumerate(text_lines):
#         if line == '':
#             line_flag = True
#         if line_flag and line != '':
#             line_counter += 1
#         if line_counter == 1 and line == '':
#             print('single line', index + 1, line)
#             line_counter = 0
#             line_flag = False
def check_single_lines(text_lines):
    line_counter = 0
    in_paragraph = False

    for index, line in enumerate(text_lines):
        if line == '':  # Empty line indicating end of a paragraph
            if in_paragraph and line_counter == 1:
                print('single line paragraph at line', index, line)  # Print if the paragraph had only one line
            # Reset flags and counters after a paragraph ends
            line_counter = 0
            in_paragraph = False
        else:  # Non-empty line, so we're in a paragraph
            if not in_paragraph:
                in_paragraph = True
            line_counter += 1  # Increment line count within the paragraph

text_lines = get_lines('data/output_final.txt')

# print(text_lines[:5])

check_single_lines(text_lines)

single line paragraph at line 24924 


In [18]:
import string

# check if all the first lines are English
with open("data/output_final.txt", 'r') as f:
    output_contents = f.read()

with open("data/engmix.txt", 'r') as f:
    english_words = f.read().split('\n')

for linenum, line in enumerate(output_contents.split('\n')):
    if line.startswith(' '):
        bad_words = []
        for word in line.split():
            for char in string.punctuation:
                word = word.replace(char, '')
            if word.lower() not in english_words:
                bad_words.append(word)
        if bad_words:
            print(f"{linenum + 1}) {bad_words}")

# lines = output_contents.split('\n')

# # Find lines with non-English words
# non_english_lines = filter_non_english_lines(lines)

# # Output the results
# for line in non_english_lines:
#     print(f"Non-English line: {line}")

800) ['sanzen']
889) ['Im']
950) ['Roshi']
1198) ['aliveness', 'thats', 'Whats']
1214) ['socalled']
1237) ['isnt']
1427) ['nittygritty', 'weve']
1497) ['isnt']
1630) ['Im', 'hocuspocus']
2002) ['Im', 'koto']
2290) ['nonexistence']
3212) ['youve', 'Koyasan', 'Vajrayana', 'Mahayana', 'havent', 'theyre']
3380) ['youve', 'Koyasan', 'Vajrayana', 'Mahayana', 'havent', 'theyre']
3733) ['Bodhisattva', 'Mahayana', 'antiworldliness', 'nonBuddha']
3740) ['reemphasizing', 'dont', 'dont']
4422) ['50', 'Mahayana', 'subsect']
5340) ['Bodhisattva', 'Mahayana', 'antiworldliness', 'nonBuddha']
6175) ['reemphasizing', 'dont', 'dont', 'dont']
6590) ['50', 'Mahayana', 'subsect']
7025) ['youve', 'Koyasan', 'Vajrayana', 'Mahayana', 'havent', 'theyre']
7452) ['Im']
7814) ['youve', 'Koyasan', 'Vajrayana', 'Mahayana', 'havent', 'theyre']
8013) ['dont']
9227) ['malefemale', 'Tantric']
9434) ['doesnt']
9482) ['dont']
9897) ['dont', '16000', 'crosslegged', 'Buddhas', 'theres', 'Dharmadhatu', 'interrelatedness', 't

In [None]:
from langdetect import detect, LangDetectException

# Function to detect the language of paragraphs
def detect_non_english(text_lines):
    non_english_lines = []
    
    for index, line in enumerate(text_lines):
        # Skip empty lines
        if line.strip() == '':
            continue
        
        try:
            # Detect the language of the line
            lang = detect(line)
            if lang != 'en':
                non_english_lines.append((index + 1, line, lang))
        except LangDetectException:
            # If detection fails, log the line (it might be too short or ambiguous)
            non_english_lines.append((index + 1, line, 'undetected'))
    
    return non_english_lines

# Example usage
with open('data/output_final.txt', 'r') as f:
    text_lines = f.read().split('\n')

# Detect and list non-English paragraphs or lines
non_english_lines = detect_non_english(text_lines)

# Print out the lines with non-English content
for line_number, content, detected_lang in non_english_lines:
    print(f"Line {line_number}: Detected as {detected_lang} -> {content}")


### TODO: Incorporate Written Works

# Train Language Model

In [2]:
# from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
# from datasets import load_dataset

# # Load pre-trained BERT tokenizer and model
# print("loading tokenizer and model")
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=True)
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# # Load and preprocess dataset
# print("loading dataset")
# dataset = load_dataset('text', data_files={'train': 'data/output_final.txt'}, split='train')

# # def tokenize_function(examples):
# #     return tokenizer(examples['text'], padding='max_length', truncation=True)

# def tokenize_function(examples):
#     return tokenizer(
#         examples['text'], 
#         padding='max_length', 
#         truncation=True,
#     )


# print("tokenizing dataset")
# tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# # Define training arguments
# print("defining training arguments")
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='./logs',
# )

# # Initialize Trainer
# print("initializing trainer")
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_datasets['train'],
# )

# # Fine-tune the model
# print("fine-tuning model")
# trainer.train()

# # Save the fine-tuned model
# print("saving model")
# model.save_pretrained('./fine-tuned-bert')
# tokenizer.save_pretrained('./fine-tuned-bert')

In [4]:
# from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
# from datasets import load_dataset, DatasetDict

# # Load pre-trained BERT tokenizer and model
# print("loading tokenizer and model")
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=True)
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# # Load and preprocess dataset
# print("loading dataset")
# dataset = load_dataset('text', data_files={'train': 'data/output_final.txt'}, split='train')

# def tokenize_function(examples):
#     return tokenizer(
#         examples['text'], 
#         padding='max_length', 
#         truncation=True,
#     )

# # Calculate the split indices
# total_size = len(dataset)
# train_size = int(0.8 * total_size)  # 80% for training
# val_size = int(0.1 * total_size)    # 10% for validation
# test_size = total_size - train_size - val_size  # 10% for testing

# # Create the splits manually without shuffling
# train_dataset = dataset.select(range(train_size))
# val_dataset = dataset.select(range(train_size, train_size + val_size))
# test_dataset = dataset.select(range(train_size + val_size, total_size))

# # Combine into a DatasetDict
# final_dataset = DatasetDict({
#     'train': train_dataset,
#     'validation': val_dataset,
#     'test': test_dataset
# })

# # Tokenize the datasets
# print("tokenizing dataset")
# tokenized_datasets = final_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# # Define training arguments
# print("defining training arguments")
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     evaluation_strategy="epoch",  # Evaluate at the end of each epoch
#     save_strategy="epoch",        # Save the model at the end of each epoch
#     load_best_model_at_end=True,  # Load the best model when done
#     metric_for_best_model="accuracy",  # Use accuracy to select the best model
# )

# # Initialize Trainer
# print("initializing trainer")
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_datasets['train'],
#     eval_dataset=tokenized_datasets['validation'],  # Add validation dataset for monitoring
# )

# # Fine-tune the model
# print("fine-tuning model")
# trainer.train()

# # Evaluate the model on the test set
# print("evaluating model")
# results = trainer.evaluate(tokenized_datasets['test'])

# print(f"Test set results: {results}")

# # Save the fine-tuned model
# print("saving model")
# model.save_pretrained('./fine-tuned-bert')
# tokenizer.save_pretrained('./fine-tuned-bert')


In [6]:
# from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments
# from datasets import load_dataset, DatasetDict

# # Load pre-trained BERT tokenizer and model
# print("loading tokenizer and model")
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=True)
# model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# # Load and preprocess dataset
# print("loading dataset")
# dataset = load_dataset('text', data_files={'train': 'data/output_final.txt'}, split='train')

# # Calculate the split indices
# total_size = len(dataset)
# train_size = int(0.8 * total_size)  # 80% for training
# val_size = int(0.1 * total_size)    # 10% for validation
# test_size = total_size - train_size - val_size  # 10% for testing

# # Create the splits manually without shuffling
# train_dataset = dataset.select(range(train_size))
# val_dataset = dataset.select(range(train_size, train_size + val_size))
# test_dataset = dataset.select(range(train_size + val_size, total_size))

# # Combine into a DatasetDict
# final_dataset = DatasetDict({
#     'train': train_dataset,
#     'validation': val_dataset,
#     'test': test_dataset
# })

# # Tokenize the datasets
# print("tokenizing dataset")
# def tokenize_function(examples):
#     return tokenizer(
#         examples['text'], 
#         padding='max_length', 
#         truncation=True,
#         max_length=512,  # or another length that makes sense for your data
#         return_special_tokens_mask=True  # useful for masked language modeling
#     )

# tokenized_datasets = final_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# # Define training arguments
# print("defining training arguments")
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     evaluation_strategy="epoch",  # Evaluate at the end of each epoch
#     save_strategy="epoch",        # Save the model at the end of each epoch
#     load_best_model_at_end=True,  # Load the best model when done
# )

# # Initialize Trainer
# print("initializing trainer")
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_datasets['train'],
#     eval_dataset=tokenized_datasets['validation'],  # Add validation dataset for monitoring
# )

# # Fine-tune the model
# print("fine-tuning model")
# trainer.train()

# # Evaluate the model on the test set
# print("evaluating model")
# results = trainer.evaluate(tokenized_datasets['test'])

# print(f"Test set results: {results}")

# # Save the fine-tuned model
# print("saving model")
# model.save_pretrained('./fine-tuned-bert')
# tokenizer.save_pretrained('./fine-tuned-bert')


In [8]:
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, DatasetDict

# Load pre-trained BERT tokenizer and model
print("loading tokenizer and model")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=True)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Load and preprocess dataset
print("loading dataset")
dataset = load_dataset('text', data_files={'train': 'data/output_final.txt'}, split='train')

# Calculate the split indices
total_size = len(dataset)
train_size = int(0.8 * total_size)  # 80% for training
val_size = int(0.1 * total_size)    # 10% for validation
test_size = total_size - train_size - val_size  # 10% for testing

# Create the splits manually without shuffling
train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
test_dataset = dataset.select(range(train_size + val_size, total_size))

# Combine into a DatasetDict
final_dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# Tokenize the datasets
print("tokenizing dataset")
def tokenize_function(examples):
    return tokenizer(
        examples['text'], 
        padding='max_length', 
        truncation=True,
        max_length=512
    )

tokenized_datasets = final_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# Data collator for masked language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15  # 15% of the tokens will be masked
)

# Define training arguments
# print("defining training arguments")
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     evaluation_strategy="epoch",  # Evaluate at the end of each epoch
#     save_strategy="epoch",        # Save the model at the end of each epoch
#     load_best_model_at_end=True,  # Load the best model when done
# )

print("defining training arguments")
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,  # Adjust as needed
    logging_dir='./logs',
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",        # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Load the best model when done
    learning_rate=5e-5,  # Lower learning rate
    max_grad_norm=1.0,   # Prevent exploding gradients
    logging_steps=500,
    save_total_limit=1,  # Keep only the best model checkpoint
)

# Initialize Trainer
print("initializing trainer")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator  # Pass the data collator for MLM
)

# Fine-tune the model
print("fine-tuning model")
trainer.train()

# Evaluate the model on the test set
print("evaluating model")
results = trainer.evaluate(tokenized_datasets['test'])

print(f"Test set results: {results}")

# Save the fine-tuned model
print("saving model")
model.save_pretrained('./fine-tuned-bert2')
tokenizer.save_pretrained('./fine-tuned-bert2')


loading tokenizer and model


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


loading dataset
tokenizing dataset
defining training arguments
initializing trainer
fine-tuning model


Epoch,Training Loss,Validation Loss
1,2.1732,
2,2.0421,
3,1.8707,1.85039
4,1.7774,1.788857
5,1.5972,1.742944
6,1.545,1.679608
7,1.4523,
8,1.3583,1.622527
9,1.351,1.50053
10,1.3273,1.560745


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


evaluating model


Test set results: {'eval_loss': 2.1002368927001953, 'eval_runtime': 687.7496, 'eval_samples_per_second': 8.484, 'eval_steps_per_second': 1.061, 'epoch': 10.0}
saving model


('./fine-tuned-bert2/tokenizer_config.json',
 './fine-tuned-bert2/special_tokens_map.json',
 './fine-tuned-bert2/vocab.txt',
 './fine-tuned-bert2/added_tokens.json')

## With Seq2Seq

In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from gtts import gTTS
import os
import torch
from typing import List, Dict

# Load pre-trained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', clean_up_tokenization_spaces=True)
model = GPT2LMHeadModel.from_pretrained('gpt2')

tokenizer.pad_token = tokenizer.eos_token

# Load and preprocess dataset from a .txt file
dataset = load_dataset('text', data_files={'train': './data/output_final.txt'}, split='train')

# Manually split the dataset into training, validation, and test sets while preserving order
split_ratio = 0.15  # 15% for validation, 15% for test
split_index_val = int(len(dataset) * (1 - split_ratio))
split_index_test = int(len(dataset) * (1 - 2 * split_ratio))
train_dataset = Dataset.from_dict(dataset[:split_index_test])
val_dataset = Dataset.from_dict(dataset[split_index_test:split_index_val])
test_dataset = Dataset.from_dict(dataset[split_index_val:])

# def preprocess_function(examples):
#     return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

def preprocess_function(examples: List[str]) -> Dict[str, torch.Tensor]:
    inputs = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)
    
    # Create labels and set padding tokens in labels to -100 to ignore them in the loss
    inputs['labels'] = inputs['input_ids'].copy()
    inputs['labels'] = [[-100 if token == tokenizer.pad_token_id else token for token in label] for label in inputs['labels']]
    
    return inputs

# tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["text"])
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=["text"])
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=["text"])
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=["text"])

# Define training arguments
training_args = TrainingArguments(
    output_dir='./resultsGPT2b',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logsGPT2b',
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",        # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Load the best model when done
    learning_rate=5e-5,  # Lower learning rate
    max_grad_norm=1.0,   # Prevent exploding gradients
    logging_steps=500,
    save_total_limit=1,  # Keep only the best model checkpoint
)

# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=10,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,  # Adjust as needed
#     logging_dir='./logs',
#     eval_strategy="epoch",  # Evaluate at the end of each epoch
#     save_strategy="epoch",        # Save the model at the end of each epoch
#     load_best_model_at_end=True,  # Load the best model when done
#     learning_rate=5e-5,  # Lower learning rate
#     max_grad_norm=1.0,   # Prevent exploding gradients
#     logging_steps=500,
#     save_total_limit=1,  # Keep only the best model checkpoint
# )

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    # train_dataset=tokenized_datasets,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine-tuned-gpt2')
tokenizer.save_pretrained('./fine-tuned-gpt2')

# Evaluate the model on the test set
trainer.evaluate(eval_dataset=tokenized_test_dataset)

2024-09-13 12:14:00.456652: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-13 12:14:00.564564: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-13 12:14:00.976011: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-09-13 12:14:00.976137: W tensorflow/compiler/xla/stream_executor/platform/default/dso_lo

Map:   0%|          | 0/40836 [00:00<?, ? examples/s]

Map:   0%|          | 0/8751 [00:00<?, ? examples/s]

Map:   0%|          | 0/8751 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,3.3633,3.259553
2,3.0313,3.158909
3,2.8012,3.148474


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


{'eval_loss': 3.168238401412964,
 'eval_runtime': 735.9012,
 'eval_samples_per_second': 11.892,
 'eval_steps_per_second': 1.487,
 'epoch': 3.0}

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-gpt2')

# Set the padding token to the EOS token
tokenizer.pad_token = tokenizer.eos_token

# Define additional training arguments
training_args = TrainingArguments(
    output_dir='./resultsGPT2c',
    num_train_epochs=10,  # Continue with additional epochs
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logsGPT2c',
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    max_grad_norm=1.0,
    logging_steps=500,
    save_total_limit=1,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

# Continue fine-tuning the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine-tuned-gpt2-continued')
tokenizer.save_pretrained('./fine-tuned-gpt2-continued')


Epoch,Training Loss,Validation Loss
1,2.6251,3.140254
2,2.4462,3.087585
3,2.2909,3.063362
4,2.076,3.045647
5,1.8923,3.125189
6,1.7339,3.129887
7,1.573,3.179892
8,1.503,3.188739
9,1.4249,3.21861
10,1.3843,3.250127


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


('./fine-tuned-gpt2-continued/tokenizer_config.json',
 './fine-tuned-gpt2-continued/special_tokens_map.json',
 './fine-tuned-gpt2-continued/vocab.json',
 './fine-tuned-gpt2-continued/merges.txt',
 './fine-tuned-gpt2-continued/added_tokens.json')

In [3]:
trainer.evaluate(eval_dataset=tokenized_test_dataset)

{'eval_loss': 3.064765691757202,
 'eval_runtime': 441.2436,
 'eval_samples_per_second': 19.833,
 'eval_steps_per_second': 2.479,
 'epoch': 10.0}

In [5]:
# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-gpt2-continued')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-gpt2-continued')

def generate_response(user_input):
    inputs = tokenizer.encode(user_input, return_tensors='pt')
    outputs = model.generate(inputs, max_length=150, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return response

# Example usage
user_input = "What is the meaning of life?"
response_text = generate_response(user_input)
print(response_text)

What is the meaning of life? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of God? The meaning of


In [6]:
user_input = "What is the nature of God?"
response_text = generate_response(user_input)
print(response_text)

What is the nature of God? Or is it just as good as the nature of man? Or is it just as bad? Or is it just as immoral? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as stupid? Or is it just as


In [None]:
def text_to_speech(text, filename='response.mp3'):
    tts = gTTS(text=text, lang='en')
    tts.save(filename)
    os.system(f"mpg321 {filename}")
    
text_to_speech(response_text)

# Implement Chatbot

In [17]:
from transformers import BertTokenizer, BertForSequenceClassification, BertLMHeadModel

# Load the fine-tuned model and tokenizer
# model = BertForSequenceClassification.from_pretrained('./fine-tuned-bert2')
model = BertLMHeadModel.from_pretrained('./fine-tuned-bert2')
tokenizer = BertTokenizer.from_pretrained('./fine-tuned-bert2')

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


In [10]:
def preprocess_input(user_input):
    inputs = tokenizer(user_input, return_tensors='pt', padding=True, truncation=True, max_length=512)
    return inputs

In [14]:
import torch

def generate_response(user_input):
    inputs = preprocess_input(user_input)
    with torch.no_grad():
        outputs = model(**inputs)
    print(outputs)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()
    return predicted_class_id

In [12]:
# Example mapping of class IDs to responses
response_map = {
    0: "Response for class 0",
    1: "Response for class 1",
    # Add more mappings as needed
}

def get_response_text(predicted_class_id):
    return response_map.get(predicted_class_id, "Sorry, I don't understand.")

In [15]:
def chat_bot_response(user_input):
    predicted_class_id = generate_response(user_input)
    response_text = get_response_text(predicted_class_id)
    return response_text

# Example usage
user_input = "What is the meaning of life?"
response = chat_bot_response(user_input)
print(response)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0695, -0.1936]]), hidden_states=None, attentions=None)
Response for class 0


In [16]:
def generate_response(user_input):
    inputs = tokenizer.encode(user_input, return_tensors='pt')
    outputs = model.generate(inputs, max_length=150, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example usage
user_input = "What is the meaning of life?"
response_text = generate_response(user_input)
print(response_text)

TypeError: The current model class (BertForSequenceClassification) is not compatible with `.generate()`, as it doesn't have a language model head. Please use one of the following classes instead: {'BertLMHeadModel'}

# Text-to-Speech Synthesis

# Combine Audio and Play