# Collect and Preprocess Data

In [1]:
import os

# Gather the path of every MP3 found anywhere beneath the working directory.
mp3_file_list = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('.')
    for name in names
    if name.endswith('.mp3')
]

# Report how many recordings were found, then preview the first ten paths.
print(len(mp3_file_list))

for preview_path in mp3_file_list[:10]:
    print(preview_path)

395
./data/Alan Watts HUGE Collection/Zencast92.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 03 - The in defines the out defines the in.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 02 - Escaping the tangle.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 07 - Seeing past the illusion.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 06 - Answering the koan.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 04 - The Japanese Zen monastery.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 05 - Entering the temple.mp3
./data/Alan Watts HUGE Collection/09 Philosophy of the Tao/Alan Wat

In [2]:
import os

# Pick the recording that occupies the fewest bytes on disk; it makes a cheap
# candidate for a first trial transcription.
smallest_file = min(mp3_file_list, key=lambda path: os.path.getsize(path))
print(smallest_file)

./data/Alan Watts HUGE Collection/Alan Watts - Still The Mind - Introduction To Meditation/alan watts - introduction to meditation - 9.mp3


In [3]:
import os

# Sweep the working tree and delete every file carrying the targeted suffix.
# The original cell kept '.ogg' and '.png' as commented-out alternatives, so it
# was presumably re-run once per unwanted file type.
for folder, _subdirs, names in os.walk('.'):
    unwanted = [name for name in names if name.endswith('.afpk')]
    for name in unwanted:
        os.remove(os.path.join(folder, name))

# Transcribe Recordings

In [6]:
import whisper

# Load the "medium" Whisper checkpoint (the progress bar in the cell output
# suggests the weights are fetched on first use).
model = whisper.load_model("medium")

# Trial run on a single recording before committing to the whole collection.
sample_path = "./data/Alan Watts HUGE Collection/Alan Watts - Still The Mind - Introduction To Meditation/alan watts - introduction to meditation - 8.mp3"
result = model.transcribe(sample_path)

# Put each sentence on its own line by breaking after '. ', '? ' and '! '.
transcript = result["text"]
for mark in ('.', '?', '!'):
    transcript = transcript.replace(mark + ' ', mark + '\n')
print(transcript)

100%|█████████████████████████████████████| 1.42G/1.42G [01:49<00:00, 13.9MiB/s]


 Now, can you actually hear anyone who is listening?
Can you hear any difference between all these sounds on the one hand and yourself on the other?
Now, when you were about to absorb into the sound, where were you?
This would be called a state of consciousness, where we have a primitive form of samadhi.
That is to say, we are happily absorbed in what we are doing, and we have forgotten about ourselves.
You can't very well do that and worry, or think anything serious.
And you'll notice that there's a special way of doing it, because, I mean, we can go crazy, and we can do kind of the wild Indian chants.
But in this, you are sort of straining too much, as a rule.
If you keep it down to a soft thing, like this, and get that flating feeling of the voice, if you instantly feel any sound is uncomfortable, avoid it.
Slip down if you're going too high, slip up if you're getting too low.
If your voice tends to change, follow its change.
So that you're just swinging along with it.
This is the p

In [8]:
# Transcribe the whole collection into one text file. Every transcript is
# preceded by three newlines, and sentences are broken onto separate lines.
with open("data/output.txt", "w") as transcript_file:
    for mp3_file in mp3_file_list:
        # Echo the path so progress is visible while the model runs.
        print(mp3_file)

        result = model.transcribe(mp3_file)

        # Break after '. ', '? ' and '! ' so each sentence gets its own line.
        text = result["text"]
        for mark in ('.', '?', '!'):
            text = text.replace(mark + ' ', mark + '\n')

        transcript_file.write("\n\n\n" + text)

./data/Alan Watts HUGE Collection/Zencast92.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 03 - The in defines the out defines the in.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 02 - Escaping the tangle.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 07 - Seeing past the illusion.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 06 - Answering the koan.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 04 - The Japanese Zen monastery.mp3
./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD08 - The World Just So - Part 2/Alan Watts - 05 - Entering the temple.mp3
./data/Alan Watts HUGE Collection/09 Philosophy of the Tao/Alan Watts  

## Reprocess Bad Transcripts

In [7]:
import whisper
from tqdm.auto import tqdm

# Switch to the "large" checkpoint for the recordings singled out below.
model = whisper.load_model("large")

# Recordings to transcribe again with the larger model.
bad_files_list = [
    "./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD06 - The Inevitable Ecstacy - Part 2/Alan Watts - 06 - This is the game.mp3",
    "./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD06 - The Inevitable Ecstacy - Part 2/Alan Watts - 04 - The illusion of the ego.mp3",
    "./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD06 - The Inevitable Ecstacy - Part 2/Alan Watts - 05 - The meaningless life.mp3",
    "./data/Alan Watts HUGE Collection/Misc Unsorted/Alan Watts  - Tambouras for meditation.mp3",
    "./data/Alan Watts HUGE Collection/Misc Unsorted/Alan Watts  - 0 Meditation 2.mp3",
    "./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD09 - The World As Self - Part 1/Alan Watts - 07 - Rules of the game.mp3",
    "./data/Alan Watts HUGE Collection/Alan Watts - Out Of Your Mind - CD09 - The World As Self - Part 1/Alan Watts - 06 - The rhythmic dance.mp3",
    "./data/Alan Watts HUGE Collection/Alan Watts - Strange Prayers/7 the end.mp3",
]

# Write the new transcripts back to back, each preceded by three newlines and
# with sentences broken onto separate lines.
with open('data/output_cleaned.txt', 'w') as cleaned_file:
    for bad_file in tqdm(bad_files_list):
        print(bad_file)
        result = model.transcribe(bad_file)

        text = result["text"]
        for mark in ('.', '?', '!'):
            text = text.replace(mark + ' ', mark + '\n')

        cleaned_file.write("\n\n\n" + text)


 82%|██████████████████████████████▍      | 2.36G/2.88G [03:09<01:05, 8.45MiB/s]

## Get Starting Line Numbers for each Transcription

In [28]:
# Build data/file_starts.txt: one entry per detected transcript boundary in
# data/output.txt, pairing a running index and a line number with a filename.

# Read the full transcript text into memory.
with open("data/output.txt", "r") as f:
    output_contents = f.read()

# Read the filenames, one per line (readlines() keeps each trailing newline).
# NOTE(review): no cell visible in this notebook writes data/mp3_file_list.txt;
# presumably it was saved separately in transcription order - confirm.
with open("data/mp3_file_list.txt", "r") as f:
    filenames = f.readlines()

# Write one "<index>) <line number> - <filename>" entry per boundary.
with open("data/file_starts.txt", "w") as f:
    newline_count = 0  # empty lines seen since the last entry was written
    file_count = 0     # entries written so far; also the index into filenames
    for linenum, line in enumerate(output_contents.split('\n')):
        # Skip the very first line (the transcription loop writes three
        # newlines before every transcript, so the file opens with blank lines).
        if linenum == 0:
            continue
        if line == '':
            newline_count += 1
            # A second empty line is treated as the boundary before a transcript.
            # NOTE(review): newline_count is only reset after an entry is written,
            # never on a non-empty line, so the two empty lines need not be adjacent.
            if newline_count == 2:
                # The entry ends with the newline carried by filenames[file_count].
                # filenames[file_count] raises IndexError if more boundaries are
                # found than there are filenames.
                # NOTE(review): the first entry uses linenum + 2 while later ones
                # use linenum - 10; the reason for the differing offsets is not
                # evident from this code - confirm against the actual output.txt.
                if file_count == 0:
                    f.write(f"{file_count + 1}) {linenum + 2} - {filenames[file_count]}")
                else:
                    f.write(f"{file_count + 1}) {linenum - 10} - {filenames[file_count]}")
                file_count += 1
                newline_count = 0

        # Earlier experiments, left disabled:
        # if line.startswith(' '):
        #     print(linenum)
        #     print(line)
        #     file_count += 1
        #     if file_count == 4:
        #         break
        #     # f.write(f"{file_count + 1}) {linenum + 1} - {filenames[file_count]}")
        #     # file_count += 1
    #     if line == '':
    #         newline_count += 1
    # print(linenum)
    # print(newline_count)
    # print(395 * 2 + 2)

In [1]:
import nltk
import string
# Make sure the NLTK resources used below are available locally.
for resource in ('words', 'punkt', 'punkt_tab'):
    nltk.download(resource)

from nltk.corpus import words
from nltk.tokenize import word_tokenize

# Vocabulary for the dictionary checks, held in a set for membership tests.
corpus_vocabulary = words.words()
english_words = set(corpus_vocabulary)

def are_all_words_english(text, vocabulary=None):
    """Return True when every word in ``text`` is found in the vocabulary.

    The text is split on whitespace. Leading and trailing punctuation is
    stripped from each token before the lookup, so "Hello, world." is checked
    as "hello" and "world" rather than failing on the attached comma and
    period. Tokens made up solely of punctuation are ignored.

    Args:
        text: The string to check.
        vocabulary: Optional container of known words; each token is
            lowercased before the membership test. Defaults to the
            module-level ``english_words``.

    Returns:
        bool: True if every remaining word is in the vocabulary (vacuously
        True when there are no words), otherwise False.
    """
    if vocabulary is None:
        vocabulary = english_words

    # Strip surrounding punctuation only; characters inside a word are kept.
    stripped_tokens = (token.strip(string.punctuation) for token in text.split())

    return all(word.lower() in vocabulary for word in stripped_tokens if word)

# def contains_non_english_words(line):
#     # Tokenize the line into words, ignoring punctuation
#     tokens = word_tokenize(line)
    
#     # Filter out punctuation
#     words_in_line = [word for word in tokens if word.isalpha()]
    
#     # Check if all words are in the English dictionary
#     return any(word.lower() not in english_words for word in words_in_line)

def contains_non_english_words(line):
    """Report whether ``line`` has any alphabetic token missing from ``english_words``.

    The line is tokenized with ``word_tokenize``; tokens that are not purely
    alphabetic are skipped. When unknown words are found they are printed and
    True is returned; otherwise False is returned.
    """
    unknown = []
    for token in word_tokenize(line):
        # Only purely alphabetic tokens are looked up, case-insensitively.
        if token.isalpha() and token.lower() not in english_words:
            unknown.append(token)

    if not unknown:
        return False

    print(f"Non-English words in line: {unknown}")
    return True

def filter_non_english_lines(text_lines):
    """Return the lines, in their original order, that contain_non_english_words flags.

    Each line is passed to ``contains_non_english_words``, which also prints
    the offending words for every line it flags.
    """
    return [candidate for candidate in text_lines if contains_non_english_words(candidate)]

[nltk_data] Downloading package words to
[nltk_data]     /home/peacelovephysics/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/peacelovephysics/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/peacelovephysics/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
import enchant

# US-English dictionary provided by the enchant library.
english_dict = enchant.Dict("en_US")

def contains_non_english_words(line):
    """Report whether ``line`` has any alphabetic token rejected by ``english_dict``.

    This redefines the function of the same name from the earlier NLTK-based
    cell. Tokens come from ``word_tokenize``; tokens that are not purely
    alphabetic are skipped. Rejected words are printed and True is returned;
    otherwise False is returned.
    """
    rejected = []
    for token in word_tokenize(line):
        if not token.isalpha():
            continue
        if english_dict.check(token):
            continue
        rejected.append(token)

    if not rejected:
        return False

    print(f"Non-English words in line: {rejected}")
    return True

ImportError: The 'enchant' C library was not found and maybe needs to be installed.
See  https://pyenchant.github.io/pyenchant/install.html
for details


In [5]:
# Check that the opening line of every transcript is English.
# A line is treated as an opening line when it starts with a space (the sample
# Whisper output shown earlier in the notebook begins with one).
with open("data/output.txt", 'r') as f:
    output_contents = f.read()

# Load the word list as a set so each lookup is a hash probe rather than a
# linear scan of the whole list.
# NOTE: this rebinds english_words, which the helper functions defined in an
# earlier cell also read.
with open("data/engmix.txt", 'r') as f:
    english_words = set(f.read().split('\n'))

# Translation table that deletes every ASCII punctuation character.
strip_punctuation = str.maketrans('', '', string.punctuation)

for linenum, line in enumerate(output_contents.split('\n')):
    if line.startswith(' '):
        # Words whose punctuation-free, lowercased form is not in the word list.
        bad_words = [
            word
            for word in (raw.translate(strip_punctuation) for raw in line.split())
            if word.lower() not in english_words
        ]
        if bad_words:
            # linenum is the 0-based index of the line within output.txt.
            print(f"{linenum}) {bad_words}")

# lines = output_contents.split('\n')

# # Find lines with non-English words
# non_english_lines = filter_non_english_lines(lines)

# # Output the results
# for line in non_english_lines:
#     print(f"Non-English line: {line}")

812) ['sanzen']
901) ['Im']
962) ['Roshi']
1210) ['aliveness', 'thats', 'Whats']
1226) ['socalled']
1249) ['isnt']
1347) ['Yn', 'yr', 'hyn', 'rydym', 'yn', 'ei', 'wneud', 'yma', 'rydyn', 'nin', 'ir', 'ffeil', 'dyrwodol', 'syn', 'ddim', 'yn', 'organig', 'neu', 'mecanig']
1417) ['nittygritty', 'weve']
1487) ['gallwn', 'ni', 'peth', 'cyntaf', 'yw', 'gofn', 'yn', 'fictitio', 'gofn', 'yw', 'cymdeithas', 'neu', 'ffyrdd', 'hunain', 'chyflwynor', 'ystradd', 'ymgysylltuol', 'yn', 'ymgysylltuol', 'amser', 'wneud', 'cymdeithas', 'yn', 'effeithiol', 'gysylltu', 'ymgyrch', 'gysylltu', 'gyrch', 'gysylltuaur', 'amser', 'amser']
1624) ['Im', 'hocuspocus']
1644) ['Clio', 'fel', 'ydych', 'cael', 'fyny', 'llwyth', 'maen', 'llwyth', 'yn', 'ymwneud', 'âr', 'llwyth', 'ddyn']
1706) ['Im', 'Im']
1968) ['Im', 'koto']
2256) ['nonexistence']
3178) ['youve', 'Koyasan', 'Vajrayana', 'Mahayana', 'havent', 'theyre']
3709) ['Bodhisattva', 'Mahayana', 'antiworldliness', 'nonBuddha']
3716) ['reemphasizing', 'dont', 'do

In [None]:
# Draft of the cleaning pass: stream data/output.txt into data/output_cleaned.txt.
# The input handle was originally bound to `in`, a reserved word, and the inner
# `with` had no body, so the cell could not be parsed.
# NOTE(review): mode 'w' replaces any existing data/output_cleaned.txt, which
# the "Reprocess Bad Transcripts" cell also writes - confirm the intended target.
with open('data/output.txt', 'r') as src, open('data/output_cleaned.txt', 'w') as out:
    for line in src:
        # TODO: apply the actual cleaning rules; lines are copied through unchanged for now.
        out.write(line)

### TODO: Incorporate Written Works

# Train Language Model

# Implement Chatbot

# Text-to-Speech Synthesis

# Combine Audio and Play