In [None]:
def load_text(filename):
    """
    Loads the content of a text file with the specified filename.
    
    Args:
    - filename (str): The path to the text file to be loaded.
    
    Returns:
    - str: The content of the file.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

# Use the function to load the text from the file
file_path = 'pride_and_prejudice.txt'
loaded_text = load_text(file_path)

# Check the length of the loaded text and print a small portion to confirm it's loaded correctly
len(loaded_text), loaded_text[:1000]


(726868,
 '\n_Walt Whitman has somewhere a fine and just distinction between “loving\nby allowance” and “loving with personal love.” This distinction applies\nto books as well as to men and women; and in the case of the not very\nnumerous authors who are the objects of the personal affection, it\nbrings a curious consequence with it. There is much more difference as\nto their best work than in the case of those others who are loved “by\nallowance” by convention, and because it is felt to be the right and\nproper thing to love them. And in the sect--fairly large and yet\nunusually choice--of Austenians or Janites, there would probably be\nfound partisans of the claim to primacy of almost every one of the\nnovels. To some the delightful freshness and humour of_ Northanger\nAbbey, _its completeness, finish, and_ entrain, _obscure the undoubted\ncritical facts that its scale is small, and its scheme, after all, that\nof burlesque or parody, a kind in which the first rank is reached with\nd

In [None]:
def replace_chars(text, chars_to_replace, replacement_char=' '):
    """
    Replaces specified characters in a given text with a replacement character.
    
    
    Returns:
    - str: The processed text with specified characters replaced.
    """
    for char in chars_to_replace:
        text = text.replace(char, replacement_char)
    return text

# Example usage of the function
example_text = "Hello, World! 123."
chars_to_replace = ",.!"
# Replacing specified characters with a space
processed_text = replace_chars(example_text, chars_to_replace, ' ')
processed_text


'Hello  World  123 '

In [None]:
def preprocess_text(text, chars_to_replace):
    """
    Preprocesses the given text by replacing specified characters with spaces,
    converting to lower case, and splitting into a list of words.
    
    
    Returns:
    - list: A list of words from the processed text.
    """
    # Replace specified characters with spaces
    processed_text = replace_chars(text, chars_to_replace)
    # Convert text to lower case
    processed_text = processed_text.lower()
    # Split text into words
    words = processed_text.split()
    return words

# Characters to replace with spaces (standard punctuation and numbers)
chars_to_replace = ".,!?:;\"'-()[]{}<>@#$%^&*—1234567890\n"

# Preprocess the loaded text to get a list of words
words_list = preprocess_text(loaded_text, chars_to_replace)

# Check the first 20 words to confirm the process
words_list


['_walt',
 'whitman',
 'has',
 'somewhere',
 'a',
 'fine',
 'and',
 'just',
 'distinction',
 'between',
 '“loving',
 'by',
 'allowance”',
 'and',
 '“loving',
 'with',
 'personal',
 'love',
 '”',
 'this',
 'distinction',
 'applies',
 'to',
 'books',
 'as',
 'well',
 'as',
 'to',
 'men',
 'and',
 'women',
 'and',
 'in',
 'the',
 'case',
 'of',
 'the',
 'not',
 'very',
 'numerous',
 'authors',
 'who',
 'are',
 'the',
 'objects',
 'of',
 'the',
 'personal',
 'affection',
 'it',
 'brings',
 'a',
 'curious',
 'consequence',
 'with',
 'it',
 'there',
 'is',
 'much',
 'more',
 'difference',
 'as',
 'to',
 'their',
 'best',
 'work',
 'than',
 'in',
 'the',
 'case',
 'of',
 'those',
 'others',
 'who',
 'are',
 'loved',
 '“by',
 'allowance”',
 'by',
 'convention',
 'and',
 'because',
 'it',
 'is',
 'felt',
 'to',
 'be',
 'the',
 'right',
 'and',
 'proper',
 'thing',
 'to',
 'love',
 'them',
 'and',
 'in',
 'the',
 'sect',
 'fairly',
 'large',
 'and',
 'yet',
 'unusually',
 'choice',
 'of',
 'aust

In [None]:
def count_word_frequency(words_list):
    """
    Counts the frequency of each word in the given list of words.
    
    
    Returns:
    - dict: A dictionary with words as keys and their frequency counts as values.
    """
    word_frequency = {}
    for word in words_list:
        if word in word_frequency:
            word_frequency[word] += 1
        else:
            word_frequency[word] = 1
    return word_frequency

# Count the frequency of each word in the preprocessed list of words
word_frequency_dict = count_word_frequency(words_list)

# To display a portion of the dictionary, let's show the first 10 items
list(word_frequency_dict.items())


[('_walt', 1),
 ('whitman', 1),
 ('has', 248),
 ('somewhere', 6),
 ('a', 2020),
 ('fine', 33),
 ('and', 3664),
 ('just', 77),
 ('distinction', 7),
 ('between', 67),
 ('“loving', 2),
 ('by', 687),
 ('allowance”', 2),
 ('with', 1094),
 ('personal', 6),
 ('love', 101),
 ('”', 1759),
 ('this', 435),
 ('applies', 2),
 ('to', 4300),
 ('books', 14),
 ('as', 1213),
 ('well', 210),
 ('men', 39),
 ('women', 22),
 ('in', 1976),
 ('the', 4622),
 ('case', 35),
 ('of', 3830),
 ('not', 1480),
 ('very', 492),
 ('numerous', 4),
 ('authors', 1),
 ('who', 293),
 ('are', 358),
 ('objects', 8),
 ('affection', 61),
 ('it', 1523),
 ('brings', 3),
 ('curious', 6),
 ('consequence', 32),
 ('there', 348),
 ('is', 900),
 ('much', 334),
 ('more', 334),
 ('difference', 13),
 ('their', 451),
 ('best', 46),
 ('work', 22),
 ('than', 298),
 ('those', 64),
 ('others', 61),
 ('loved', 11),
 ('“by', 9),
 ('convention', 2),
 ('because', 60),
 ('felt', 105),
 ('be', 1262),
 ('right', 43),
 ('proper', 22),
 ('thing', 30),
 (

In [None]:
def print_top_words(word_frequency_dict, n=10):
    """
    Gets the top n words by frequency from the word frequency dictionary.
    
    
    Returns:
    - list: A list of tuples containing the top n words and their frequencies, sorted by frequency in descending order.
    """
    # Convert dictionary to list of tuples and sort by frequency in descending order
    sorted_word_freq = sorted(word_frequency_dict.items(), key=lambda item: item[1], reverse=True)
    return sorted_word_freq[:n]

# Get the top 10 words by frequency
top_words = print_top_words(word_frequency_dict, 10)

# Display the result in a table-like format
top_words_table = f"{'Word':<20}{'Frequency':<10}\n" + "\n".join([f"{word:<20}{count:<10}" for word, count in top_words])
print(top_words_table)


Word                Frequency 
the                 4622      
to                  4300      
of                  3830      
and                 3664      
her                 2258      
a                   2020      
in                  1976      
was                 1867      
i                   1792      
”                   1759      


In [None]:
text = load_text('pride_and_prejudice.txt')
words = preprocess_text(text, chars_to_replace)
word_count = count_word_frequency(words)
print_top_words(word_count, 50)

[('the', 4622),
 ('to', 4300),
 ('of', 3830),
 ('and', 3664),
 ('her', 2258),
 ('a', 2020),
 ('in', 1976),
 ('was', 1867),
 ('i', 1792),
 ('”', 1759),
 ('she', 1710),
 ('that', 1546),
 ('it', 1523),
 ('not', 1480),
 ('he', 1315),
 ('his', 1278),
 ('be', 1262),
 ('you', 1247),
 ('as', 1213),
 ('had', 1179),
 ('with', 1094),
 ('for', 1075),
 ('but', 951),
 ('is', 900),
 ('have', 872),
 ('at', 810),
 ('mr', 791),
 ('him', 763),
 ('on', 728),
 ('by', 687),
 ('my', 657),
 ('all', 641),
 ('elizabeth', 604),
 ('so', 601),
 ('they', 592),
 ('were', 567),
 ('which', 563),
 ('been', 534),
 ('could', 528),
 ('from', 504),
 ('very', 492),
 ('would', 480),
 ('no', 477),
 ('what', 454),
 ('their', 451),
 ('me', 441),
 ('this', 435),
 ('them', 435),
 ('your', 416),
 ('will', 415)]

In [None]:
def create_bigrams(text):
    bigrams = {}
    # clean up text - remove punctuation (except full stop) and make all lower case
    text = text.lower().replace('\n', ' ')
    chars_to_remove = ",!?:;\"'-()[]{}<>@#$%^&*—1234567890"
    for char in chars_to_remove:
        text = text.replace(char, ' ')
    text = text.replace(' .', '.')
    
    # split into sentences - and add the full stop back to the end of each sentence after splitting
    sentences = [sentence.strip() + '.' for sentence in text.split('.') if sentence]
    
    # iterate each word in the sentence
    for sentence in sentences:
        words = sentence.split()
        for i in range(len(words) - 1):
            current_word = words[i]
            next_word = words[i + 1]
            if current_word not in bigrams:
                bigrams[current_word] = {}
            if next_word not in bigrams[current_word]:
                bigrams[current_word][next_word] = 1
            else:
                bigrams[current_word][next_word] += 1
    
    # convert each word's dictionary to an ordered list of tuples
    for word in bigrams:
        bigrams[word] = sorted(bigrams[word].items(), key=lambda item: item[1], reverse=True)
    
    return bigrams

# Apply the function to create bigrams from the loaded text
bigrams_dict = create_bigrams(loaded_text)

# Display a portion of the bigrams dictionary to confirm
list(bigrams_dict.items())


[('_walt', [('whitman', 1)]),
 ('whitman', [('has', 1)]),
 ('has',
  [('been', 56),
   ('not', 18),
   ('no', 9),
   ('she', 6),
   ('made', 5),
   ('done', 5),
   ('ever', 4),
   ('taken', 4),
   ('a', 4),
   ('the', 4),
   ('never', 3),
   ('given', 3),
   ('always', 3),
   ('often', 3),
   ('happened', 3),
   ('got', 3),
   ('just', 2),
   ('of', 2),
   ('two', 2),
   ('since', 2),
   ('nothing', 2),
   ('any', 2),
   ('deserved', 2),
   ('received', 2),
   ('lived', 2),
   ('neither', 2),
   ('done.', 2),
   ('he', 2),
   ('now', 2),
   ('passed', 2),
   ('in', 2),
   ('somewhere', 1),
   ('devotees', 1),
   ('almost', 1),
   ('shown', 1),
   ('knowledge', 1),
   ('truly', 1),
   ('justly', 1),
   ('put', 1),
   ('probably', 1),
   ('sometimes', 1),
   ('by', 1),
   ('five', 1),
   ('something', 1),
   ('promised', 1),
   ('known', 1),
   ('rather', 1),
   ('without', 1),
   ('merely', 1),
   ('preferred', 1),
   ('one', 1),
   ('prevented', 1),
   ('deprived', 1),
   ('connected',

In [None]:
from random import randint

WR = 15
first_word = input("Enter the first word: ").lower()
sentence = [first_word]

while sentence[-1] != '.':
    options = bigrams_dict.get(sentence[-1], [])
    if not options:  # If there are no options, possibly end the sentence or handle differently
        break
    # Selecting the next word with a bias towards higher frequency words but limited by WR
    sel = randint(0, len(options)-1) if len(options) < WR else randint(0, WR-1)
    next_word = options[sel][0]  # Corrected variable name from 'next' to 'next_word' for clarity
    
    sentence.append(next_word)

print(' '.join(sentence))


go for_ pride if not but to me and he did at pemberley house that your mother should you know who can help laughing alarm.


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6cd00808-406a-4129-ab29-05b4bbe2f656' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>