In [27]:
import re
import nltk
from nltk import word_tokenize
from nltk.util import ngrams

with open("Tarzan.txt", "r", encoding="utf-8") as file:
    corpus = file.read()
clean_text = re.sub(r"[^\w\s]", "", corpus).replace('\ufeff', '')
token = word_tokenize(clean_text, language='english')
bigrams = list(ngrams(token, 2))
freq= nltk.FreqDist(bigrams)
top_10_pairs = freq.most_common(10)
print(top_10_pairs)


[(('of', 'the'), 958), (('in', 'the'), 347), (('to', 'the'), 306), (('Ibn', 'Jad'), 226), (('and', 'the'), 189), (('upon', 'the'), 153), (('from', 'the'), 149), (('he', 'had'), 135), (('of', 'his'), 133), (('that', 'he'), 121)]


In [80]:
import numpy as np
import pandas as pd

word_freq = nltk.FreqDist(token)
unique_words = list(set(token))
co_occurrence_matrix = np.zeros((len(unique_words), len(unique_words)), dtype=float)
for i, word1 in enumerate(unique_words):
    for j, word2 in enumerate(unique_words):
        if i == j:
            co_occurrence_matrix[i, j] = 0.01
        co_occurrence_matrix[i, j] = freq.get((word1, word2), 0.01)
co_occurrence_df = pd.DataFrame(co_occurrence_matrix, index=unique_words, columns=unique_words)

In [81]:
ten_words_co_occurrence_df_smoothed = co_occurrence_df.iloc[:5, :5]
print(ten_words_co_occurrence_df_smoothed)

             lances  wealth  novelty  wheresoever  void
lances         0.01    0.01     0.01         0.01  0.01
wealth         0.01    0.01     0.01         0.01  0.01
novelty        0.01    0.01     0.01         0.01  0.01
wheresoever    0.01    0.01     0.01         0.01  0.01
void           0.01    0.01     0.01         0.01  0.01


In [82]:
prob_co_occurrence_matrix = np.zeros((len(unique_words), len(unique_words)), dtype=float)

for i, word in enumerate(unique_words):
    word_total = co_occurrence_matrix[i, :].sum() + len(unique_words)
    prob_co_occurrence_matrix[i, :] = co_occurrence_matrix[i, :] / word_total

prob_co_occurrence_df = pd.DataFrame(prob_co_occurrence_matrix, index=unique_words, columns=unique_words)
ten_words_prob_co_occurrence_df = prob_co_occurrence_df.iloc[:5, :5]
print(ten_words_prob_co_occurrence_df)

               lances    wealth   novelty  wheresoever      void
lances       0.000001  0.000001  0.000001     0.000001  0.000001
wealth       0.000001  0.000001  0.000001     0.000001  0.000001
novelty      0.000001  0.000001  0.000001     0.000001  0.000001
wheresoever  0.000001  0.000001  0.000001     0.000001  0.000001
void         0.000001  0.000001  0.000001     0.000001  0.000001


In [83]:
next_word_after_i = prob_co_occurrence_df.loc['I'].idxmax()
highest_probability_after_i = prob_co_occurrence_df.loc['I'].max()
print(f'The word most likely to follow "I" is "{next_word_after_i}" with a probability of {highest_probability_after_i:.4f}')

next_word_after_want = prob_co_occurrence_df.loc['want'].idxmax()
highest_probability_after_want = prob_co_occurrence_df.loc['want'].max()
print(f'The word most likely to follow "want" is "{next_word_after_want}" with a probability of {highest_probability_after_want:.4f}')

The word most likely to follow "I" is "am" with a probability of 0.0086
The word most likely to follow "want" is "to" with a probability of 0.0014


In [91]:
import random
sentence = "For half a day he lolled on the huge back and"
sentence_tokens = word_tokenize(sentence.lower())
for _ in range(10):
    last_word = sentence_tokens[-1]
    if last_word in prob_co_occurrence_df.index:
        next_word_probs = prob_co_occurrence_df.loc[last_word]
        top_next_words = next_word_probs.nlargest(10)
        next_word = random.choice(top_next_words.index.tolist())
        sentence_tokens.append(next_word)
    else:
        break
extended_sentence = ' '.join(sentence_tokens)

print("Generated sentence (with random choice from top 10):", extended_sentence)

Generated sentence (with random choice from top 10): for half a day he lolled on the huge back and so they saw his sword was to them that it


In [93]:
import random
sentence = "For half a day he lolled on the huge back and"
sentence_tokens = sentence.lower().split()
for _ in range(10):
    last_word = sentence_tokens[-1]
    if last_word in prob_co_occurrence_df.index:
        next_word_probs = prob_co_occurrence_df.loc[last_word]
        next_word_probs /= next_word_probs.sum()
        potential_next_words = next_word_probs.index.tolist()
        probabilities = next_word_probs.values
        random_number = random.random()
        cumulative_probability = 0.0
        for i, probability in enumerate(probabilities):
            cumulative_probability += probability
            if random_number < cumulative_probability:
                next_word = potential_next_words[i]
                break
        sentence_tokens.append(next_word)
    else:
        break
    sentence = ' '.join(sentence_tokens)
print("Generated sentence:", sentence)

Generated sentence: for half a day he lolled on the huge back and it was gone battle space hat jest board stout patch
