In [3]:
import nltk
import math
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tag import pos_tag

# Decide whether the preposition "with" attaches to the verb "fill" or to the
# noun "traffic", using bigram counts from corpus.txt and a log-likelihood
# ratio (lambda). lambda > 0 selects the verb, anything else selects the noun.


def _conditional_probability(pair_count, head_count):
    """Return pair_count / head_count, or 0.0 when the head word never occurs.

    Guarding the zero case keeps the script from raising ZeroDivisionError
    on a corpus that does not contain the head word at all.
    """
    return pair_count / head_count if head_count else 0.0


def _attachment_lambda(prob_verb, prob_noun):
    """Return log2(prob_verb * (1 - prob_noun) / prob_noun).

    Degenerate inputs are mapped to well-defined values instead of raising:
      * prob_noun == 0 and prob_verb > 0  -> +inf (only verb evidence)
      * numerator == 0 and prob_noun > 0  -> -inf (only noun evidence)
      * both zero (no evidence at all)    -> 0.0, which the caller's
        ``lamda > 0`` test resolves to the noun default.
    """
    numerator = prob_verb * (1 - prob_noun)
    if prob_noun == 0:
        return math.inf if numerator > 0 else 0.0
    if numerator == 0:
        return -math.inf
    return math.log(numerator / prob_noun, 2)


# Open and read the corpus file
with open("corpus.txt", "r") as corpus_file:
    corpus_text = corpus_file.read()

# Tokenize the text
text_tokens = word_tokenize(corpus_text)
# Tag the tokens with their part-of-speech
tagged_tokens = pos_tag(text_tokens)

# Initialize lists to store nouns, verbs, and prepositions
nouns = []
verbs = []
prepositions = []

# Loop through tagged tokens to extract nouns, verbs, and prepositions
for token, tag in tagged_tokens:
    if tag.startswith('N'):  # Noun
        nouns.append(token)
    elif tag.startswith('V'):  # Verb
        verbs.append(token)
    elif tag == 'IN':  # Preposition
        prepositions.append(token)

# Initialize BigramCollocationFinder to find bigrams
bigram_finder = BigramCollocationFinder.from_words(text_tokens)
# Get all bigrams
bigrams = bigram_finder.ngram_fd.items()
# Get top bigrams by frequency
top_bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq, 10)

# Define desired bigrams for verb and noun attachment
desired_bigram_verb = ('fill', 'with')
desired_bigram_noun = ('traffic', 'with')

# Look the two bigrams up directly; the frequency distribution yields 0 for a
# bigram that never occurs, so no scan over every bigram is needed.
preposition_verb_count = bigram_finder.ngram_fd[desired_bigram_verb]
preposition_noun_count = bigram_finder.ngram_fd[desired_bigram_noun]

# Count the head words as *tokens*, the same unit the bigram counts use.
# Counting substrings of the raw text would also match "filled", "refill",
# "trafficking", ... and deflate the probabilities.
verb_occurrences = text_tokens.count(desired_bigram_verb[0])
noun_occurrences = text_tokens.count(desired_bigram_noun[0])

# Calculate probabilities: the share of noun / verb tokens that are
# immediately followed by the preposition.
prob_preposition_followed_by_noun = _conditional_probability(
    preposition_noun_count, noun_occurrences)
prob_preposition_followed_by_verb = _conditional_probability(
    preposition_verb_count, verb_occurrences)

# Print probabilities
print("Probability of preposition followed by noun:", prob_preposition_followed_by_noun)
print("Probability of preposition followed by verb:", prob_preposition_followed_by_verb)

# Calculate probability of preposition not followed by noun
prob_preposition_not_followed_by_noun = 1 - prob_preposition_followed_by_noun

# Calculate lambda (log2 likelihood ratio of verb vs. noun attachment)
lamda = _attachment_lambda(prob_preposition_followed_by_verb,
                           prob_preposition_followed_by_noun)
print('Lambda is', lamda)

# Decide attachment with verb or noun based on lambda
if lamda > 0:
    print('Attached with verb')
else:
    print('Attached with noun')


Probability of preposition followed by noun: 0.16666666666666666
Probability of preposition followed by verb: 0.5
Lambda is 1.3219280948873626
Attached with verb
