## Math 3790 W01
## Yuge Xu 1194170
## NLP Method: Frequency, JKF, Mean-Variance

## Import Libraries

In [16]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from nltk import bigrams
import pandas as pd
import numpy as np
from nltk import pos_tag
from collections import Counter


## Load Data

In [17]:
# Load the whole corpus file into a single string.
# A sentence may be broken across physical lines in the file, so every
# newline is replaced by a space as soon as the content is read.
with open("Corpus_Country_History.txt", "r", encoding="utf-8") as file:
    text = file.read().replace('\n', ' ')


## Data Preprocessing Tokenization

In [18]:
# Sentence segmentation works on the full text, punctuation included.
sentences = sent_tokenize(text)
# Word tokens: keep only the purely alphabetic ones. This drops punctuation
# marks, and also numbers and any token that contains a non-letter character.
words = [token for token in word_tokenize(text) if token.isalpha()]
print(f"the corpus has total {len(sentences)} sentences")
print(f"the corpus has total {len(words)} words")

the corpus has total 37715 sentences
the corpus has total 755386 words


## Collocation Analysis Method 1. Frequency-Based Method

## Count the frequency of words & Count the frequency of bigrams

In [19]:
# --- Single-word frequencies -------------------------------------------
freq_dist = FreqDist(words)
most_common_words = freq_dist.most_common(10)
print(f"See the top 10 most common words {most_common_words}", end="\n\n")

# --- Bigram frequencies ------------------------------------------------
# Every pair of adjacent entries of `words` becomes one bigram.
bigram_list = list(bigrams(words))
print(f"the length of bigramlist = {len(bigram_list)}", end="\n\n")

# NOTE(review): `freq_dist` is rebound here; from this point on it counts
# bigrams, not single words.
freq_dist = FreqDist(bigram_list)
most_common_bigrams = freq_dist.most_common(10)
print(f"See the top 10 most common bigrams {most_common_bigrams}", end="\n\n")

# --- Bigrams involving "world" -----------------------------------------
# The corpus is about world history, so look at the pairs in which either
# member is "world". The membership test ignores case, but the pairs are
# counted with their original casing, so ('the', 'world') and
# ('the', 'World') are tallied separately.
bigrams_contains_world = [
    pair
    for pair in bigram_list
    if any(token.lower() == "world" for token in pair)
]
bigram_counter = Counter(bigrams_contains_world)
# Ten most frequent "world" bigrams with their counts.
top_10_bigrams = bigram_counter.most_common(10)
print(f"find top 10 most frequent bigrams that contains 'world' with their occurrence number = {top_10_bigrams}")


See the top 10 most common words [('the', 53110), ('of', 30031), ('and', 20646), ('to', 20394), ('in', 17030), ('a', 13949), ('was', 9074), ('that', 7986), ('for', 5810), ('The', 5251)]

the length of bigramlist = 755385

See the top 10 most common bigrams [(('of', 'the'), 8827), (('in', 'the'), 4855), (('to', 'the'), 3009), (('and', 'the'), 1875), (('on', 'the'), 1677), (('by', 'the'), 1392), (('that', 'the'), 1321), (('for', 'the'), 1314), (('with', 'the'), 1226), (('from', 'the'), 1121)]

find top 10 most frequent bigrams that contains 'world' with their occurrence number = [(('the', 'world'), 373), (('World', 'War'), 164), (('New', 'World'), 55), (('the', 'World'), 46), (('world', 'and'), 36), (('Second', 'World'), 32), (('a', 'world'), 30), (('First', 'World'), 30), (('world', 'of'), 29), (('world', 'The'), 20)]


## Collocation Analysis Method 2. JKF (Justeson–Katz part-of-speech filter)

In [20]:
# Attach a part-of-speech tag to every token of the corpus.
pos_tags = pos_tag(words)
freq_dist = FreqDist(pos_tags)
# Ten most frequent (word, tag) combinations.
most_common_words = freq_dist.most_common(10)
print(f"See the top 10 most frequently tagged words = {most_common_words}")

# Pair up consecutive (word, tag) tuples so bigrams can be filtered by tag.
bigram_list = list(bigrams(pos_tags))
# Pattern kept here: adjective (tag starts with JJ) followed by a noun
# (tag starts with NN). One-letter words are discarded on either side.
JJ_NN_bigrams = [
    (first[0], second[0])
    for first, second in bigram_list
    if first[1].startswith('JJ')
    and second[1].startswith('NN')
    and len(first[0]) > 1
    and len(second[0]) > 1
]
print(f"We matched {len(JJ_NN_bigrams)} bigrams where the first word in bigram is adj, the second word in bigram is noun ")

# Ten most frequent adjective + noun pairs, one per line.
freq_dist = FreqDist(JJ_NN_bigrams)
for bigram, frequency in freq_dist.most_common(10):
    print(f"{bigram}: {frequency}")
print()

# Ten most frequent nouns: collect every noun-tagged token longer than one
# character, then count occurrences.
nouns = [token for token, tag in pos_tags if tag.startswith('NN') and len(token) > 1]
noun_freq = Counter(nouns)
top_nouns = noun_freq.most_common(10)
print(f"Get the top 10 most frequent nouns = {top_nouns}")

See the top 10 most frequently tagged words = [(('the', 'DT'), 53110), (('of', 'IN'), 30031), (('and', 'CC'), 20646), (('to', 'TO'), 20394), (('in', 'IN'), 17030), (('a', 'DT'), 13949), (('was', 'VBD'), 9074), (('for', 'IN'), 5810), (('that', 'IN'), 5438), (('The', 'DT'), 5251)]
We matched 37602 bigrams where the first word in bigram is adj, the second word in bigram is noun 
('nineteenth', 'century'): 170
('same', 'time'): 160
('twentieth', 'century'): 104
('other', 'hand'): 80
('white', 'man'): 66
('economic', 'growth'): 62
('white', 'men'): 56
('civil', 'rights'): 49
('first', 'time'): 47
('gold', 'standard'): 43

Get the top 10 most frequent nouns = [('people', 1505), ('war', 1192), ('time', 1168), ('New', 1133), ('King', 1001), ('men', 1000), ('government', 964), ('England', 960), ('years', 871), ('country', 781)]


## Collocation Analysis Method 3. Mean-Variance Method

## T1 = world, T2 = government: find the mean and variance of the distance between them

In [21]:
# Keyword pair under study.
keywords = ["world", "government"]
# Keep every sentence in which all keywords occur. The comparison is
# case-insensitive and works on plain substrings, so "governments" also
# counts as a hit for "government".
target_sentences = [
    candidate
    for candidate in sentences
    if all(term.lower() in candidate.lower() for term in keywords)
]
# Show each hit followed by an empty line, then the number of hits.
for sentence in target_sentences:
    print(sentence, end="\n\n")
print(f"matched target sentence = {len(target_sentences)}")

It then went on to list grievances against the king, “a history of repeated injuries and usurpations, all having in direct object the establishment of an absolute Tyranny over these States.” The list accused the king of dissolving colonial governments, controlling judges, sending “swarms of Officers to harass our people,” sending in armies of occupation, cutting off colonial trade  with other parts of the world, taxing the colonists without their consent, and waging war against them, “transporting large Armies of foreign Mercenaries to compleat the works of death, desolation and tyranny.” All this, the language of popular control over governments, the right of rebellion and revolution, indignation at political tyranny, economic burdens, and military attacks, was language well suited to unite large numbers of colonists, and persuade even those who had grievances against one another to turn against England.

This was a larger base of support for government than anywhere in the world at 

In [22]:
def calculate_keyword_distances(sentences, keywords, tokenizer=None):
    """Print mean, variance and standard deviation of the distance between two keywords.

    A sentence contributes one distance when both ``keywords[0]`` and
    ``keywords[1]`` occur in it as whole tokens (case-insensitively). That
    distance is the smallest absolute difference between a token index of
    the first keyword and a token index of the second. Punctuation tokens
    produced by the tokenizer count as positions.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to scan.
    keywords : sequence of str
        Keywords to look for. Every entry must occur in a sentence (as a
        substring) for the sentence to be considered; the distance itself is
        measured between the first two entries only.
    tokenizer : callable, optional
        Function mapping a (lower-cased) sentence to a list of tokens.
        Defaults to ``word_tokenize``.

    Returns
    -------
    None
        The statistics are printed, not returned. If no sentence matches,
        the three statistics are printed as ``nan``.
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    # All comparisons are case-insensitive.
    keywords = [keyword.lower() for keyword in keywords]
    first_keyword, second_keyword = keywords[0], keywords[1]

    # One entry per matched sentence: the minimum token distance.
    distances = []
    for sentence in sentences:
        lowered = sentence.lower()

        # Cheap substring pre-filter: skip tokenizing sentences that cannot match.
        if not all(keyword in lowered for keyword in keywords):
            continue

        tokens = tokenizer(lowered)
        first_positions = [i for i, token in enumerate(tokens) if token == first_keyword]
        second_positions = [i for i, token in enumerate(tokens) if token == second_keyword]

        # A substring hit is not necessarily a token hit (e.g. "governments"
        # contains "government"); such sentences are skipped explicitly
        # instead of relying on min() raising ValueError on an empty list.
        if not first_positions or not second_positions:
            continue

        distances.append(
            min(abs(i - j) for i in first_positions for j in second_positions)
        )

    size_of_matched_sentence = len(distances)
    if distances:
        mean_distance = np.mean(distances)
        variance = np.var(distances)  # population variance (numpy default, ddof=0)
        standard_deviation = np.std(distances)
    else:
        # numpy would return nan here too, but with RuntimeWarnings; report
        # the undefined statistics quietly instead.
        mean_distance = variance = standard_deviation = float("nan")

    # Display the result
    print(f"keywords[0] = {keywords[0]}   keywords[1] = {keywords[1]}")
    print(f"matched sentence number = {size_of_matched_sentence}")
    print("Mean Distance:", mean_distance)
    print("Variance:", variance)
    print(f"Standard Deviation = {standard_deviation}")
    print("****************")
    print()

# Sanity check of calculate_keyword_distances on a small hand-made corpus.
# Note that this rebinds the notebook-level `keywords` variable.
keywords = ["Hello", "World"]
test_sentences = [
    "Hello world! This is a test. The world is beautiful.",
    "Hello again, world. Beautiful world, isn't it? World peace is important.",
    "Hello this is Hello World in the environment of World",
    "Hello 2 3 4 5 World Hello",
    "United States Hello States Hello in the big World with Hello 2 WoRld",
]
calculate_keyword_distances(test_sentences, keywords)

# Reference values: the per-sentence minimum distances expected for the five
# sentences above (presumably worked out by hand). Their mean and variance
# should equal what the function printed.
dis = [1,3,1,1,2]
print(f"mean = {np.mean(dis)}")
print(f"var = {np.var(dis)}")

keywords[0] = hello   keywords[1] = world
matched sentence number = 5
Mean Distance: 1.6
Variance: 0.64
Standard Deviation = 0.8
****************

mean = 1.6
var = 0.64


In [23]:
# Report the distance statistics between "world" and each partner word,
# in the same order as the recorded output.
for partner in ["war", "power", "government", "organization", "trade", "people", "economy"]:
    calculate_keyword_distances(sentences, ["world", partner])

keywords[0] = world   keywords[1] = war
matched sentence number = 166
Mean Distance: 4.054216867469879
Variance: 70.40067498911309
Standard Deviation = 8.390511008819015
****************

keywords[0] = world   keywords[1] = power
matched sentence number = 30
Mean Distance: 14.766666666666667
Variance: 188.11222222222221
Standard Deviation = 13.715400913652587
****************

keywords[0] = world   keywords[1] = government
matched sentence number = 24
Mean Distance: 19.291666666666668
Variance: 202.20659722222226
Standard Deviation = 14.219936611047965
****************

keywords[0] = world   keywords[1] = organization
matched sentence number = 15
Mean Distance: 28.666666666666668
Variance: 2767.288888888889
Standard Deviation = 52.60502722068385
****************

keywords[0] = world   keywords[1] = trade
matched sentence number = 58
Mean Distance: 20.413793103448278
Variance: 3965.3804994054703
Standard Deviation = 62.971267252656354
****************

keywords[0] = world   keywords[1] 