In [1]:
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from scipy.stats import t as t_distribution
from scipy.stats import chi2

# Sample corpus
corpus = ["the quick brown fox jumps over the lazy dog",
          "she sells sea shells by the sea shore",
          "how much wood would a woodchuck chuck if a woodchuck could chuck wood",
          "peter piper picked a peck of pickled peppers",
          "how can a clam cram in a clean cream can",
          "I scream you scream we all scream for ice cream",
          "the quick  fox was brown and seen near the river",
          "the quick brown dog chased the cat up the tree",
          "the quick fox and the quick brown dog are best friends",
          "the quick fox jumps over the lazy brown dog"]

# Function to create bigram windows
def create_bigrams(tokens):
    return list(nltk.bigrams(tokens))

# Function to count occurrences of a specific bigram
def count_bigram(bigrams, word1, word2):
    return sum(1 for bigram in bigrams if bigram[0] == word1 and bigram[1] == word2)

# Function to calculate T-test
def calculate_t_test(observed, expected, variance, n):
    t_value = (observed - expected) / (variance / n)**0.5
    return t_value

# Function to calculate Chi-square test
def calculate_chi_square(observed, expected):
    chi_square = ((observed - expected)**2) / expected
    return chi_square

# Select two words for collocation
word1 = "quick"
word2 = "brown"

# Tokenize the corpus
tokens = nltk.word_tokenize(" ".join(corpus))

# Create bigrams
bigrams = create_bigrams(tokens)

# Count occurrences of selected word pair
bigram_freq = count_bigram(bigrams, word1, word2)

# Total number of bigrams in the corpus
total_bigrams = len(bigrams)

# Calculate expected frequency assuming independence
expected_freq = (tokens.count(word1) / len(tokens)) * (tokens.count(word2) / len(tokens)) 

# Calculate variance under independence assumption
variance = expected_freq * (1 - tokens.count(word1) / len(tokens)) * (1 - tokens.count(word2) / len(tokens))

# Calculate T-test
t_value = calculate_t_test(bigram_freq, expected_freq, variance, total_bigrams)

# Calculate Chi-square test
chi_square_value = calculate_chi_square(bigram_freq, expected_freq)

# Degrees of freedom for Chi-square test
degrees_of_freedom = 1

# Critical values for T-test and Chi-square test
t_critical_value = t_distribution.ppf(0.975, total_bigrams - 1)
chi_square_critical_value = chi2.ppf(0.95, degrees_of_freedom)

# Print results
print(f"T-value: {t_value}")
print(f"T-critical value: {t_critical_value}")
print(f"Chi-square value: {chi_square_value}")
print(f"Chi-square critical value: {chi_square_critical_value}")

T-value: 559.5138233442708
T-critical value: 1.984984311431769
Chi-square value: 2875.2031236984585
Chi-square critical value: 3.841458820694124
