In [17]:
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from scipy.stats import t as t_distribution
from scipy.stats import chi2

# Sample corpus
corpus = """In the sprawling metropolis, where towering skyscrapers define the skyline and the hum of city life reverberates through the streets, the intersection of technology and human interaction is a recurring motif. The bustling financial district serves as a hub for innovation, where startups and established enterprises alike converge to shape the future. Within the glass and steel structures, a dynamic ecosystem of collaboration and disruption thrives, giving rise to novel solutions for contemporary challenges.

The city's vibrant multicultural neighborhoods are a testament to its diverse populace. In the historic district, centuries-old architecture stands juxtaposed with cutting-edge art galleries, creating a visual symphony that encapsulates the city's rich history and progressive spirit. The recurring theme of cultural fusion permeates the air, as languages, cuisines, and traditions intertwine, shaping a tapestry that reflects the interconnected lives of its inhabitants.

Amidst the fast paced urban rhythm, educational institutions serve as crucibles of knowledge and innovation. Research centers collaborate with industry leaders on projects that span artificial intelligence, renewable energy, and urban sustainability. The recurring pursuit of knowledge and the transformative power of education underscore the city's commitment to progress.

As day turns to night, the cityscape transforms into a glittering spectacle, with neon lights and digital billboards illuminating the streets. Entertainment districts come alive, offering a myriad of experiences, from avant-garde theater productions to cutting-edge virtual reality experiences. The recurring motif of technology-driven entertainment mirrors the city's penchant for pushing boundaries and embracing the future.

Public spaces, adorned with sculptures and interactive installations, provide a canvas for artistic expression. The recurring themes of public art and civic engagement come together in vibrant street murals that tell stories of resilience and unity. Citizens, connected by a shared appreciation for creativity, participate in cultural events that bridge the gap between diverse communities.

Within the sprawling parks and green oases that dot the urban landscape, a recurring commitment to environmental sustainability emerges. Community gardens, solar-powered charging stations, and eco-friendly architecture showcase the city's dedication to a harmonious coexistence with nature. The recurring practices of eco-conscious living and green initiatives underscore a collective responsibility toward the planet.

In the fast paced digital realm, social media platforms become arenas for discourse and connectivity. Hashtags, trending topics, and viral challenges form recurring patterns that reflect the pulse of societal conversations. The city's inhabitants, connected through virtual networks, contribute to the recurring narrative of online communities shaping real-world change.

In this metropolis of contrasts and convergences, the recurring collocations of innovation and tradition, diversity and unity, technology and humanity, weave a narrative that defines the city's identity. It is within this intricate tapestry that the residents find both individual expression and a collective sense of belonging, creating a cityscape that evolves with each passing day while honoring the recurring themes that anchor its essence.
"""

# Function to create bigram windows
def create_bigrams(tokens):
    return list(nltk.bigrams(tokens))

# Function to count occurrences of a specific bigram
def count_bigram(bigrams, word1, word2):
    return sum(1 for bigram in bigrams if bigram[0] == word1 and bigram[1] == word2)

# Function to calculate T-test
def calculate_t_test(observed, expected, variance, n):
    t_value = (observed - expected) / (variance / n)**0.5
    return t_value

# Function to calculate Chi-square test
def calculate_chi_square(observed, expected):
    chi_square = ((observed - expected)**2) / expected
    return chi_square

# Select two words for collocation
word1 = "cityscape"
word2 = "transforms"

# Tokenize the corpus
tokens = nltk.word_tokenize(corpus)

# Create bigrams
bigrams = create_bigrams(tokens)

# Count occurrences of selected word pair
bigram_freq = count_bigram(bigrams, word1, word2)

# Total number of bigrams in the corpus
total_bigrams = len(bigrams)

# Calculate expected frequency assuming independence
expected_freq = (tokens.count(word1) / len(tokens)) * (tokens.count(word2) / len(tokens)) 

# Calculate variance under independence assumption
variance = expected_freq * (1 - tokens.count(word1) / len(tokens)) * (1 - tokens.count(word2) / len(tokens))

# Calculate T-test
t_value = calculate_t_test(bigram_freq, expected_freq, variance, total_bigrams)

# Calculate Chi-square test
chi_square_value = calculate_chi_square(bigram_freq, expected_freq)

# Degrees of freedom for Chi-square test
degrees_of_freedom = 1

# Critical values for T-test and Chi-square test
t_critical_value = t_distribution.ppf(0.975, total_bigrams - 1)
chi_square_critical_value = chi2.ppf(0.95, degrees_of_freedom)

# Print results
print(f"T-value: {t_value}")
print(f"T-critical value: {t_critical_value}")
if t_value>t_critical_value:
    print("----------> According to t-test: Collocation")
else:
    print("----------> According to t-test: Not Collocation")
print()
print(f"Chi-square value: {chi_square_value}")
print(f"Chi-square critical value: {chi_square_critical_value}")
if chi_square_value>chi_square_critical_value:
    print("----------> According to chi-square test: Collocation")
else:
    print("----------> According to chi-square test: Not Collocation")


T-value: 8619.616319987514
T-critical value: 1.964475628237745
----------> According to t-test: Collocation

Chi-square value: 139918.5000071469
Chi-square critical value: 3.841458820694124
----------> According to chi-square test: Collocation
