<a href="https://colab.research.google.com/github/asalva15/memoire_hn1_illuminati_GIT/blob/main/Collocations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://github.com/ahegel/collocations/blob/master/get_collocations27.py

In [None]:
#!/usr/bin/env python2.7 version originale du code

"""
This program finds collocations in a corpus of text. It can find both the
overall top collocations, and the collocations of keywords you enter manually.
"""

import string
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures
from nltk.corpus import stopwords


# find collocations for each word
def get_collocations(corpus, windowsize=10, numresults=10):
    '''This function uses the Natural Language Toolkit to find the top collocations in a corpus.
    It takes as an argument a string that contains the corpus you want to
    find collocations from. It prints the top collocations it finds.
    '''
    # convert the corpus (a string) into  a list of words
    tokens = word_tokenize(corpus)
    # initialize the bigram association measures object to score each collocation
    bigram_measures = BigramAssocMeasures()
    # initialize the bigram collocation finder object to find and rank collocations
    finder = BigramCollocationFinder.from_words(tokens, window_size=windowsize)
    # apply a series of filters to narrow down the collocation results
    ignored_words = stopwords.words('english')
    finder.apply_word_filter(lambda w: len(w) < 2 or w.lower() in ignored_words)
    finder.apply_freq_filter(1)
    # calculate the top results by T-score
    # list of all possible measures: .raw_freq, .pmi, .likelihood_ratio, .chi_sq, .phi_sq, .fisher, .student_t, .mi_like, .poisson_stirling, .jaccard, .dice
    results = finder.nbest(bigram_measures.student_t, numresults)
    # print the results
    print "Top " + str(numresults) + " collocations:"
    for k, v in results:
        print str(k) + ", " + str(v)


def get_keyword_collocations(corpus, keyword, windowsize=10, numresults=10):
    '''This function uses the Natural Language Toolkit to find collocations
    for a specific keyword in a corpus. It takes as an argument a string that
    contains the corpus you want to find collocations from. It prints the top
    collocations it finds for each keyword.
    '''
    # convert the corpus (a string) into  a list of words
    tokens = word_tokenize(corpus)
    # initialize the bigram association measures object to score each collocation
    bigram_measures = BigramAssocMeasures()
    # initialize the bigram collocation finder object to find and rank collocations
    finder = BigramCollocationFinder.from_words(tokens, window_size=windowsize)
    # initialize a function that will narrow down collocates that don't contain the keyword
    keyword_filter = lambda *w: keyword not in w
    # apply a series of filters to narrow down the collocation results
    ignored_words = stopwords.words('english')
    finder.apply_word_filter(lambda w: len(w) < 2 or w.lower() in ignored_words)
    finder.apply_freq_filter(1)
    finder.apply_ngram_filter(keyword_filter)
    # calculate the top results by T-score
    # list of all possible measures: .raw_freq, .pmi, .likelihood_ratio, .chi_sq, .phi_sq, .fisher, .student_t, .mi_like, .poisson_stirling, .jaccard, .dice
    results = finder.nbest(bigram_measures.student_t, numresults)
    # print the results
    print "Top collocations for " + str(keyword) + ":"
    collocations = ''
    for k, v in results:
        if k != keyword:
            collocations += k + ' '
        else:
            collocations += v + ' '
    print collocations + '\n'


# Replace this with your filename
infile = "sample_corpus.txt"

# Read in the corpus you want to find collocations from
with open(infile) as tmpfile:
    data = tmpfile.read()

# Clean the data
data = data.translate(None, string.punctuation)  # remove punctuation
data = "".join(i for i in data if ord(i) < 128)  # remove non-ascii characters

# Get the top collocations for the entire corpus
get_collocations(data)
print ' '

# Replace this with a list of keywords you want to find collocations for
words_of_interest = ["love", "death"]

# Get the top collocations for each keyword in the list above
for word in words_of_interest:
    get_keyword_collocations(data, word)

In [3]:
from google.colab import drive
drive.mount('/content/drive')
corpus = '/content/drive/My Drive/MemoireHN1/corpus17761850'

Mounted at /content/drive


In [9]:
import os

# Function to read contents of a directory and concatenate text files into one string
def concatenate_text_files(directory):
    # Initialize an empty string to store the concatenated text
    concatenated_text = ""

    # Iterate over each file in the directory
    for filename in os.listdir(directory):
        # Check if the file is a text file
        if filename.endswith(".txt"):
            # Read the contents of the text file and append it to the concatenated text
            with open(os.path.join(directory, filename), 'r') as file:
                concatenated_text += file.read()

    # Return the concatenated text
    return concatenated_text

# Specify the directory containing the text files
directory_path = "/content/drive/My Drive/MemoireHN1/corpus17761850"

# Call the function to concatenate text files
concatenated_text = concatenate_text_files(directory_path)

# Specify the path for the new concatenated text file
output_file_path = "/content/drive/My Drive/MemoireHN1/concatenated_text.txt"

# Write the concatenated text to the output file
with open(output_file_path, 'w') as output_file:
    output_file.write(concatenated_text)

print("Concatenated text saved to:", output_file_path)


Concatenated text saved to: /content/drive/My Drive/MemoireHN1/concatenated_text.txt


In [16]:
#!/usr/bin/env python2.7

"""
This program finds collocations in a corpus of text. It can find both the
overall top collocations, and the collocations of keywords you enter manually.
"""
import nltk
nltk.download('punkt')
import string
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures
from nltk.corpus import stopwords

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# find collocations for each word
def get_collocations(corpus, windowsize=10, numresults=10):
    '''This function uses the Natural Language Toolkit to find the top collocations in a corpus.
    It takes as an argument a string that contains the corpus you want to
    find collocations from. It prints the top collocations it finds.
    '''
    # convert the corpus (a string) into  a list of words
    tokens = word_tokenize(corpus)
    # initialize the bigram association measures object to score each collocation
    bigram_measures = BigramAssocMeasures()
    # initialize the bigram collocation finder object to find and rank collocations
    finder = BigramCollocationFinder.from_words(tokens, window_size=windowsize)
    # apply a series of filters to narrow down the collocation results
    ignored_words = stopwords.words('english')
    finder.apply_word_filter(lambda w: len(w) < 2 or w.lower() in ignored_words)
    finder.apply_freq_filter(1)
    # calculate the top results by T-score
    # list of all possible measures: .raw_freq, .pmi, .likelihood_ratio, .chi_sq, .phi_sq, .fisher, .student_t, .mi_like, .poisson_stirling, .jaccard, .dice
    results = finder.nbest(bigram_measures.student_t, numresults)
    # print the results
    print("Top " + str(numresults) + " collocations:")
    for k, v in results:
        print(str(k) + ", " + str(v))



def get_keyword_collocations(corpus, keyword, windowsize=10, numresults=10):
    '''This function uses the Natural Language Toolkit to find collocations
    for a specific keyword in a corpus. It takes as an argument a string that
    contains the corpus you want to find collocations from. It prints the top
    collocations it finds for each keyword.
    '''
    # convert the corpus (a string) into  a list of words
    tokens = word_tokenize(corpus)
    # initialize the bigram association measures object to score each collocation
    bigram_measures = BigramAssocMeasures()
    # initialize the bigram collocation finder object to find and rank collocations
    finder = BigramCollocationFinder.from_words(tokens, window_size=windowsize)
    # initialize a function that will narrow down collocates that don't contain the keyword
    keyword_filter = lambda *w: keyword not in w
    # apply a series of filters to narrow down the collocation results
    ignored_words = stopwords.words('english')
    finder.apply_word_filter(lambda w: len(w) < 2 or w.lower() in ignored_words)
    finder.apply_freq_filter(1)
    finder.apply_ngram_filter(keyword_filter)
    # calculate the top results by T-score
    # list of all possible measures: .raw_freq, .pmi, .likelihood_ratio, .chi_sq, .phi_sq, .fisher, .student_t, .mi_like, .poisson_stirling, .jaccard, .dice
    results = finder.nbest(bigram_measures.student_t, numresults)
    # print the results
    print("Top collocations for " + str(keyword) + ":")
    collocations = ''
    for k, v in results:
        if k != keyword:
            collocations += k + ' '
        else:
            collocations += v + ' '
    print("Top " + str(numresults) + " collocations:")


# Replace this with your filename
infile = "/content/drive/My Drive/MemoireHN1/concatenated_text.txt"

# Read in the corpus you want to find collocations from
with open(infile) as tmpfile:
    data = tmpfile.read()

# Clean the data
# Create a translation table to remove punctuation
translator = str.maketrans('', '', string.punctuation)
# Clean the data by applying the translation table
data = data.translate(translator)
data = "".join(i for i in data if ord(i) < 128)  # remove non-ascii characters

# Get the top collocations for the entire corpus
get_collocations(data)
print(' ')

# Replace this with a list of keywords you want to find collocations for
words_of_interest = ["Illuminati", "French", "Bavarian","France","Weishaupt"]

# Get the top collocations for each keyword in the list above
for word in words_of_interest:
    get_keyword_collocations(data, word)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Top 10 collocations:
United, States
New, York
tho, tho
one, hundred
Mr, Mr
per, cent
hundred, dollars
00, 00
one, one
per, per
 
Top collocations for Illuminati:
Top 10 collocations:
Top collocations for French:
Top 10 collocations:
Top collocations for Bavarian:
Top 10 collocations:
Top collocations for France:
Top 10 collocations:
Top collocations for Weishaupt:
Top 10 collocations:


Tentative alternative avec code suivant : https://github.com/jamesosullivan/collocates/blob/main/co-occurrences.py

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Ensure necessary resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

def co_occurrence(text, window_size=2):
    # Load stopwords
    stop_words = set(stopwords.words('english'))

    # Tokenize the text and filter out stopwords
    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha() and token.lower() not in stop_words]

    # Initialize co-occurrence count
    co_occurrence_counts = Counter()

    # Calculate co-occurrences within the specified window size
    for i in range(len(tokens)):
        token = tokens[i]
        start = max(0, i - window_size)
        end = min(len(tokens), i + window_size + 1)
        for j in range(start, end):
            if i != j:
                co_occurred_token = tokens[j]
                co_occurrence_counts[(token, co_occurred_token)] += 1

    # Return the top 10 most common co-occurrences
    return co_occurrence_counts.most_common(10)

# Read the text file
with open("filename.txt", "r", encoding='utf-8') as file:
    text = file.read()

# Calculate the top 10 co-occurrences
top_co_occurrences = co_occurrence(text, window_size=2)

# Output the results
with open("top-co-occurrence-results.txt", "w", encoding='utf-8') as output_file:
    for pair, freq in top_co_occurrences:
        output_file.write(f"{pair[0]}, {pair[1]}: {freq}\n")

print("Top 10 co-occurrence analysis is complete. Results are saved in 'top-co-occurrence-results.txt'.")

In [19]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Ensure necessary resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

def co_occurrence(text, window_size=2):
    # Load stopwords
    stop_words = set(stopwords.words('english'))

    # Tokenize the text and filter out stopwords
    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha() and token.lower() not in stop_words]

    # Initialize co-occurrence count
    co_occurrence_counts = Counter()

    # Calculate co-occurrences within the specified window size
    for i in range(len(tokens)):
        token = tokens[i]
        start = max(0, i - window_size)
        end = min(len(tokens), i + window_size + 1)
        for j in range(start, end):
            if i != j:
                co_occurred_token = tokens[j]
                co_occurrence_counts[(token, co_occurred_token)] += 1

    # Return the top 10 most common co-occurrences
    return co_occurrence_counts.most_common(10)

# Read the text file
with open("/content/drive/My Drive/MemoireHN1/concatenated_text.txt", "r", encoding='utf-8') as file:
    text = file.read()

# Calculate the top 10 co-occurrences
top_co_occurrences = co_occurrence(text, window_size=2)

# Output the results
with open("top-co-occurrence-results.txt", "w", encoding='utf-8') as output_file:
    for pair, freq in top_co_occurrences:
        output_file.write(f"{pair[0]}, {pair[1]}: {freq}\n")
print(top_co_occurrences)
print("Top 10 co-occurrence analysis is complete. Results are saved in 'top-co-occurrence-results.txt'.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[(('united', 'states'), 372), (('states', 'united'), 372), (('new', 'york'), 253), (('york', 'new'), 253), (('r', 'r'), 220), (('f', 'r'), 199), (('r', 'f'), 199), (('j', 'j'), 184), (('r', 'j'), 161), (('j', 'r'), 161)]
Top 10 co-occurrence analysis is complete. Results are saved in 'top-co-occurrence-results.txt'.
