In [18]:
import os
from collections import Counter

import numpy as np
import pandas as pd

from preprocess import (
    remove_punctuation,
    split_sentences,
    process_suffixes,
    get_suffixes,
)

In [19]:
# CONSTANTS
# Directory whose entries are read as the corpus documents.
CORPUS_PATH = './corpus'
# Value returned by preprocess.get_suffixes(); handed to process_suffixes() for every sentence.
SUFFIXES = get_suffixes()
# Output file holding one "<word> <count>" line per kept word, highest count first.
WORDS_COUNT_FILE = 'words_count.txt'
# Output file holding one "<word id> <word id> <count>" line per co-occurring pair.
PAIR_COOCCURENCES_FILE = 'pair_cooccurences.txt'
# Words whose corpus count is less than or equal to this value are dropped from the vocabulary.
MIN_COUNT_THRESHOLD = 2
# Two words of a sentence are counted as co-occurring when they are at most this many positions apart.
CO_OCCURENCE_WINDOW = 2

In [20]:
# GET CORPUS
# os.listdir() returns entries in arbitrary order; sorting makes the order in
# which documents are loaded (and hence processed later) reproducible.
content_files = sorted(os.listdir(CORPUS_PATH))

all_content = []

for filename in content_files:
    path = os.path.join(CORPUS_PATH, filename)
    # Sub-directories cannot be opened as text files, so skip anything that
    # is not a regular file instead of crashing on it.
    if not os.path.isfile(path):
        continue
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used here and for the files written later — confirm the corpus encoding.
    with open(path, 'r') as f:
        all_content.append(f.read())


In [21]:
def clean_and_extract_sentences(text):
    """Turn raw text into a list of sentences, each a list of processed words.

    The text is passed through remove_punctuation() and split_sentences();
    every resulting sentence is split on whitespace and run through
    process_suffixes() with the notebook-level SUFFIXES. Falsy items are
    dropped both before and after suffix processing.

    Returns:
        A list with one inner list of words per sentence.
    """
    tokenised_sentences = []
    for raw_sentence in split_sentences(remove_punctuation(text)):
        words = list(filter(None, raw_sentence.split()))
        processed = process_suffixes(SUFFIXES, words)
        # process_suffixes() may hand back empty items; keep only truthy ones.
        tokenised_sentences.append(list(filter(None, processed)))
    return tokenised_sentences

In [22]:
# Count every processed word across all sentences of all documents
words_count = Counter(
    word
    for document in all_content
    for sentence_words in clean_and_extract_sentences(document)
    for word in sentence_words
)

# Keep only the words seen more than MIN_COUNT_THRESHOLD times
words_count = dict(
    (word, count)
    for word, count in words_count.items()
    if count > MIN_COUNT_THRESHOLD
)
print(len(words_count))

43355


In [23]:
# Order the vocabulary by descending count, then persist it
import time
sort_started = time.time()
sorted_words_list = sorted(
    words_count.items(),
    key=lambda word_and_count: word_and_count[1],
    reverse=True,
)
sort_finished = time.time()
print('Time taken to sort : ', sort_finished - sort_started)

def write_to_file():
    """Write sorted_words_list to WORDS_COUNT_FILE, one '<word> <count>' line per entry."""
    rows = (f'{word} {count}\n' for word, count in sorted_words_list)
    with open(WORDS_COUNT_FILE, 'w') as out_file:
        out_file.writelines(rows)
write_to_file()
print('WRITTEN')

Time taken to sort :  0.01614999771118164
WRITTEN


In [24]:
# LOAD WORD IDS
# A word's id is the zero-based number of its line in WORDS_COUNT_FILE.
words_ids = {}
with open(WORDS_COUNT_FILE, 'r') as words_file:
    for line_number, line in enumerate(words_file):
        # Each line must hold exactly two whitespace-separated fields: word and count.
        word, _count = line.split()
        words_ids[word] = line_number
print('WORD_IDS LOADED')

WORD_IDS LOADED


In [25]:
def get_sentence_cooccurences(sentence_word_ids, window=None):
    """Count unordered pairs of word ids that lie close together in one sentence.

    Args:
        sentence_word_ids: Word ids in sentence order. The value is sliced,
            so it must be a sequence such as a list.
        window: Largest distance, in positions, at which two ids are paired.
            When omitted, the notebook-level CO_OCCURENCE_WINDOW is used.

    Returns:
        A Counter mapping a sorted ``(id_a, id_b)`` tuple to the number of
        times the two ids appear 1..window positions apart in the sentence.
    """
    if window is None:
        window = CO_OCCURENCE_WINDOW
    occurences = Counter()
    for distance in range(1, window + 1):
        # zip() stops at the shorter input, so each id is paired with the id
        # that sits `distance` positions after it.
        neighbours = zip(sentence_word_ids, sentence_word_ids[distance:])
        # Sort each pair so that (1, 2) and (2, 1) share a single key.
        occurences.update(tuple(sorted(pair)) for pair in neighbours)
    return occurences

# Quick check on a toy id sequence: prints the pair counts for ids that are
# at most CO_OCCURENCE_WINDOW positions apart.
print(get_sentence_cooccurences([1,2,3,4,5,3,1]))

Counter({(3, 4): 2, (3, 5): 2, (1, 3): 2, (1, 2): 1, (2, 3): 1, (4, 5): 1, (2, 4): 1, (1, 5): 1})


In [31]:
pair_cooccurences = Counter()
# Accumulate the co-occurrence counts of every sentence in the corpus
for document in all_content:
    for sentence_words in clean_and_extract_sentences(document):
        # Words missing from words_ids are dropped before pairing, so distances
        # are measured over the remaining words only.
        known_ids = [
            words_ids[word] for word in sentence_words if word in words_ids
        ]
        pair_cooccurences.update(get_sentence_cooccurences(known_ids))
print('DONE creating pair cooccurences')

DONE creating pair cooccurences


In [17]:
# Persist pair_cooccurences as "<id> <id> <count>" lines
print(len(pair_cooccurences))
with open(PAIR_COOCCURENCES_FILE, 'w') as out_file:
    out_file.writelines(
        f'{w1_id} {w2_id} {cooccurences}\n'
        for (w1_id, w2_id), cooccurences in pair_cooccurences.items()
    )
print(f'Written cooccurences to file {PAIR_COOCCURENCES_FILE}')

2807077


NameError: name 'w1' is not defined