# Imports

In [1]:
import os
import glob
import json
import nltk, re, string, collections
from nltk.util import ngrams
from collections import Counter
import pandas as pd
from nltk.tokenize import word_tokenize 

# readJson function

In [2]:
def readJson(fileName):
    """Load a JSON file and return all of its values joined into one string.

    Args:
        fileName: Path of a UTF-8 encoded JSON file whose top level is an
            object mapping keys to strings.

    Returns:
        The object's values concatenated in file order. No separator is
        inserted between consecutive values.

    Raises:
        TypeError: If any value is not a string.
    """
    # Open the path that was passed in rather than whatever a global named
    # `file` happens to hold, and let the context manager close the handle
    # even when parsing fails.
    with open(fileName, "r", encoding='utf-8') as f:
        jsonData = json.load(f)
    # str.join builds the result in one pass instead of re-copying the
    # accumulated string for every value.
    return "".join(jsonData.values())

# Find ngrams with stopwords

In [3]:
# Collect the text of every JSON dataset into one corpus string.
path = "testDatasets/*.json"
files = glob.glob(path)

corpusParts = []
for file in files:
    jsonWords = readJson(file)
    corpusParts.append(jsonWords)
words = "".join(corpusParts)

# Whitespace-split the corpus, drop single-character tokens, lowercase the rest.
rawTokens = words.split()
tokenizedWords = [token.lower() for token in rawTokens if len(token) >= 2]

# One entry per n-gram order: the order itself and the output file prefix.
ngramList = [
    {"n": size, "prefix": label}
    for size, label in enumerate(("unigram", "bigram", "trigram"), start=1)
]

# Count each n-gram order and export the counts, most frequent first.
for ngram in ngramList:
    nFreq = collections.Counter(ngrams(tokenizedWords, ngram['n']))
    df = pd.DataFrame(nFreq.most_common(), columns=['Words', 'Frequency'])
    df = df.sort_values(by='Frequency', ascending=False)
    df.to_csv(
        './outputs/' + ngram['prefix'] + '-with-stopwords.csv',
        index=True,
        header=True,
    )
    

# Read stopwords file

In [4]:
# Read the stopword file and tokenize its text into the stopword list.
# The context manager closes the handle as soon as the text has been read.
with open("stopwords.txt", "r", newline='', encoding='utf-8') as stopwordsFile:
    result = stopwordsFile.read()
stopwords = word_tokenize(result)

# Find ngrams without stopwords

In [6]:
# Collect the text of every JSON dataset into one corpus string.
path = "testDatasets/*.json"
files = glob.glob(path)

corpusParts = []
for file in files:
    jsonWords = readJson(file)
    corpusParts.append(jsonWords)
# A single join avoids re-copying the growing corpus for every file.
words = "".join(corpusParts)

# A set makes each stopword test O(1); testing against the list rescans it
# for every token in the corpus.
stopwordSet = set(stopwords)

# Drop single-character tokens (length is measured before lowercasing),
# lowercase once, then discard stopwords.
loweredWords = (word.lower() for word in words.split() if len(word) > 1)
tokenizedWords = [word for word in loweredWords if word not in stopwordSet]

# One entry per n-gram order: the order itself and the output file prefix.
ngramList = [
    {"n": 1, "prefix": "unigram"},
    {"n": 2, "prefix": "bigram"},
    {"n": 3, "prefix": "trigram"},
]

# Count each n-gram order over the filtered tokens and export the counts,
# most frequent first.
for ngram in ngramList:
    n = ngrams(tokenizedWords, ngram['n'])
    nFreq = collections.Counter(n)
    df = pd.DataFrame(nFreq.most_common(), columns=['Words', 'Frequency'])
    df = df.sort_values(by='Frequency', ascending=False)
    p = './outputs/' + ngram['prefix'] + '-without-stopwords.csv'
    df.to_csv(p, index=True, header=True)
    

# Print filtered tokens to file

In [8]:
# Persist the filtered tokens so the collocation step can reload them.
# Mode 'w' replaces any previous contents; in append mode every re-run of
# this cell would add the whole corpus again and inflate the n-gram counts
# computed from this file.
with open('./filtered-tokens.txt', 'w', encoding='utf-8') as tokensFile:
    # Every token is preceded by a single space, so the file is one
    # whitespace-separated line that starts with a space.
    tokensFile.writelines(" " + idx.lower() for idx in tokenizedWords)

# Find collocations

In [9]:
# Reload the filtered tokens; the context manager closes the handle once the
# text has been read.
with open("./filtered-tokens.txt", "r", newline='', encoding='utf-8') as tokensFile:
    result = tokensFile.read()

tokenized = [idx.lower() for idx in result.split()]

# Association-measure objects, used by the scoring cells further down.
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()

bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(tokenized)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(tokenized)

# Keep only candidate n-grams that occur at least 20 times.
bigramFinder.apply_freq_filter(20)
trigramFinder.apply_freq_filter(20)

# Export the raw counts of the remaining bigrams, most frequent first.
bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['Bigram', 'Frequency'])
bigramFreqTable = bigramFreqTable.sort_values(by='Frequency', ascending=False)
bigramFreqTable.to_csv('./ngrams/bigrams.csv', index=False, header=True)

# Same export for the remaining trigrams.
trigram_freq = trigramFinder.ngram_fd.items()
trigramFreqTable = pd.DataFrame(list(trigram_freq), columns=['Trigram', 'Frequency'])
trigramFreqTable = trigramFreqTable.sort_values(by='Frequency', ascending=False)
trigramFreqTable.to_csv('./ngrams/trigrams.csv', index=False, header=True)

# Methods

## PMI

In [10]:
# Score the bigrams by pointwise mutual information and export, highest first.
bigramPMIScores = bigramFinder.score_ngrams(bigrams.pmi)
bigramPMITable = pd.DataFrame(bigramPMIScores, columns=['bigram', 'PMI'])
bigramPMITable = bigramPMITable.sort_values(by='PMI', ascending=False)
bigramPMITable.to_csv('./methods/bigram/bigramPMITable.csv', index=False, header=True)

# Same measure for the trigrams.
trigramPMIScores = trigramFinder.score_ngrams(trigrams.pmi)
trigramPMITable = pd.DataFrame(trigramPMIScores, columns=['trigram', 'PMI'])
trigramPMITable = trigramPMITable.sort_values(by='PMI', ascending=False)
trigramPMITable.to_csv('./methods/trigram/trigramPMITable.csv', index=False, header=True)

## T-test

In [11]:
# Score the bigrams with Student's t measure and export, highest first.
bigramTScores = bigramFinder.score_ngrams(bigrams.student_t)
bigramTtable = pd.DataFrame(bigramTScores, columns=['bigram', 't'])
bigramTtable = bigramTtable.sort_values(by='t', ascending=False)
bigramTtable.to_csv('./methods/bigram/bigramTtable.csv', index=False, header=True)

# Same measure for the trigrams.
trigramTScores = trigramFinder.score_ngrams(trigrams.student_t)
trigramTtable = pd.DataFrame(trigramTScores, columns=['trigram', 't'])
trigramTtable = trigramTtable.sort_values(by='t', ascending=False)
trigramTtable.to_csv('./methods/trigram/trigramTtable.csv', index=False, header=True)

## Chi-Square

In [12]:
# Score the bigrams with the chi-square measure and export, highest first.
bigramChiScores = bigramFinder.score_ngrams(bigrams.chi_sq)
bigramChiTable = pd.DataFrame(bigramChiScores, columns=['bigram', 'chi-sq'])
bigramChiTable = bigramChiTable.sort_values(by='chi-sq', ascending=False)
bigramChiTable.to_csv('./methods/bigram/bigramChiTable.csv', index=False, header=True)

# Same measure for the trigrams.
trigramChiScores = trigramFinder.score_ngrams(trigrams.chi_sq)
trigramChiTable = pd.DataFrame(trigramChiScores, columns=['trigram', 'chi-sq'])
trigramChiTable = trigramChiTable.sort_values(by='chi-sq', ascending=False)
trigramChiTable.to_csv('./methods/trigram/trigramChiTable.csv', index=False, header=True)

## Raw Frequency

In [13]:
# Score the bigrams by raw frequency and export, highest first.
bigramRawFreqScores = bigramFinder.score_ngrams(bigrams.raw_freq)
bigramRawFreqTable = pd.DataFrame(bigramRawFreqScores, columns=['bigram', 'raw_freq'])
bigramRawFreqTable = bigramRawFreqTable.sort_values(by='raw_freq', ascending=False)
bigramRawFreqTable.to_csv('./methods/bigram/bigramRawFreqTable.csv', index=False, header=True)

# Same measure for the trigrams.
trigramRawFreqScores = trigramFinder.score_ngrams(trigrams.raw_freq)
trigramRawFreqTable = pd.DataFrame(trigramRawFreqScores, columns=['trigram', 'raw_freq'])
trigramRawFreqTable = trigramRawFreqTable.sort_values(by='raw_freq', ascending=False)
trigramRawFreqTable.to_csv('./methods/trigram/trigramRawFreqTable.csv', index=False, header=True)

## Poisson Stirling

In [14]:
# Score the bigrams with the Poisson-Stirling measure and export, highest first.
bigramPoissonScores = bigramFinder.score_ngrams(bigrams.poisson_stirling)
bigramPoissonTable = pd.DataFrame(bigramPoissonScores, columns=['bigram', 'poisson_stirling'])
bigramPoissonTable = bigramPoissonTable.sort_values(by='poisson_stirling', ascending=False)
bigramPoissonTable.to_csv('./methods/bigram/bigramPoissonTable.csv', index=False, header=True)

# Same measure for the trigrams.
trigramPoissonScores = trigramFinder.score_ngrams(trigrams.poisson_stirling)
trigramPoissonTable = pd.DataFrame(trigramPoissonScores, columns=['trigram', 'poisson_stirling'])
trigramPoissonTable = trigramPoissonTable.sort_values(by='poisson_stirling', ascending=False)
trigramPoissonTable.to_csv('./methods/trigram/trigramPoissonTable.csv', index=False, header=True)

## Likelihood Ratio

In [15]:
# Score the bigrams by likelihood ratio and export, highest first.
bigramLikelihoodScores = bigramFinder.score_ngrams(bigrams.likelihood_ratio)
bigramLikelihoodTable = pd.DataFrame(bigramLikelihoodScores, columns=['bigram', 'likelihood_ratio'])
bigramLikelihoodTable = bigramLikelihoodTable.sort_values(by='likelihood_ratio', ascending=False)
bigramLikelihoodTable.to_csv('./methods/bigram/bigramLikelihoodTable.csv', index=False, header=True)

# Same measure for the trigrams.
trigramLikelihoodScores = trigramFinder.score_ngrams(trigrams.likelihood_ratio)
trigramLikelihoodTable = pd.DataFrame(trigramLikelihoodScores, columns=['trigram', 'likelihood_ratio'])
trigramLikelihoodTable = trigramLikelihoodTable.sort_values(by='likelihood_ratio', ascending=False)
trigramLikelihoodTable.to_csv('./methods/trigram/trigramLikelihoodTable.csv', index=False, header=True)

## Jaccard

In [16]:
# Score the bigrams with the Jaccard index and export, highest first.
bigramJaccardScores = bigramFinder.score_ngrams(bigrams.jaccard)
bigramJaccardTable = pd.DataFrame(bigramJaccardScores, columns=['bigram', 'jaccard'])
bigramJaccardTable = bigramJaccardTable.sort_values(by='jaccard', ascending=False)
bigramJaccardTable.to_csv('./methods/bigram/bigramJaccardTable.csv', index=False, header=True)

# Same measure for the trigrams.
trigramJaccardScores = trigramFinder.score_ngrams(trigrams.jaccard)
trigramJaccardTable = pd.DataFrame(trigramJaccardScores, columns=['trigram', 'jaccard'])
trigramJaccardTable = trigramJaccardTable.sort_values(by='jaccard', ascending=False)
trigramJaccardTable.to_csv('./methods/trigram/trigramJaccardTable.csv', index=False, header=True)