In [10]:
# CSI 5386 - Natural Language Processing
# FALL 2022
# Assignment 1 : Corpus analysis and word embeddings
# CHATTERJEE Micthell, HEYMANS Adrien, SEWPAL Bhavika

In [11]:
# The first step before pre-processing or analysis is to concatenate all the documents together. (Only needed to be executed once)

import glob
import shutil

import os

# Merge every contract of the CUAD corpus into one file. The output is opened
# in 'wb' mode, so re-running this cell rewrites the merged file from scratch.
outfilename = "data/mergedFiles.txt"

# open() does not create missing parent directories.
os.makedirs(os.path.dirname(outfilename), exist_ok=True)

with open(outfilename, 'wb') as outfile:
    # glob returns paths in arbitrary (filesystem-dependent) order; sorting
    # makes the merged corpus identical from one run/machine to the next.
    for filename in sorted(glob.glob('CUAD_v1/full_contract_txt/*.txt')):
        if filename == outfilename:
            # don't want to copy the output into the output
            continue
        with open(filename, 'rb') as readfile:
            contents = readfile.read()
        outfile.write(contents)
        # The downstream cells tokenize the merged file line by line. Without a
        # separator, a contract that does not end with a newline would have its
        # last line glued to the first line of the next contract, producing a
        # fused token and a bigram that exists in neither document.
        if contents and not contents.endswith(b'\n'):
            outfile.write(b'\n')

In [12]:
# Empty table that the following cells fill in, one row per reported value:
# "Name" is the label of the value and "Result" is the value itself.
import pandas as pd

result_columns = ["Name", "Result"]
results_table = pd.DataFrame(columns=result_columns)

In [13]:
# The following step is the pre-processing: the data should be as clean as possible. We want every word in lower case, punctuation removed, etc.


from nltk.probability import FreqDist
from nltk.tokenize import (TreebankWordTokenizer,word_tokenize, wordpunct_tokenize,TweetTokenizer,MWETokenizer)

import os

# Baseline statistics: lowercase every line of the merged corpus, split it with
# NLTK's Treebank tokenizer, then record the token/type counts in results_table.

# Every file written by this cell lives in outputs/; open() does not create
# missing directories, so make sure it exists before writing.
os.makedirs("outputs", exist_ok=True)

tokenizer = TreebankWordTokenizer()

tokens = []
# The context manager closes the corpus file as soon as it has been read
# (the file handle used to stay open for the lifetime of the kernel).
with open("data/mergedFiles.txt", "r") as corpus_file:
    for line in corpus_file:
        # putting everything to lowercase before tokenizing
        tokens.extend(tokenizer.tokenize(line.lower()))

# Raw dump of the token list (Python list repr).
with open('outputs/output.txt', 'w') as f:
    f.write(str(tokens))

fdist = FreqDist(tokens)
# List of (token, count) pairs, most frequent first; one entry per type.
types = fdist.most_common()

# Number of types that occur exactly once in the corpus.
frequenceIsOne = sum(1 for _, count in types if count == 1)

# NOTE: the (token, count) tuples are written back-to-back without any
# separator; the file format is intentionally left unchanged.
with open('outputs/tokens.txt', 'w') as f:
    for elem in types:
        f.write(str(elem))

# Adding results to result table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the new
# rows are built in one frame and concatenated once.
new_rows = pd.DataFrame([
    {"Name": "# of tokens (b)", "Result": str(len(tokens))},
    {"Name": "# of types (b)", "Result": str(len(types))},
    {"Name": "type/token ratio (b)", "Result": str(len(types)/len(tokens))},
    {"Name": "tokens appeared only once (d)", "Result": str(frequenceIsOne)},
])
results_table = pd.concat([results_table, new_rows], ignore_index=True)

In [14]:
#We will now look into the number of tokens when we remove the punctuation
from secrets import token_urlsafe
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer

import os

# Same statistics as the baseline, but punctuation is dropped at tokenization
# time and page-division underscores are filtered out afterwards.

# Every file written by this cell lives in outputs/; open() does not create
# missing directories, so make sure it exists before writing.
os.makedirs("outputs", exist_ok=True)

# The following tokenizer removes punctuation: it only keeps runs of \w characters.
tokenizer = RegexpTokenizer(r'\w+')

tokensWithoutPunctuation = []
# The context manager closes the corpus file as soon as it has been read.
with open("data/mergedFiles.txt", "r") as corpus_file:
    for line in corpus_file:
        # putting everything to lowercase before tokenizing
        tokensWithoutPunctuation.extend(tokenizer.tokenize(line.lower()))

# "_" matches \w, so the page-division rules ("____") survive the tokenizer.
# Drop every token that contains a run of three underscores.
tokensWithoutPunctuation = [item for item in tokensWithoutPunctuation if "___" not in item]

# Saving tokens to a file
with open('outputs/tokens_without_punctuation.txt', 'w') as f:
    f.write(str(tokensWithoutPunctuation))

fdist = FreqDist(tokensWithoutPunctuation)
# List of (token, count) pairs, most frequent first; one entry per type.
types = fdist.most_common()

# Number of types that occur exactly once. It is not added to results_table in
# this cell; it is kept as a variable so it can still be inspected.
frequenceIsOne = sum(1 for _, count in types if count == 1)

# NOTE: the (token, count) tuples are written back-to-back without any
# separator; the file format is intentionally left unchanged.
with open('outputs/tokens_without_punctuation_count.txt', 'w') as f:
    for elem in types:
        f.write(str(elem))

# Adding results to result table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the new
# rows are built in one frame and concatenated once.
new_rows = pd.DataFrame([
    {"Name": "# of words (excluding punctuation) (e)", "Result": str(len(types))},
    {"Name": "type/token ratio (excluding punctuation) (e)", "Result": str(len(types)/len(tokensWithoutPunctuation))},
    {"Name": "List the top 3 most frequent words and their frequencies (e)", "Result": str(fdist.most_common(3))},
])
results_table = pd.concat([results_table, new_rows], ignore_index=True)


In [15]:
#We are now looking at a tokenizer to remove the punctuation and the stopwords 
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer

import os

# Statistics on the corpus once punctuation, page-division underscores and
# stopwords have all been removed.

# Every file written by this cell lives in outputs/; open() does not create
# missing directories, so make sure it exists before writing.
os.makedirs("outputs", exist_ok=True)

# The following tokenizer removes punctuation: it only keeps runs of \w characters.
tokenizer = RegexpTokenizer(r'\w+')

# Import the stopwords, one per line of the file. The context manager closes
# the file once it has been read.
with open("data/stopwords.txt", "r") as stopwords_file:
    stopwords = stopwords_file.read().splitlines()
# Membership tests on a list scan the whole list for every token; a set makes
# each lookup constant-time. `stopwords` itself is left as a list.
stopword_set = set(stopwords)

tokensWithoutPunctuation = []
with open("data/mergedFiles.txt", "r") as corpus_file:
    for line in corpus_file:
        # putting everything to lowercase before tokenizing
        tokensWithoutPunctuation.extend(tokenizer.tokenize(line.lower()))

# "_" matches \w, so the page-division rules ("____") survive the tokenizer.
# Drop every token that contains a run of three underscores.
tokensWithoutPunctuation = [item for item in tokensWithoutPunctuation if "___" not in item]

# Save the tokens AFTER the "___" filter. This file is also written by the
# punctuation-only cell; writing the unfiltered list here used to overwrite it
# with different content.
with open('outputs/tokens_without_punctuation.txt', 'w') as f:
    f.write(str(tokensWithoutPunctuation))

# Removing the stopwords: a word is kept only if it is not a stopword.
tokensWithoutPunctuationAndStopwords = [
    word for word in tokensWithoutPunctuation if word not in stopword_set
]

fdist = FreqDist(tokensWithoutPunctuationAndStopwords)
# List of (token, count) pairs, most frequent first; one entry per type.
types = fdist.most_common()

# Number of types that occur exactly once. It is not added to results_table in
# this cell; it is kept as a variable so it can still be inspected.
frequenceIsOne = sum(1 for _, count in types if count == 1)

# NOTE: the (token, count) tuples are written back-to-back without any
# separator; the file format is intentionally left unchanged.
with open('outputs/tokens_without_punctuation_stopwords_count.txt', 'w') as f:
    for elem in types:
        f.write(str(elem))

# Adding results to result table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the new
# rows are built in one frame and concatenated once.
new_rows = pd.DataFrame([
    {"Name": "type/token ratio (excluding punctuation and stopwords) (f)", "Result": str(len(types)/len(tokensWithoutPunctuationAndStopwords))},
    {"Name": "List the top 3 most frequent words and their frequencies (excluding stopwords) (f)", "Result": str(fdist.most_common(3))},
])
results_table = pd.concat([results_table, new_rows], ignore_index=True)

In [16]:
# We are now asked to find the bigrams (excluding stopwords and punctuation) and compute their frequencies.


from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist

import os

# Bigram frequencies over the corpus, excluding punctuation, stopwords and
# page-division underscores. Bigrams are built line by line, so a bigram never
# spans two lines of the merged file.

# Every file written by this cell lives in outputs/; open() does not create
# missing directories, so make sure it exists before writing.
os.makedirs("outputs", exist_ok=True)

# The following tokenizer removes punctuation: it only keeps runs of \w characters.
tokenizer = RegexpTokenizer(r'\w+')

# Import the stopwords, one per line of the file. The context manager closes
# the file once it has been read.
with open("data/stopwords.txt", "r") as stopwords_file:
    stopwords = stopwords_file.read().splitlines()
# Membership tests on a list scan the whole list for every token; a set makes
# each lookup constant-time. `stopwords` itself is left as a list.
stopword_set = set(stopwords)

# The list that will store all the bigrams
bigrams = []

# Opening the corpus; tokenize each line and remove the stopwords.
with open("data/mergedFiles.txt", "r") as corpus_file:
    for line in corpus_file:
        # Lowercase and tokenize the line. A dedicated name is used so that the
        # corpus-wide `tokensWithoutPunctuation` list built by an earlier cell
        # is not overwritten with the tokens of a single line.
        line_tokens = tokenizer.tokenize(line.lower())

        # Remove the stopwords and the tokens containing the "___" substring.
        content_tokens = [
            word for word in line_tokens
            if word not in stopword_set and "___" not in word
        ]

        # Each bigram is a 2-tuple of consecutive remaining tokens.
        bigrams.extend(ngrams(content_tokens, 2))

# Using a frequency distribution to count every bigram.
bigramsFreq = FreqDist(bigrams)
# List of (bigram, count) pairs, most frequent first.
typesBigrams = bigramsFreq.most_common()

# Saving the bigrams in a file.
# NOTE: the (bigram, count) tuples are written back-to-back without any
# separator; the file format is intentionally left unchanged.
with open('outputs/bigrams.txt', 'w') as f:
    for elem in typesBigrams:
        f.write(str(elem))

# Adding to the result table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the new
# row is concatenated instead. The value is stored as a string, like every
# other "Result" entry in the table.
new_rows = pd.DataFrame([
    {"Name": "List the top 3 most frequent bigrams and their frequencies (g)", "Result": str(bigramsFreq.most_common(3))},
])
results_table = pd.concat([results_table, new_rows], ignore_index=True)

In [17]:
import os

# Saving the results table as a csv file.
# to_csv does not create missing parent directories, so make sure results/
# exists first; otherwise the export fails on a fresh checkout.
os.makedirs("results", exist_ok=True)
results_table.to_csv("results/results.csv")