## Importing Libraries

In [98]:
import pandas as pd
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import operator
import string
import nltk
from nltk.probability import FreqDist

## Testing Stanford CoreNLP

In [2]:
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('C:/Users/daksh/OneDrive/Desktop/stanford-corenlp-4.5.1', memory='8g')

# The sentence you want to parse
sentence = "I'm Looking for can't wouldn't 03/04/1999 New York New-York Rock 'n' Roll 55,500.72 Google.com Yahoo!"

# POS
print('POS：', nlp.pos_tag(sentence))

# Tokenize
print('Tokenize：', nlp.word_tokenize(sentence))

# NER
print('NER：', nlp.ner(sentence))

# Parser
print('Parser：')
print(nlp.parse(sentence))
print(nlp.dependency_parse(sentence))

# Close Stanford Parser
nlp.close()

POS： [('I', 'PRP'), ("'m", 'VBP'), ('Looking', 'VBG'), ('for', 'IN'), ('ca', 'MD'), ("n't", 'RB'), ('would', 'MD'), ("n't", 'RB'), ('03/04/1999', 'CD'), ('New', 'NNP'), ('York', 'NNP'), ('New', 'NNP'), ('-', 'HYPH'), ('York', 'NNP'), ('Rock', 'NNP'), ("'n'", 'CC'), ('Roll', 'NNP'), ('55,500.72', 'CD'), ('Google.com', 'NNP'), ('Yahoo!', 'NNP')]
Tokenize： ['I', "'m", 'Looking', 'for', 'ca', "n't", 'would', "n't", '03/04/1999', 'New', 'York', 'New', '-', 'York', 'Rock', "'n'", 'Roll', '55,500.72', 'Google.com', 'Yahoo!']
NER： [('I', 'O'), ("'m", 'O'), ('Looking', 'O'), ('for', 'O'), ('ca', 'O'), ("n't", 'O'), ('would', 'O'), ("n't", 'O'), ('03/04/1999', 'DATE'), ('New', 'STATE_OR_PROVINCE'), ('York', 'STATE_OR_PROVINCE'), ('New', 'STATE_OR_PROVINCE'), ('-', 'STATE_OR_PROVINCE'), ('York', 'STATE_OR_PROVINCE'), ('Rock', 'O'), ("'n'", 'O'), ('Roll', 'O'), ('55,500.72', 'NUMBER'), ('Google.com', 'URL'), ('Yahoo!', 'O')]
Parser：
(ROOT
  (FRAG
    (S
      (NP (PRP I))
      (VP (VBP 'm)
      

## Testing NLTK

In [3]:
string ="I'm Looking for can't wouldn't 03/04/1999 New York New-York Rock 'n' Roll 55,500.72 Google.com Yahoo!"
test_tokenizer = word_tokenize(string)
print(test_tokenizer)

['I', "'m", 'Looking', 'for', 'ca', "n't", 'would', "n't", '03/04/1999', 'New', 'York', 'New-York', 'Rock', "'", 'n', "'", 'Roll', '55,500.72', 'Google.com', 'Yahoo', '!']


## Importing the file

In [4]:
file_content = open('C:/Users/daksh/Downloads/concat.txt', encoding="utf8").read()

In [5]:
type(file_content)

str

In [6]:
len(file_content)

26807133

# Part 1

### A)

In [7]:
#Tokenizing the corpus

tokens = word_tokenize(file_content)

In [8]:
len(tokens)

4789324

In [9]:
tokens[0:50]

['CO-BRANDING',
 'AND',
 'ADVERTISING',
 'AGREEMENT',
 'THIS',
 'CO-BRANDING',
 'AND',
 'ADVERTISING',
 'AGREEMENT',
 '(',
 'the',
 '``',
 'Agreement',
 "''",
 ')',
 'is',
 'made',
 'as',
 'of',
 'June',
 '21',
 ',',
 '1999',
 '(',
 'the',
 '``',
 'Effective',
 'Date',
 "''",
 ')',
 'by',
 'and',
 'between',
 'I-ESCROW',
 ',',
 'INC.',
 ',',
 'with',
 'its',
 'principal',
 'place',
 'of',
 'business',
 'at',
 '1730',
 'S.',
 'Amphlett',
 'Blvd.',
 ',',
 'Suite']

In [10]:
#Writing the tokens to output.txt

with open("C:/Users/daksh/OneDrive/Desktop/NLP/output.txt", "w", encoding="utf-8") as output:
    for token in tokens:   
        output.write(token)
        output.write("\n")

In [11]:
#Converting the tokens to lower case for regularity
tokens = [token.lower() for token in tokens]

In [12]:
len(tokens)

4789324

In [13]:
tokens[0:50]

['co-branding',
 'and',
 'advertising',
 'agreement',
 'this',
 'co-branding',
 'and',
 'advertising',
 'agreement',
 '(',
 'the',
 '``',
 'agreement',
 "''",
 ')',
 'is',
 'made',
 'as',
 'of',
 'june',
 '21',
 ',',
 '1999',
 '(',
 'the',
 '``',
 'effective',
 'date',
 "''",
 ')',
 'by',
 'and',
 'between',
 'i-escrow',
 ',',
 'inc.',
 ',',
 'with',
 'its',
 'principal',
 'place',
 'of',
 'business',
 'at',
 '1730',
 's.',
 'amphlett',
 'blvd.',
 ',',
 'suite']

### B)

In [14]:
#Number of tokens in the corpus

print("Number of tokens in the corpus: ",len(tokens))

Number of tokens in the corpus:  4789324


In [15]:
#Lemmatizing the document so as to get correct list of unique tokens

pos_tokens = pos_tag(tokens)


lemmatizer = WordNetLemmatizer()
data = []
for word,tag in pos_tokens:
    wtag = tag[0].lower()
    wtag = wtag if wtag in ['a', 'r', 'n', 'v'] else None
    if not wtag:
        lemma = word
    else:
        lemma = lemmatizer.lemmatize(word,wtag)
    data.append(lemma)
    
    
data[0:500]

['co-branding',
 'and',
 'advertising',
 'agreement',
 'this',
 'co-branding',
 'and',
 'advertising',
 'agreement',
 '(',
 'the',
 '``',
 'agreement',
 "''",
 ')',
 'be',
 'make',
 'as',
 'of',
 'june',
 '21',
 ',',
 '1999',
 '(',
 'the',
 '``',
 'effective',
 'date',
 "''",
 ')',
 'by',
 'and',
 'between',
 'i-escrow',
 ',',
 'inc.',
 ',',
 'with',
 'its',
 'principal',
 'place',
 'of',
 'business',
 'at',
 '1730',
 's.',
 'amphlett',
 'blvd.',
 ',',
 'suite',
 '233',
 ',',
 'san',
 'mateo',
 ',',
 'california',
 '94402',
 '(',
 '``',
 'i-escrow',
 "''",
 ')',
 ',',
 'and',
 '2themart.com',
 ',',
 'inc.',
 'have',
 'its',
 'principal',
 'place',
 'of',
 'business',
 'at',
 '18301',
 'von',
 'karman',
 'avenue',
 ',',
 '7th',
 'floor',
 ',',
 'irvine',
 ',',
 'california',
 '92612',
 '(',
 '``',
 '2themart',
 "''",
 ')',
 '.',
 '1',
 '.',
 'definition',
 '.',
 '(',
 'a',
 ')',
 '``',
 'content',
 "''",
 'mean',
 'all',
 'content',
 'or',
 'information',
 ',',
 'in',
 'any',
 'medium',

In [16]:
len(data)

4789324

In [17]:
#Unique tokens in the corpus using Set data structure

unique_tokens = set(data)
print("Number of unique tokens in the corpus: ",len(unique_tokens))

Number of unique tokens in the corpus:  42674


In [18]:
#Unique tokens in the corpus using dictionary data structure

unique_tokens = {}

for token in data:
    val=unique_tokens.get(token)
    
    if val is None:
        unique_tokens[token]=1
    else:
        unique_tokens[token]=val+1

In [19]:
len(unique_tokens)

42674

In [20]:
for key in unique_tokens:
    print(key,"  ",unique_tokens.get(key))

co-branding    99
and    129054
advertising    1610
agreement    45735
this    39986
(    75436
the    257132
``    21646
''    22771
)    78092
be    80380
make    7629
as    32907
of    156122
june    290
21    588
,    240576
1999    218
effective    4252
date    11180
by    44310
between    3636
i-escrow    66
inc.    3797
with    33883
its    19777
principal    1004
place    1293
business    7512
at    13198
1730    4
s.    120
amphlett    2
blvd.    33
suite    579
233    6
san    201
mateo    13
california    389
94402    2
2themart.com    2
have    21719
18301    2
von    4
karman    4
avenue    283
7th    25
floor    216
irvine    10
92612    4
2themart    59
.    117447
1    5176
definition    826
a    51002
content    2367
mean    7037
all    21931
or    105155
information    10920
in    79933
any    62236
medium    788
provide    14903
party    46335
to    129875
other    26395
for    38724
use    12066
conjunction    138
performance    3201
obligation    7802
hereunder    

systemoverall    1
17.6.2    1
set-    3
7930    1
mclean    7
22102    4
acquit    1
17.11.1    1
17.11.2    1
17.11.3    1
17.11.4    1
17.11.5    1
17.11.6    1
bearer    3
17.13.1    1
17.13.1.1    1
17.13.1.2    1
17.13.1.3    1
17.13.1.4    1
punishable    2
17.13.1.5    1
17.13.1.6    1
17.13.1.7    1
17.13.1.8    1
17.13.2    1
19m    1
home2    1
tagline    3
trs    4
galloway    5
415-6500    1
juallowavraricap.com    1
crestline    1
3950    1
22030    1
james.carrollcrestlinehotels.com    1
pierre.donahueacrestlinehotels.com    1
6905    2
06614    1
midnight    57
requirements/special    1
14.4.1.1.    1
lessor/sublessor    1
corn    1
holthouser    4
3/14/14    1
lir    1
3/i4/11    1
homewoo    1
delawar    1
uites    1
nifty    1
inncotle    1
borct    1
ed:41017    1
relicensed    1
maim    1
kenneth-savage    1
jan4-872013    1
date-    1
sep-08-2013    2
bratamanauement    1
fla    1
hight    1
corinne.hight    1
hilton.com    1
901-374-6044    1
2005-02-14    1
corr

In [21]:
#Type/Token ratio

print("The type/token ratio is: ",len(unique_tokens),"/",len(data),"\t OR ",len(unique_tokens)/len(data))

The type/token ratio is:  42674 / 4789324 	 OR  0.008910234513263249


### C

In [22]:
#Sorting in descending order of frequency

sorted_tokens = dict( sorted(unique_tokens.items(), key=operator.itemgetter(1),reverse=True))

In [23]:
#Writing the tokens and their frequency to tokens.txt

with open("C:/Users/daksh/OneDrive/Desktop/NLP/tokens.txt", "w", encoding="utf-8") as output:
    for token in sorted_tokens:
        string = "Token: {} \t Frequency:{}".format(token,sorted_tokens.get(token))
        output.write(string)
        output.write("\n")

### D)

In [24]:
# Counting the number of tokens that appeared only once in the corpus

# O(N) Time complexity

freq_once = 0

for key,value in sorted_tokens.items():
    if value==1:
        freq_once = freq_once+1
    else:
        continue

print("Number of tokens only appearing once are: ",freq_once)

Number of tokens only appearing once are:  19328


### E)

In [25]:
#Remove Http URLs
import re

removed_http=[]

for d in data:
    val = re.sub(r'http\S+', '', d)
    if val!='':
        removed_http.append(val)

In [26]:
#Remove digits not attached to words

test_string="October 2, 2010 ? - @tell 02/01/2010 #co-brand hello, 2.00 78% & 76% Blvd., Hi !.... $3000 www.hello.com 2010 'Apologies' hello. can't 50.00 hello 2TheMart. The end."
test = word_tokenize(test_string)

removed_dig=[]
for d in removed_http:
    val = re.sub(r'\b\d+\b', '', d)
    if val!='':
        removed_dig.append(val)

In [27]:
#Remove the punctuations and other sybmols

regex = r"(?<!\w)[!.'""#$%&()*+,-/:;<=>?@[\]^_`{|}~](?!\w)"

removed_pun=[]
for d in removed_dig:
    val = re.sub(regex, "", d)
    if val!='':
        removed_pun.append(val)

removed_punc=[]
for d in removed_pun:
    if d[0]=="'" or d[0]=='"':
        d=d[1:len(d)]
    removed_punc.append(d)

In [28]:
len(removed_punc)

3891726

In [29]:
removed_punc[0:500]

['co-branding',
 'and',
 'advertising',
 'agreement',
 'this',
 'co-branding',
 'and',
 'advertising',
 'agreement',
 'the',
 'agreement',
 'be',
 'make',
 'as',
 'of',
 'june',
 'the',
 'effective',
 'date',
 'by',
 'and',
 'between',
 'i-escrow',
 'inc.',
 'with',
 'its',
 'principal',
 'place',
 'of',
 'business',
 'at',
 's.',
 'amphlett',
 'blvd.',
 'suite',
 'san',
 'mateo',
 'california',
 'i-escrow',
 'and',
 '2themart.com',
 'inc.',
 'have',
 'its',
 'principal',
 'place',
 'of',
 'business',
 'at',
 'von',
 'karman',
 'avenue',
 '7th',
 'floor',
 'irvine',
 'california',
 '2themart',
 'definition',
 'a',
 'content',
 'mean',
 'all',
 'content',
 'or',
 'information',
 'in',
 'any',
 'medium',
 'provide',
 'by',
 'a',
 'party',
 'to',
 'the',
 'other',
 'party',
 'for',
 'use',
 'in',
 'conjunction',
 'with',
 'the',
 'performance',
 'of',
 'its',
 'obligation',
 'hereunder',
 'include',
 'without',
 'limitation',
 'any',
 'text',
 'music',
 'sound',
 'photograph',
 'video',
 

In [41]:
def frequent_words(param_data):
    fdist = FreqDist(token for token in param_data)
    return fdist

In [45]:
fdist = frequent_words(removed_punc)
topWords = fdist.most_common(20)
unique_lex_diversity = set(removed_punc)

In [44]:
topWords

[('the', 257136),
 ('of', 156122),
 ('to', 129875),
 ('and', 129054),
 ('or', 105156),
 ('be', 80380),
 ('in', 79934),
 ('any', 62236),
 ('a', 51002),
 ('shall', 48794),
 ('party', 46335),
 ('agreement', 45737),
 ('by', 44310),
 ('this', 39986),
 ('for', 38724),
 ('such', 36172),
 ('with', 33883),
 ('as', 32909),
 ('that', 27654),
 ('other', 26395)]

In [47]:
print("Unique tokens after removal of punctuations and digits: ",len(unique_lex_diversity))

Unique tokens after removal of punctuations and digits:  31031


In [49]:
print("Lexical Diversity: ", len(unique_lex_diversity)," / ",len(removed_punc),"\t OR ",len(unique_lex_diversity)/len(removed_punc))

Lexical Diversity:  31031  /  3891726 	 OR  0.007973582929527927


### F)

In [85]:
import urllib.request

#Loading stopwords from assignment description
data = urllib.request.urlopen("https://www.site.uottawa.ca/~diana/csi5180/StopWords")
uni_stopwords = []
for line in data:
    line = line.decode('utf-8')

    uni_stopwords.append(line.rstrip('\n'))

uni_stopwords

['a',
 'about',
 'above',
 'ac',
 'according',
 'accordingly',
 'across',
 'actually',
 'ad',
 'adj',
 'af',
 'after',
 'afterwards',
 'again',
 'against',
 'al',
 'albeit',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'als',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'apart',
 'apparently',
 'are',
 'aren',
 'arise',
 'around',
 'as',
 'aside',
 'at',
 'au',
 'auf',
 'aus',
 'aux',
 'av',
 'avec',
 'away',
 'b',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'began',
 'begin',
 'beginning',
 'begins',
 'behind',
 'bei',
 'being',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'billion',
 'both',
 'briefly',
 'but',
 'by',
 'c',
 'came',
 'can',
 'cannot',
 'canst',
 'caption',
 'captions',
 'certain',
 'certainly',
 'cf',
 'choose',
 'chooses',
 'choosing',
 'chose',
 'c

In [89]:
# Removing stop words
stop_words = stopwords.words('english') #NLTK stopwords
stop_words.extend(uni_stopwords) #Adding NLTK stopwords and University provided stopwords

stopwords= set(stop_words)

stopwords

179
812


{'a',
 'about',
 'above',
 'ac',
 'according',
 'accordingly',
 'across',
 'actually',
 'ad',
 'adj',
 'af',
 'after',
 'afterwards',
 'again',
 'against',
 'ain',
 'al',
 'albeit',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'als',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'apart',
 'apparently',
 'are',
 'aren',
 "aren't",
 'arise',
 'around',
 'as',
 'aside',
 'at',
 'au',
 'auf',
 'aus',
 'aux',
 'av',
 'avec',
 'away',
 'b',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'began',
 'begin',
 'beginning',
 'begins',
 'behind',
 'bei',
 'being',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'billion',
 'both',
 'briefly',
 'but',
 'by',
 'c',
 'came',
 'can',
 'cannot',
 'canst',
 'caption',
 'captions',
 'certain',
 'certainly',
 'cf',
 'choose',
 'chooses',
 'choos

In [90]:
words = [word for word in removed_punc if word not in stopwords]

In [91]:
words[0:500]

['co-branding',
 'advertising',
 'agreement',
 'co-branding',
 'advertising',
 'agreement',
 'agreement',
 'june',
 'effective',
 'date',
 'i-escrow',
 'inc.',
 'principal',
 'place',
 'business',
 's.',
 'amphlett',
 'blvd.',
 'suite',
 'san',
 'mateo',
 'california',
 'i-escrow',
 '2themart.com',
 'inc.',
 'principal',
 'place',
 'business',
 'karman',
 'avenue',
 '7th',
 'floor',
 'irvine',
 'california',
 '2themart',
 'definition',
 'content',
 'mean',
 'content',
 'information',
 'medium',
 'party',
 'party',
 'conjunction',
 'performance',
 'obligation',
 'hereunder',
 'limitation',
 'text',
 'music',
 'sound',
 'photograph',
 'video',
 'graphic',
 'data',
 'software',
 'content',
 '2themart',
 'refer',
 '2themart',
 'content',
 'content',
 'i-escrow',
 'refer',
 'i-escrow',
 'content',
 'co-branded',
 'site',
 'mean',
 'web-site',
 'accessible',
 'domain',
 'name',
 'service',
 'implement',
 'i-escrow',
 'homepage',
 'web-site',
 'visibly',
 'display',
 '2themart',
 'mark',
 'i-

In [92]:
#Top 20 words and their frequencies

fdist = frequent_words(words)
topWords = fdist.most_common(20)
topWords

[('party', 46335),
 ('agreement', 45737),
 ('product', 18925),
 ('right', 14725),
 ('section', 14216),
 ('term', 13292),
 ('company', 12905),
 ('service', 12293),
 ('date', 11180),
 ('information', 10920),
 ('agree', 8946),
 ('write', 8614),
 ('material', 8145),
 ('law', 8050),
 ('notice', 7815),
 ('obligation', 7802),
 ('applicable', 7533),
 ('business', 7512),
 ('set', 7289),
 ('respect', 7055)]

In [95]:
#Lexical density
unique_lexdensity = set(words)
print("Unique token types after removing stopwords: ",len(unique_lexdensity))
print("Lexical density: ",len(unique_lexdensity)," / ",len(words),"\t OR ",len(unique_lexdensity)/len(words))

Unique token types after removing stopwords:  30474
Lexical density:  30474  /  1868715 	 OR  0.016307462614684423


### G)

In [102]:
#Computing bigrams

nltk_bigrams = nltk.bigrams(words)

In [103]:
bigrams = list(nltk_bigrams)

In [104]:
bigrams[0:10]

[('co-branding', 'advertising'),
 ('advertising', 'agreement'),
 ('agreement', 'co-branding'),
 ('co-branding', 'advertising'),
 ('advertising', 'agreement'),
 ('agreement', 'agreement'),
 ('agreement', 'june'),
 ('june', 'effective'),
 ('effective', 'date'),
 ('date', 'i-escrow')]

In [105]:
freq_bigrams = frequent_words(bigrams)
topGrams = freq_bigrams.most_common(20)
topGrams

[(('confidential', 'information'), 3601),
 (('intellectual', 'property'), 2927),
 (('term', 'agreement'), 2911),
 (('effective', 'date'), 2846),
 (('…', '…'), 2466),
 (('write', 'notice'), 2413),
 (('agreement', 'party'), 2380),
 (('term', 'condition'), 2190),
 (('applicable', 'law'), 2082),
 (('party', 'party'), 1967),
 (('party', 'agree'), 1962),
 (('set', 'section'), 1914),
 (('prior', 'write'), 1814),
 (('provision', 'agreement'), 1671),
 (('confidential', 'treatment'), 1535),
 (('receive', 'party'), 1515),
 (('termination', 'agreement'), 1436),
 (('security', 'exchange'), 1423),
 (('disclose', 'party'), 1422),
 (('pursuant', 'section'), 1416)]