# Collocations

In [1]:
#load all libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
import string

In [2]:
#load reviews data
reviews = pd.read_excel('C:/Users/beyku/CPT346 Assignment 1/data.csv', quotechar="'")

In [3]:
reviews.head(25)

Unnamed: 0,Date,News,Source
0,2007-12-15,Credit market strains hang on banks results\n\...,https://www.reuters.com/article/us-investment-...
1,2007-12-15,Dollar likely to weather any U.S. recession\n\...,https://www.reuters.com/article/us-investment-...
2,2007-12-14,Central bank plan delays day of reckoning\n\nN...,https://www.reuters.com/article/us-investment-...
3,2007-12-14,Inflation seen hanging over 2008 world economy...,https://www.reuters.com/article/us-investment-...
4,2007-12-14,Investors see housing as top U.S. election the...,https://www.reuters.com/article/us-investment-...
5,2007-12-31,2007 Market in Review: Burning Down the House\...,https://www.cnbc.com/id/22452086
6,2007-12-18,Fed Shrugged as Subprime Crisis Spread\n\nWASH...,https://www.nytimes.com/2007/12/18/business/18...
7,2007-12-12,"Greenspan: subprime ""accident waiting to happe...",https://www.reuters.com/article/us-greenspan-e...
8,2007-12-12,Fed cuts rates by a quarter point\nBen Bernank...,https://money.cnn.com/2007/12/11/news/economy/...
9,2007-12-30,Stocks End the Year With Only Modest Gains\n\n...,https://www.cnbc.com/id/22422124


Extract only the reviews...

In [4]:
comments = reviews['News']

## Preprocessing

In [5]:
#function to remove non-ascii characters
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

In [6]:
comments = comments.astype('str')

In [7]:
#remove non-ascii characters
comments = comments.map(lambda x: _removeNonAscii(x))

In [8]:
comments.head()

0    Credit market strains hang on banks results\n\...
1    Dollar likely to weather any U.S. recession\n\...
2    Central bank plan delays day of reckoning\n\nN...
3    Inflation seen hanging over 2008 world economy...
4    Investors see housing as top U.S. election the...
Name: News, dtype: object

In [9]:
#get stop words of all languages
STOPWORDS_DICT = {lang: set(nltk.corpus.stopwords.words(lang)) for lang in nltk.corpus.stopwords.fileids()}

In [10]:
#function to detect language based on # of stop words for particular language
def get_language(text):
    words = set(nltk.wordpunct_tokenize(text.lower()))
    lang = max(((lang, len(words & stopwords)) for lang, stopwords in STOPWORDS_DICT.items()), key = lambda x: x[1])[0]
    if lang == 'english':
        return True
    else:
        return False

In [11]:
#filter for only english comments
eng_comments=comments[comments.apply(get_language)]

In [12]:
eng_comments[0]

"Credit market strains hang on banks results\n\nNEW YORK (Reuters) - Credit market strains may start to abate in early 2008 once banks get through the next round of revealing losses from riskier assets, fund managers at the Reuters Investment Outlook 2008 Summit said this week.\n\nCentral banks plan, unveiled on Wednesday, to add temporary reserves may also help to cushion the global banking system from shocks of more losses from the melt down in U.S. subprime mortgages. But for elevated short-term borrowing costs to come down substantially, market participants need a more up-to-date tally of how much red ink the banking system has bled, fund managers said.\n\nUntil then, the specter of uncertainty about banks unknown losses in credit markets will likely haunt banks and clam up lending. Banks financial results issued in early 2008 may be ugly, but investors will be relieved to know the worst.\n\nTom Sowanick, chief investment officer with Clearbrook Financial LLC in Princeton, New Jers

In [13]:
#drop duplicates
eng_comments.drop_duplicates(inplace=True)

In [14]:
#load spacy
nlp = spacy.load('en_core_web_sm')

In [15]:
#function to clean and lemmatize comments
def clean_comments(text):
    #remove punctuations
    regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')
    nopunct = regex.sub(" ", str(text))
    #use spacy to lemmatize comments
    doc = nlp(nopunct, disable=['parser','ner'])
    lemma = [token.lemma_ for token in doc]
    return lemma

In [16]:
eng_comments

0     Credit market strains hang on banks results\n\...
1     Dollar likely to weather any U.S. recession\n\...
2     Central bank plan delays day of reckoning\n\nN...
3     Inflation seen hanging over 2008 world economy...
4     Investors see housing as top U.S. election the...
5     2007 Market in Review: Burning Down the House\...
6     Fed Shrugged as Subprime Crisis Spread\n\nWASH...
7     Greenspan: subprime "accident waiting to happe...
8     Fed cuts rates by a quarter point\nBen Bernank...
9     Stocks End the Year With Only Modest Gains\n\n...
10    Maxjet collapses into bankruptcy\n\nMaxjet Air...
11    Bankers look to Fed to restore confidence\n\nE...
12    Lehman faces legal threat over CDO deals\n\nLe...
13    Japan big banks reluctant to pay for subprime ...
14    Bank of America shutting $12 billion cash fund...
15    The rating game: CDOs crash, banks may reveal ...
16    Housing meltdown hits US economy\n\nThe sudden...
17    Foreclosure wave sweeps America\n\nA wave 

In [17]:
#apply function to clean and lemmatize comments
lemmatized = eng_comments.map(clean_comments)

In [18]:
lemmatized

0     [credit, market, strain, hang, on, bank, resul...
1     [dollar, likely, to, weather, any, U, S,  , re...
2     [central, bank, plan, delay, day, of, reckon, ...
3     [inflation, see, hang, over, 2008, world, econ...
4     [investor, see, housing, as, top, U, S,  , ele...
5     [2007, market, in, Review,  , burn, down, the,...
6     [Fed, shrug, as, Subprime, Crisis, Spread,  , ...
7     [Greenspan,  , subprime,  , accident, wait, to...
8     [Fed, cut, rate, by, a, quarter, point, Ben, B...
9     [stock, end, the, Year, with, only, Modest, Ga...
10    [maxjet, collapse, into, bankruptcy,  , Maxjet...
11    [banker, look, to, Fed, to, restore, confidenc...
12    [Lehman, face, legal, threat, over, CDO, deal,...
13    [Japan, big, bank, reluctant, to, pay, for, su...
14    [Bank, of, America, shut,  , 12, billion, cash...
15    [the, rating, game,  , CDOs, crash,  , bank, m...
16    [housing, meltdown, hit, US, economy,  , the, ...
17    [foreclosure, wave, sweep, America,  , a, 

In [19]:
#make sure to lowercase everything
lemmatized = lemmatized.map(lambda x: [word.lower() for word in x])

In [20]:
lemmatized.head()

0    [credit, market, strain, hang, on, bank, resul...
1    [dollar, likely, to, weather, any, u, s,  , re...
2    [central, bank, plan, delay, day, of, reckon, ...
3    [inflation, see, hang, over, 2008, world, econ...
4    [investor, see, housing, as, top, u, s,  , ele...
Name: News, dtype: object

In [21]:
#turn all comments' tokens into one single list
unlist_comments = [item for items in lemmatized for item in items]

In [22]:
uni_freq = nltk.FreqDist(nltk.ngrams(unlist_comments, 1))
unigramFreqTable = pd.DataFrame(list(uni_freq.items()), columns=['unigram','freq']).sort_values(by='freq', ascending=False)
unigramFreqTable

Unnamed: 0,unigram,freq
7,"( ,)",1646
22,"(the,)",1223
40,"( ,)",643
97,"(be,)",594
25,"(of,)",580
...,...,...
874,"(campaign,)",1
1913,"(creditor,)",1
1912,"(filing,)",1
1911,"(unsuccessful,)",1


In [30]:
bi_freq = nltk.FreqDist(nltk.ngrams(unlist_comments, 2))
bigramFreqTable = pd.DataFrame(list(bi_freq.items()), columns=['bigram','freq']).sort_values(by='freq', ascending=False)
bigramFreqTable = bigramFreqTable.reset_index(drop=True)
bigramFreqTable

Unnamed: 0,bigram,freq
0,"(of, the)",125
1,"(in, the)",122
2,"( , the)",109
3,"( , the)",89
4,"( , and)",79
...,...,...
13900,"(will, stabilize)",1
13901,"(stabilize, the)",1
13902,"(the, now)",1
13903,"(now, uncertain)",1


## Initialize NLTK's Bigrams/Trigrams Finder

In [22]:
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()

In [23]:
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_comments)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(unlist_comments)

## 1. Counting Frequencies of Adjacent Words
- Main idea: simply order by frequency
- Issues: too sensitive to very frequent pairs and pronouns/articles/prepositions come up often
- Solution: filter for only adjectives and nouns

In [24]:
bigram_freq = bigramFinder.ngram_fd.items()

In [25]:
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

In [26]:
bigramFreqTable.reset_index(drop=True)

Unnamed: 0,bigram,freq
0,"(of, the)",125
1,"(in, the)",122
2,"( , the)",109
3,"( , the)",89
4,"( , and)",79
...,...,...
13900,"(will, stabilize)",1
13901,"(stabilize, the)",1
13902,"(the, now)",1
13903,"(now, uncertain)",1


In [27]:
bigramFreqTable[:10]

Unnamed: 0,bigram,freq
192,"(of, the)",125
327,"(in, the)",122
119,"( , the)",109
484,"( , the)",89
346,"( , and)",79
1279,"(the, fed)",59
79,"(s, )",50
171,"( , say)",48
78,"(u, s)",46
432,"(have, be)",46


In [28]:
#get english stopwords
en_stopwords = set(stopwords.words('english'))

In [29]:
#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords:
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False

In [30]:
#filter bigrams
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

In [31]:
filtered_bi[:20]

Unnamed: 0,bigram,freq
599,"(interest, rate)",29
8,"(new, york)",28
649,"(housing, market)",25
9355,"(sub, prime)",23
1671,"(wall, street)",23
81,"(subprime, mortgage)",22
48,"(central, bank)",22
0,"(credit, market)",19
1238,"(federal, reserve)",17
1954,"(rate, cut)",16


In [32]:
trigram_freq = trigramFinder.ngram_fd.items()

In [33]:
trigramFreqTable = pd.DataFrame(list(trigram_freq), columns=['trigram','freq']).sort_values(by='freq', ascending=False)

In [34]:
trigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,trigram,freq
0,"(u, s, )",46
1,"(the, u, s)",18
2,"( , he, say)",15
3,"( , charts, )",15
4,"(he, say, )",14


In [35]:
trigramFreqTable[:10]

Unnamed: 0,trigram,freq
80,"(u, s, )",46
437,"(the, u, s)",18
1194,"( , he, say)",15
16801,"( , charts, )",15
1195,"(he, say, )",14
1439,"(the, federal, reserve)",14
16864,"( , fortune, 500)",13
16863,"(charts, , fortune)",13
4779,"(mr, , greenspan)",12
730,"(the, housing, market)",12


In [36]:
def rightTypesTri(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or '  ' in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords:
            return False
    first_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    third_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in first_type and tags[2][1] in third_type:
        return True
    else:
        return False

In [37]:
filtered_tri = trigramFreqTable[trigramFreqTable.trigram.map(lambda x: rightTypesTri(x))]

In [38]:
filtered_tri[:20]

Unnamed: 0,trigram,freq
5850,"(mortgage, back, security)",10
13047,"(sub, prime, mortgage)",8
640,"(collateralized, debt, obligation)",6
167,"(chief, investment, officer)",6
353,"(adjustable, rate, mortgage)",6
14354,"(sub, prime, lending)",5
3041,"(subprime, mortgage, crisis)",5
7493,"(cnnmoney, com, )",5
382,"(money, market, fund)",5
1503,"(quarter, percentage, point)",4


In [39]:
freq_bi = filtered_bi[:20].bigram.values

In [40]:
freq_tri = filtered_tri[:20].trigram.values

## 2. PMI

In [41]:
bigramFinder.apply_freq_filter(0)

In [42]:
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

In [43]:
filtered_bi_PMI = bigramPMITable[bigramPMITable.bigram.map(lambda x: rightTypes(x))]

In [45]:
filtered_bi_PMI[:20]

Unnamed: 0,bigram,PMI
92,"(reuben, brother)",14.498475
89,"(red, ink)",14.498475
88,"(recovery, thank)",14.498475
86,"(protest, tactic)",14.498475
85,"(premium, cabin)",14.498475
82,"(pce, deflator)",14.498475
81,"(owner, occupier)",14.498475
80,"(owen, fitzpatrick)",14.498475
78,"(nsw, ministerial)",14.498475
77,"(northern, rock)",14.498475


In [65]:
trigramFinder.apply_freq_filter(0)

In [66]:
trigramPMITable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.pmi)), columns=['trigram','PMI']).sort_values(by='PMI', ascending=False)

In [67]:
filtered_tri_PMI = trigramPMITable[trigramPMITable.trigram.map(lambda x: rightTypesTri(x))]

In [68]:
filtered_tri_PMI[:20]

Unnamed: 0,trigram,PMI
0,"(cnnmoney, com, )",23.353094
1,"(collateralized, debt, obligation)",21.382241
2,"(chief, investment, officer)",18.888426
4,"(sub, prime, lending)",17.050483
7,"(sub, prime, mortgage)",15.854086
9,"(mortgage, back, security)",15.542034
10,"(adjustable, rate, mortgage)",14.596961
16,"(money, market, fund)",13.359318
21,"(subprime, mortgage, crisis)",12.363842


In [50]:
pmi_bi = bigramPMITable[:20].bigram.values

In [51]:
pmi_tri = trigramPMITable[:20].trigram.values

## 3. t-test

In [52]:
bigramTtable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.student_t)), columns=['bigram','t']).sort_values(by='t', ascending=False)

In [53]:
bigramTtable.head()

Unnamed: 0,bigram,t
0,"(in, the)",8.878312
1,"(of, the)",8.439249
2,"(the, fed)",7.144585
3,"(u, s)",6.740135
4,"( , which)",6.061542


In [54]:
filteredT_bi = bigramTtable[bigramTtable.bigram.map(lambda x: rightTypes(x))]

In [55]:
filteredT_bi[:10]

Unnamed: 0,bigram,t
10,"(interest, rate)",5.349688
11,"(new, york)",5.276871
17,"(housing, market)",4.919978
18,"(wall, street)",4.790652
19,"(sub, prime)",4.790444
22,"(central, bank)",4.66117
23,"(subprime, mortgage)",4.596794


In [56]:
trigramTtable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.student_t)), columns=['trigram','t']).sort_values(by='t', ascending=False)

In [57]:
trigramTtable.head()

Unnamed: 0,trigram,t
0,"(u, s, )",6.779329


In [58]:
filteredT_tri = trigramTtable[trigramTtable.trigram.map(lambda x: rightTypesTri(x))]

In [59]:
filteredT_tri.head(10)

Unnamed: 0,trigram,t


In [60]:
t_bi = filteredT_bi[:20].bigram.values

In [61]:
t_tri = filteredT_tri[:20].trigram.values

## 4. Chi-Square

In [62]:
bigramChiTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.chi_sq)), columns=['bigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [63]:
bigramChiTable.head(20)

Unnamed: 0,bigram,chi-sq
0,"(wall, street)",21292.47817
1,"(sub, prime)",20472.651206
2,"(new, york)",10110.605924
3,"(u, s)",7362.493215
4,"(interest, rate)",4375.758444
5,"(central, bank)",3507.5486
6,"(housing, market)",1526.282783
7,"(he, say)",1414.836252
8,"(subprime, mortgage)",1068.069036
9,"(the, fed)",774.110317


In [64]:
trigramChiTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.chi_sq)), columns=['trigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [65]:
trigramChiTable.head(20)

Unnamed: 0,trigram,chi-sq
0,"(u, s, )",103942.844586


In [66]:
chi_bi = bigramChiTable[:20].bigram.values

In [67]:
chi_tri = trigramChiTable[:20].trigram.values

## 5. Likelihood

In [68]:
bigramLikTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.likelihood_ratio)), columns=['bigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [69]:
bigramLikTable.head()

Unnamed: 0,bigram,likelihood ratio
0,"(u, s)",483.817023
1,"(wall, street)",350.086489
2,"(sub, prime)",345.428345
3,"(new, york)",344.41877
4,"(interest, rate)",281.210175


In [70]:
filteredLik_bi = bigramLikTable[bigramLikTable.bigram.map(lambda x: rightTypes(x))]

In [71]:
filteredLik_bi.head(10)

Unnamed: 0,bigram,likelihood ratio
1,"(wall, street)",350.086489
2,"(sub, prime)",345.428345
3,"(new, york)",344.41877
4,"(interest, rate)",281.210175
7,"(central, bank)",214.749694
10,"(housing, market)",173.213583
13,"(subprime, mortgage)",139.338942


In [72]:
trigramLikTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.likelihood_ratio)), columns=['trigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [73]:
trigramLikTable.head()

Unnamed: 0,trigram,likelihood ratio
0,"(u, s, )",1094.486137


In [74]:
filteredLik_tri = trigramLikTable[trigramLikTable.trigram.map(lambda x: rightTypesTri(x))]

In [75]:
filteredLik_tri.head(20)

Unnamed: 0,trigram,likelihood ratio


In [76]:
lik_bi = filteredLik_bi[:20].bigram.values

In [77]:
lik_tri = filteredLik_tri[:20].trigram.values

## Bigram Comparison

In [78]:
bigramsCompare = pd.DataFrame([freq_bi, pmi_bi, t_bi, chi_bi, lik_bi]).T

In [79]:
bigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likeihood Ratio Test With Filter']

In [80]:
bigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(interest, rate)","(wall, street)","(interest, rate)","(wall, street)","(wall, street)"
1,"(new, york)","(sub, prime)","(new, york)","(sub, prime)","(sub, prime)"
2,"(housing, market)","(new, york)","(housing, market)","(new, york)","(new, york)"
3,"(sub, prime)","(u, s)","(wall, street)","(u, s)","(interest, rate)"
4,"(wall, street)","(central, bank)","(sub, prime)","(interest, rate)","(central, bank)"
5,"(subprime, mortgage)","(interest, rate)","(central, bank)","(central, bank)","(housing, market)"
6,"(central, bank)","(housing, market)","(subprime, mortgage)","(housing, market)","(subprime, mortgage)"
7,"(credit, market)","(he, say)",,"(he, say)",
8,"(federal, reserve)","(subprime, mortgage)",,"(subprime, mortgage)",
9,"(rate, cut)","(there, be)",,"(the, fed)",


## Trigram Comparison

In [81]:
trigramsCompare = pd.DataFrame([freq_tri, pmi_tri, t_tri, chi_tri, lik_tri]).T

In [82]:
trigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likeihood Ratio Test With Filter']

In [83]:
trigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(mortgage, back, security)","(u, s, )",,"(u, s, )",
1,"(sub, prime, mortgage)",,,,
2,"(collateralized, debt, obligation)",,,,
3,"(chief, investment, officer)",,,,
4,"(adjustable, rate, mortgage)",,,,
5,"(sub, prime, lending)",,,,
6,"(subprime, mortgage, crisis)",,,,
7,"(cnnmoney, com, )",,,,
8,"(money, market, fund)",,,,
9,"(quarter, percentage, point)",,,,
