# Collocations

In [None]:
#load all libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
import string
import datasets

Data: https://www.kaggle.com/datafiniti/hotel-reviews/data

In [3]:
#load reviews data
aw = datasets.load_dataset('amazon_us_reviews', 'Electronics_v1_00')['train']

In [5]:
print(aw[0])
print(aw.shape)

{'marketplace': 'US', 'customer_id': '41409413', 'review_id': 'R2MTG1GCZLR2DK', 'product_id': 'B00428R89M', 'product_parent': '112201306', 'product_title': 'yoomall 5M Antenna WIFI RP-SMA Female to Male Extensionl Cable', 'product_category': 'Electronics', 'star_rating': 5, 'helpful_votes': 0, 'total_votes': 0, 'vine': 0, 'verified_purchase': 1, 'review_headline': 'Five Stars', 'review_body': 'As described.', 'review_date': '2015-08-31'}
(3093869, 15)


Extract only the reviews...

In [14]:
comments = aw.select(range(30000))['review_body']
print(comments[:10])

['As described.', 'It works as advertising.', 'Works pissa', 'Did not work at all.', 'Works well. Bass is somewhat lacking but is present. Overall pleased with the item.', "The quality on these speakers is insanely good and doesn't sound muddy when adjusting bass. Very happy with these.", 'Wish I could give this product more than five stars. Lifesaver.', 'works great', 'Great sound and compact. Battery life seems good. Happy with this product.', 'It works well~~~']


## Preprocessing

In [15]:
#function to remove non-ascii characters
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

In [16]:
print(aw[4]['review_body'])
print(comments[4])

Works well. Bass is somewhat lacking but is present. Overall pleased with the item.
Works well. Bass is somewhat lacking but is present. Overall pleased with the item.


In [18]:
#remove non-ascii characters
comments = [_removeNonAscii(x) for x in comments]

In [20]:
#get stop words of all languages
STOPWORDS_DICT = {lang: set(nltk.corpus.stopwords.words(lang)) for lang in nltk.corpus.stopwords.fileids()}

In [23]:
#load spacy
nlp = spacy.load('en_core_web_sm')

In [24]:
#function to clean and lemmatize comments
def clean_comments(text):
    #remove punctuations
    regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')
    nopunct = regex.sub(" ", str(text))
    #use spacy to lemmatize comments
    doc = nlp(nopunct, disable=['parser','ner'])
    lemma = [token.lemma_ for token in doc]
    return lemma

In [25]:
#apply function to clean and lemmatize comments
lemmatized = [clean_comments(x) for x in comments]

In [26]:
print(lemmatized[:5])

[['as', 'describe'], ['it', 'work', 'as', 'advertising'], ['work', 'pissa'], ['do', 'not', 'work', 'at', 'all'], ['work', 'well', ' ', 'Bass', 'be', 'somewhat', 'lacking', 'but', 'be', 'present', ' ', 'Overall', 'pleased', 'with', 'the', 'item']]


In [27]:
#make sure to lowercase everything
lemmatized = [[word.lower() for word in x] for x in lemmatized]

In [None]:
lemmatized[:5]

[['as', 'describe'],
 ['it', 'work', 'as', 'advertising'],
 ['work', 'pissa'],
 ['do', 'not', 'work', 'at', 'all'],
 ['work',
  'well',
  ' ',
  'bass',
  'be',
  'somewhat',
  'lacking',
  'but',
  'be',
  'present',
  ' ',
  'overall',
  'pleased',
  'with',
  'the',
  'item']]

In [30]:
#turn all comments' tokens into one single list
unlist_comments = [item for items in lemmatized for item in items]

## Initialize NLTK's Bigrams/Trigrams Finder

In [31]:
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()

In [32]:
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_comments)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(unlist_comments)

## 1. Counting Frequencies of Adjacent Words
- Main idea: simply order by frequency
- Issues: too sensitive to very frequent pairs and pronouns/articles/prepositions come up often
- Solution: filter for only adjectives and nouns

In [33]:
bigram_freq = bigramFinder.ngram_fd.items()

In [34]:
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

In [35]:
bigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,bigram,freq
0,"( , i)",11580
1,"(br, )",8025
2,"( , br)",7101
3,"( , the)",6341
4,"(br, )",5838


In [36]:
bigramFreqTable[:10]

Unnamed: 0,bigram,freq
210,"( , i)",11580
192,"(br, )",8025
191,"( , br)",7101
1276,"( , the)",6341
290,"(br, )",5838
93,"( , and)",5481
291,"( , br)",5450
358,"( , i)",5315
269,"( , but)",5170
726,"( , it)",4726


In [37]:
#get english stopwords
en_stopwords = set(stopwords.words('english'))

In [38]:
#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords:
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False

In [39]:
#filter bigrams
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

In [42]:
filtered_bi[:30]

Unnamed: 0,bigram,freq
192,"(br, )",8025
290,"(br, )",5838
291,"( , br)",5450
1824,"(sound, quality)",1487
64,"(great, sound)",736
440,"(great, product)",730
9956,"(br, )",612
434,"(work, fine)",497
1915,"(good, sound)",491
6036,"(good, quality)",463


In [43]:
trigram_freq = trigramFinder.ngram_fd.items()

In [44]:
trigramFreqTable = pd.DataFrame(list(trigram_freq), columns=['trigram','freq']).sort_values(by='freq', ascending=False)

In [45]:
trigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,trigram,freq
0,"(br, , br)",5322
1,"( , br, )",4987
2,"( , br, )",4797
3,"( , br, )",1931
4,"( , 34, )",1554


In [46]:
trigramFreqTable[:10]

Unnamed: 0,trigram,freq
310,"(br, , br)",5322
311,"( , br, )",4987
425,"( , br, )",4797
198,"( , br, )",1931
280,"( , 34, )",1554
510,"( , i, have)",1271
574,"(br, , i)",1251
338,"( , i, be)",1069
957,"(for, the, price)",1045
818,"( , it, be)",987


In [47]:
def rightTypesTri(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or '  ' in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords:
            return False
    first_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    third_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in first_type and tags[2][1] in third_type:
        return True
    else:
        return False

In [48]:
filtered_tri = trigramFreqTable[trigramFreqTable.trigram.map(lambda x: rightTypesTri(x))]

In [50]:
filtered_tri[:50]

Unnamed: 0,trigram,freq
310,"(br, , br)",5322
449,"( , br, )",277
15059,"( , br, )",154
3320,"(good, sound, quality)",111
6961,"(great, sound, quality)",83
14027,"(great, little, speaker)",58
11437,"(product, work, great)",51
23010,"(micro, sd, card)",51
17263,"(worth, every, penny)",46
18656,"(work, great, work)",46


In [51]:
freq_bi = filtered_bi[:20].bigram.values

In [52]:
freq_tri = filtered_tri[:20].trigram.values

## 2. PMI

In [53]:
bigramFinder.apply_freq_filter(20)

In [54]:
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

In [56]:
bigramPMITable[:30]

Unnamed: 0,bigram,PMI
0,"(raspberry, pi)",15.263497
1,"(hip, hop)",15.059964
2,"(skull, candy)",14.912025
3,"(timely, manner)",14.512726
4,"(wi, fi)",14.502685
5,"(lithium, ion)",13.619641
6,"(v, moda)",13.389028
7,"(blu, ray)",13.366993
8,"(buyer, beware)",13.151668
9,"(double, sided)",12.883375


In [58]:
trigramFinder.apply_freq_filter(20)

In [59]:
trigramPMITable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.pmi)), columns=['trigram','PMI']).sort_values(by='PMI', ascending=False)

In [60]:
trigramPMITable[:30]

Unnamed: 0,trigram,PMI
0,"(micro, sd, card)",23.254938
1,"(blu, ray, player)",22.505667
2,"(worth, every, penny)",21.462817
3,"(bose, soundlink, mini)",21.155696
4,"(home, theater, system)",19.601036
5,"(http, , www)",19.257324
6,"(an, e, book)",19.050849
7,"(3, 5, mm)",18.987777
8,"(exceed, my, expectation)",18.627924
9,"(iphone, 6, plus)",18.480128


In [63]:
pmi_bi = bigramPMITable[:20].bigram.values

In [64]:
pmi_tri = trigramPMITable[:20].trigram.values

## 3. t-test

In [65]:
bigramTtable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.student_t)), columns=['bigram','t']).sort_values(by='t', ascending=False)

In [66]:
bigramTtable.head()

Unnamed: 0,bigram,t
0,"(br, )",85.689633
1,"( , i)",78.877445
2,"(br, )",75.196321
3,"( , br)",72.571283
4,"( , br)",71.79571


In [67]:
filteredT_bi = bigramTtable[bigramTtable.bigram.map(lambda x: rightTypes(x))]

In [68]:
filteredT_bi[:10]

Unnamed: 0,bigram,t
0,"(br, )",85.689633
2,"(br, )",75.196321
3,"( , br)",72.571283
28,"(sound, quality)",37.824835
90,"(great, product)",25.818652
99,"(great, sound)",24.953494
111,"(br, )",24.144028
145,"(work, fine)",21.83281
156,"(battery, life)",21.216578
169,"(good, quality)",20.299661


In [69]:
trigramTtable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.student_t)), columns=['trigram','t']).sort_values(by='t', ascending=False)

In [70]:
trigramTtable.head()

Unnamed: 0,trigram,t
0,"(br, , br)",72.938924
1,"( , br, )",70.587515
2,"( , br, )",69.164568
3,"( , br, )",43.373776
4,"( , 34, )",38.941694


In [71]:
filteredT_tri = trigramTtable[trigramTtable.trigram.map(lambda x: rightTypesTri(x))]

In [72]:
filteredT_tri.head(10)

Unnamed: 0,trigram,t
0,"(br, , br)",72.938924
80,"( , br, )",16.608229
221,"( , br, )",12.402189
373,"(good, sound, quality)",10.520428
609,"(great, sound, quality)",9.089198
1080,"(great, little, speaker)",7.608831
1294,"(micro, sd, card)",7.141428
1322,"(product, work, great)",7.104456
1522,"(worth, every, penny)",6.782328
1600,"(work, great, work)",6.688027


In [90]:
t_bi = filteredT_bi[:20].bigram.values

In [91]:
t_tri = filteredT_tri[:20].trigram.values

## 4. Chi-Square

In [73]:
bigramChiTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.chi_sq)), columns=['bigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [74]:
bigramChiTable.head(20)

Unnamed: 0,bigram,chi-sq
0,"(hip, hop)",1127229.0
1,"(raspberry, pi)",1022683.0
2,"(blu, ray)",1003637.0
3,"(skull, candy)",894044.3
4,"(surge, protector)",696353.5
5,"(timely, manner)",607752.3
6,"(wi, fi)",580326.2
7,"(sd, card)",470824.1
8,"(blue, tooth)",402280.1
9,"(customer, service)",378604.4


In [75]:
trigramChiTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.chi_sq)), columns=['trigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [76]:
trigramChiTable.head(20)

Unnamed: 0,trigram,chi-sq
0,"(micro, sd, card)",510678900.0
1,"(blu, ray, player)",214769400.0
2,"(worth, every, penny)",132956600.0
3,"(3, 5, mm)",77982570.0
4,"(bose, soundlink, mini)",63108110.0
5,"(br, , br)",29603630.0
6,"(http, , www)",22560440.0
7,"(exceed, my, expectation)",17827670.0
8,"(home, theater, system)",17602000.0
9,"(an, e, book)",16305600.0


In [77]:
chi_bi = bigramChiTable[:20].bigram.values

In [78]:
chi_tri = trigramChiTable[:20].trigram.values

## 5. Likelihood

In [79]:
bigramLikTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.likelihood_ratio)), columns=['bigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [80]:
bigramLikTable.head()

Unnamed: 0,bigram,likelihood ratio
0,"(br, )",44562.478071
1,"(br, )",42101.448279
2,"( , br)",40229.892099
3,"(don, t)",23078.896974
4,"( , br)",18647.169116


In [81]:
filteredLik_bi = bigramLikTable[bigramLikTable.bigram.map(lambda x: rightTypes(x))]

In [82]:
filteredLik_bi.head(10)

Unnamed: 0,bigram,likelihood ratio
0,"(br, )",44562.478071
1,"(br, )",42101.448279
2,"( , br)",40229.892099
18,"(sound, quality)",9635.285105
43,"(battery, life)",4793.254238
74,"(ear, bud)",3733.208953
76,"(br, )",3693.450715
90,"(great, product)",3312.378407
100,"(work, fine)",3123.905999
110,"(customer, service)",2976.754797


In [83]:
trigramLikTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.likelihood_ratio)), columns=['trigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [84]:
trigramLikTable.head()

Unnamed: 0,trigram,likelihood ratio
0,"(br, , br)",92247.054718
1,"( , br, )",90471.676319
2,"( , br, )",87667.472912
3,"( , br, )",67904.527773
4,"( , br, )",65552.524632


In [85]:
filteredLik_tri = trigramLikTable[trigramLikTable.trigram.map(lambda x: rightTypesTri(x))]

In [86]:
filteredLik_tri.head(20)

Unnamed: 0,trigram,likelihood ratio
0,"(br, , br)",92247.054718
1,"( , br, )",90471.676319
10,"( , br, )",44629.507339
11,"( , br, )",43978.280525
696,"(great, sound, quality)",12147.760874
738,"(product, work, great)",11271.415116
750,"(good, sound, quality)",11126.084836
761,"(price, work, great)",11040.804026
763,"(work, great, great)",10990.608007
764,"(great, work, great)",10985.382516


In [87]:
lik_bi = filteredLik_bi[:20].bigram.values

In [88]:
lik_tri = filteredLik_tri[:20].trigram.values

## Bigram Comparison

In [92]:
bigramsCompare = pd.DataFrame([freq_bi, pmi_bi, t_bi, chi_bi, lik_bi]).T

In [93]:
bigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likeihood Ratio Test With Filter']

In [94]:
bigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(br, )","(raspberry, pi)","(br, )","(hip, hop)","(br, )"
1,"(br, )","(hip, hop)","(br, )","(raspberry, pi)","(br, )"
2,"( , br)","(skull, candy)","( , br)","(blu, ray)","( , br)"
3,"(sound, quality)","(timely, manner)","(sound, quality)","(skull, candy)","(sound, quality)"
4,"(great, sound)","(wi, fi)","(great, product)","(surge, protector)","(battery, life)"
5,"(great, product)","(lithium, ion)","(great, sound)","(timely, manner)","(ear, bud)"
6,"(br, )","(v, moda)","(br, )","(wi, fi)","(br, )"
7,"(work, fine)","(blu, ray)","(work, fine)","(sd, card)","(great, product)"
8,"(good, sound)","(buyer, beware)","(battery, life)","(blue, tooth)","(work, fine)"
9,"(good, quality)","(double, sided)","(good, quality)","(customer, service)","(customer, service)"


## Trigram Comparison

In [95]:
trigramsCompare = pd.DataFrame([freq_tri, pmi_tri, t_tri, chi_tri, lik_tri]).T

In [96]:
trigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likeihood Ratio Test With Filter']

In [97]:
trigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(br, , br)","(micro, sd, card)","(br, , br)","(micro, sd, card)","(br, , br)"
1,"( , br, )","(blu, ray, player)","( , br, )","(blu, ray, player)","( , br, )"
2,"( , br, )","(worth, every, penny)","( , br, )","(worth, every, penny)","( , br, )"
3,"(good, sound, quality)","(bose, soundlink, mini)","(good, sound, quality)","(3, 5, mm)","( , br, )"
4,"(great, sound, quality)","(home, theater, system)","(great, sound, quality)","(bose, soundlink, mini)","(great, sound, quality)"
5,"(great, little, speaker)","(http, , www)","(great, little, speaker)","(br, , br)","(product, work, great)"
6,"(product, work, great)","(an, e, book)","(micro, sd, card)","(http, , www)","(good, sound, quality)"
7,"(micro, sd, card)","(3, 5, mm)","(product, work, great)","(exceed, my, expectation)","(price, work, great)"
8,"(worth, every, penny)","(exceed, my, expectation)","(worth, every, penny)","(home, theater, system)","(work, great, great)"
9,"(work, great, work)","(iphone, 6, plus)","(work, great, work)","(an, e, book)","(great, work, great)"
