In [90]:
import pandas as pd
import numpy as np
import spacy
import re
import string
import nltk
from nltk.corpus import stopwords


In [91]:
# Read the raw hotel-review export into a DataFrame.
# NOTE(review): the path is an absolute local path; adjust it for your machine.
reviews = pd.read_csv('/mnt/d/hotel-reviews/7282_1.csv')
# Keep only the free-text review column, coerced to str so every entry can be
# handled as text downstream.
# NOTE(review): astype('str') presumably turns missing reviews into the text 'nan' — confirm.
comments = reviews['reviews.text'].astype('str')

In [92]:
reviews.head()

Unnamed: 0,address,categories,city,country,latitude,longitude,name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,Pleasant 10 min walk along the sea front to th...,Good location away from the crouds,,Russ (kent),
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Really lovely hotel. Stayed on the very top fl...,Great hotel with Jacuzzi bath!,,A Traveler,
2,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-05-13T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Ett mycket bra hotell. Det som drog ner betyge...,Lugnt l��ge,,Maud,
3,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-10-27T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,Good location on the Lido.,,Julie,
4,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-03-05T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,������ ���������������,,sungchul,


In [93]:
#function to remove non-ascii characters
def _removeNonAscii(s):
    """Return ``s`` with every character whose code point is 128 or above removed."""
    ascii_chars = [ch for ch in s if ord(ch) < 128]
    return "".join(ascii_chars)

# Strip non-ASCII characters from every review, then preview the first ten.
comments = comments.map(_removeNonAscii)
comments.head(10)

0    Pleasant 10 min walk along the sea front to th...
1    Really lovely hotel. Stayed on the very top fl...
2    Ett mycket bra hotell. Det som drog ner betyge...
3    We stayed here for four nights in October. The...
4    We stayed here for four nights in October. The...
5    We loved staying on the island of Lido! You ne...
6    Lovely view out onto the lagoon. Excellent vie...
7    ottimo soggiorno e ottima sistemazione nei gio...
8    Gnstiger Ausgangspunkt fr Venedig Besuche. Ruh...
9    Lidoen er perfekt til et par dages ro og afsla...
Name: reviews.text, dtype: object

In [94]:
# Map every language available in NLTK's stop-word corpus to the set of its stop words.
STOPWORDS_DICT = {
    language: set(stopwords.words(language))
    for language in stopwords.fileids()
}


In [95]:
#function to detect language based on # of stop words for particular language
def get_language(text):
    """Return True when English is the best stop-word match for ``text``.

    The text is lower-cased and tokenized, and the distinct tokens are compared
    with each language's stop-word set in STOPWORDS_DICT. The language sharing
    the most tokens wins; on a tie the first language in STOPWORDS_DICT's
    iteration order wins. Despite the name, the result is a bool, not a
    language name.
    """
    tokens = set(nltk.wordpunct_tokenize(text.lower()))
    overlap_by_language = [
        (language, len(tokens & vocabulary))
        for language, vocabulary in STOPWORDS_DICT.items()
    ]
    best_language, _ = max(overlap_by_language, key=lambda pair: pair[1])
    return best_language == 'english'


In [96]:
# Keep only the reviews detected as English, then drop exact duplicate texts.
# drop_duplicates() is chained rather than called with inplace=True: the
# boolean-mask selection is a derived object, so mutating it in place can
# trigger pandas copy warnings, and chaining keeps the cell safe to re-run.
eng_comments = comments[comments.apply(get_language)].drop_duplicates()



In [97]:
#load spacy
# NOTE(review): 'en' is a shortcut-link model name. spaCy v3 dropped shortcut
# links, so newer installs presumably need the full package name
# (e.g. 'en_core_web_sm') — confirm against the installed spaCy version.
nlp = spacy.load('en')

#function to clean and lemmatize comments
def clean_comments(text):
    """Replace punctuation and line breaks with spaces, then lemmatize with spaCy.

    ``text`` is converted with ``str()`` first. Returns a list containing the
    ``lemma_`` string of every token spaCy produces for the cleaned text.
    """
    # Character class covering every ASCII punctuation mark plus CR, tab and LF.
    strip_pattern = '[' + re.escape(string.punctuation) + '\\r\\t\\n]'
    spaced = re.sub(strip_pattern, " ", str(text))
    # Only lemmas are read from the result, so the parser and NER are disabled.
    return [tok.lemma_ for tok in nlp(spaced, disable=['parser','ner'])]

#apply function to clean and lemmatize comments
lemmatized = eng_comments.map(clean_comments)


In [98]:
# Normalise case: lower-case every lemma of every review.
lemmatized = lemmatized.map(lambda tokens: [token.lower() for token in tokens])

# Flatten the per-review token lists into one corpus-wide token list.
unlist_comments = []
for tokens in lemmatized:
    unlist_comments.extend(tokens)

In [99]:
unlist_comments[:10]

['pleasant', '10', 'min', 'walk', 'along', 'the', 'sea', 'front', 'to', 'the']


### Initialize NLTK's Bigrams/Trigrams Finder


In [100]:
# Association-measure objects; their methods (pmi, student_t, chi_sq,
# likelihood_ratio) are passed to score_ngrams() in the sections below.
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()
# Collocation finders built from the single flattened token list.
# NOTE(review): because unlist_comments concatenates all reviews, n-grams that
# straddle the boundary between two reviews are presumably counted too — confirm
# whether a per-document construction is wanted instead.
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_comments)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(unlist_comments)

### 1. Counting Frequencies of Adjacent Words
    . Main idea: simply order by frequency
    . Issues: too sensitive to very frequent pairs and pronouns/articles/prepositions come up often
    . Solution: filter for only adjectives and nouns

In [101]:
#bigrams: count every adjacent pair and rank by raw frequency
bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = (
    pd.DataFrame(list(bigram_freq), columns=['bigram','freq'])
    .sort_values(by='freq', ascending=False)
)

#trigrams: same ranking for adjacent triples
trigram_freq = trigramFinder.ngram_fd.items()
trigramFreqTable = (
    pd.DataFrame(list(trigram_freq), columns=['trigram','freq'])
    .sort_values(by='freq', ascending=False)
)

# Only the last expression of a cell is rendered, so the cell ends with a single
# preview; a bigram preview placed mid-cell would be computed and then discarded.
trigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,trigram,freq
0,"( , -pron-, be)",6264
1,"(the, room, be)",4412
2,"( , the, room)",3350
3,"( , -pron-, have)",2684
4,"(the, staff, be)",2641


In [102]:
trigramFreqTable[:10]

Unnamed: 0,trigram,freq
2577,"( , -pron-, be)",6264
114,"(the, room, be)",4412
113,"( , the, room)",3350
1005,"( , -pron-, have)",2684
1454,"(the, staff, be)",2641
682,"(the, hotel, be)",2323
2600,"( , there, be)",2181
666,"( , the, staff)",1928
266,"(-pron-, have, a)",1835
1320,"(the, front, desk)",1826


In [103]:
bigramFreqTable[:10]

Unnamed: 0,bigram,freq
152,"( , -pron-)",28938
93,"( , the)",21918
58,"(-pron-, be)",18345
108,"(the, room)",8899
238,"(-pron-, have)",8379
109,"(room, be)",8301
323,"(in, the)",8149
190,"(be, very)",7708
248,"(be, a)",7263
237,"(and, -pron-)",7000


However, the most frequent n-grams are dominated by whitespace tokens, stop words, articles, prepositions and pronouns, which are common but not meaningful.

To fix this, we keep only collocations that contain no stop words and that match one of the following structures:
        
            Bigrams: (Noun, Noun), (Adjective, Noun)
            
            Trigrams: (Adjective/Noun, Anything, Adjective/Noun)

In [104]:
#get english stopwords
# Stored as a set; the n-gram filter functions test each word for membership in it.
en_stopwords = set(stopwords.words('english'))

#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    """Return True when ``ngram`` is an (adjective-or-noun, noun) bigram.

    An n-gram is rejected when any of its words is:
      * the '-pron-' placeholder lemma,
      * the stray 't' token (e.g. what remains of "didn't" once the
        apostrophe has been replaced by a space),
      * empty or made up only of whitespace (any length, not just one space),
      * an English stop word.
    The remaining n-grams are POS-tagged with ``nltk.pos_tag``; the first word
    must carry an adjective or noun tag and the second word a noun tag.
    """
    # Cheap token-level rejections first. `not word.strip()` covers '' as well
    # as whitespace runs of any length, matching what rightTypesTri filters.
    for word in ngram:
        if word == '-pron-' or word == 't' or not word.strip():
            return False
    for word in ngram:
        if word in en_stopwords:
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    return tags[0][1] in acceptable_types and tags[1][1] in second_type

In [105]:
# Keep only the frequency-ranked bigrams that pass the ADJ/NN structure filter.
filtered_bi = bigramFreqTable[bigramFreqTable['bigram'].map(rightTypes)]

In [106]:
#function to filter for trigrams
def rightTypesTri(ngram):
    """Return True when the first and third words of ``ngram`` are adjectives or nouns.

    An n-gram is rejected when any of its words is the '-pron-' placeholder
    lemma, the stray 't' token, empty or whitespace-only, or an English stop
    word. The remaining n-grams are POS-tagged with ``nltk.pos_tag``; the
    middle word may carry any tag.
    """
    # `not word.strip()` rejects '' in addition to whitespace-only tokens, so
    # this filter treats blank tokens the same way rightTypes does.
    for word in ngram:
        if word == '-pron-' or word == 't' or not word.strip():
            return False
    for word in ngram:
        if word in en_stopwords:
            return False
    # The same adjective/noun tag set applies to both outer positions.
    outer_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    return tags[0][1] in outer_types and tags[2][1] in outer_types

    #filter trigrams
filtered_tri = trigramFreqTable[trigramFreqTable['trigram'].map(rightTypesTri)]

In [107]:
filtered_bi[:10]

Unnamed: 0,bigram,freq
1087,"(front, desk)",2673
73,"(great, location)",797
270,"(friendly, staff)",775
5163,"(hot, tub)",634
4735,"(clean, room)",625
95,"(hotel, staff)",539
3099,"(continental, breakfast)",531
266,"(nice, hotel)",529
4081,"(free, breakfast)",522
1908,"(great, place)",514


In [108]:
filtered_tri[:10]

Unnamed: 0,trigram,freq
3376,"(front, desk, staff)",384
24561,"(non, smoking, room)",231
34545,"(holiday, inn, express)",138
10221,"(front, desk, clerk)",122
12305,"(flat, screen, tv)",79
28495,"(smell, like, smoke)",73
141472,"(old, town, alexandria)",69
18387,"(front, desk, person)",65
6089,"(free, wi, fi)",62
56392,"(great, customer, service)",54


In [109]:
# Top 20 filtered bigrams by raw frequency, as an array of tuples.
freq_bi = filtered_bi['bigram'].head(20).values
freq_bi


array([('front', 'desk'), ('great', 'location'), ('friendly', 'staff'),
       ('hot', 'tub'), ('clean', 'room'), ('hotel', 'staff'),
       ('continental', 'breakfast'), ('nice', 'hotel'),
       ('free', 'breakfast'), ('great', 'place'), ('walk', 'distance'),
       ('desk', 'staff'), ('parking', 'lot'), ('customer', 'service'),
       ('comfortable', 'bed'), ('next', 'door'), ('smoking', 'room'),
       ('pool', 'area'), ('good', 'location'), ('great', 'hotel')],
      dtype=object)

In [110]:
# Top 20 filtered trigrams by raw frequency, as an array of tuples.
freq_tri = filtered_tri['trigram'].head(20).values
freq_tri


array([('front', 'desk', 'staff'), ('non', 'smoking', 'room'),
       ('holiday', 'inn', 'express'), ('front', 'desk', 'clerk'),
       ('flat', 'screen', 'tv'), ('smell', 'like', 'smoke'),
       ('old', 'town', 'alexandria'), ('front', 'desk', 'person'),
       ('free', 'wi', 'fi'), ('great', 'customer', 'service'),
       ('good', 'night', 'sleep'), ('front', 'desk', 'people'),
       ('red', 'roof', 'inn'), ('easy', 'walking', 'distance'),
       ('first', 'time', 'stay'), ('second', 'time', 'stay'),
       ('elk', 'springs', 'resort'), ('restaurant', 'within', 'walk'),
       ('call', 'front', 'desk'), ('pet', 'friendly', 'hotel')],
      dtype=object)

## 2. Pointwise Mutual Information

The main intuition is that PMI measures how much more likely the words are to co-occur than if they were independent. However, it is very sensitive to rare combinations of words. For example, if a random bigram ‘abc xyz’ appears, and neither ‘abc’ nor ‘xyz’ appears anywhere else in the text, ‘abc xyz’ will be identified as a highly significant bigram when it could just be a random misspelling or a phrase too rare to generalize as a bigram. Therefore, this method is often used together with a frequency filter.

In [111]:
# Keep only n-grams that occur at least MIN_NGRAM_FREQ times
# (apply_freq_filter removes candidates whose frequency is below the threshold).
# NOTE: the filter mutates the finders in place, so every score computed from
# bigramFinder / trigramFinder after this cell only sees the surviving n-grams.
MIN_NGRAM_FREQ = 20
bigramFinder.apply_freq_filter(MIN_NGRAM_FREQ)
trigramFinder.apply_freq_filter(MIN_NGRAM_FREQ)

# Score the surviving n-grams by pointwise mutual information, highest first.
bigramPMITable = (
    pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI'])
    .sort_values(by='PMI', ascending=False)
)
trigramPMITable = (
    pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.pmi)), columns=['trigram','PMI'])
    .sort_values(by='PMI', ascending=False)
)

In [112]:
bigramPMITable[:10]

Unnamed: 0,bigram,PMI
0,"(universal, studios)",15.201158
1,"(howard, johnson)",14.954654
2,"(cracker, barrel)",14.811135
3,"(santa, barbara)",14.521901
4,"(sub, par)",14.088264
5,"(santana, row)",14.001433
6,"(ironing, board)",13.824037
7,"(e, g)",13.687617
8,"(elk, springs)",13.31005
9,"(times, square)",13.184889


In [113]:
trigramPMITable[:10]

Unnamed: 0,trigram,PMI
0,"(elk, springs, resort)",23.964671
1,"(zion, national, park)",23.151439
2,"(flat, screen, tv)",22.605577
3,"(hard, boil, egg)",22.018144
4,"(holiday, inn, express)",21.65115
5,"(red, roof, inn)",21.450096
6,"(simpson, house, inn)",20.80283
7,"(free, wi, fi)",20.634088
8,"(slide, glass, door)",20.361876
9,"(air, conditioning, unit)",19.768001


In [114]:
# Top 20 bigrams by PMI (no structure filter applied), as an array of tuples.
pmi_bi = bigramPMITable['bigram'].head(20).values
pmi_bi


array([('universal', 'studios'), ('howard', 'johnson'),
       ('cracker', 'barrel'), ('santa', 'barbara'), ('sub', 'par'),
       ('santana', 'row'), ('ironing', 'board'), ('e', 'g'),
       ('elk', 'springs'), ('times', 'square'), ('ear', 'plug'),
       ('la', 'quinta'), ('fire', 'pit'), ('san', 'francisco'),
       ('san', 'pedro'), ('san', 'clemente'), ('san', 'diego'),
       ('french', 'quarter'), ('wi', 'fi'), ('colorado', 'springs')],
      dtype=object)

In [115]:
# Top 20 trigrams by PMI (no structure filter applied), as an array of tuples.
pmi_tri = trigramPMITable['trigram'].head(20).values
pmi_tri


array([('elk', 'springs', 'resort'), ('zion', 'national', 'park'),
       ('flat', 'screen', 'tv'), ('hard', 'boil', 'egg'),
       ('holiday', 'inn', 'express'), ('red', 'roof', 'inn'),
       ('simpson', 'house', 'inn'), ('free', 'wi', 'fi'),
       ('slide', 'glass', 'door'), ('air', 'conditioning', 'unit'),
       ('within', 'walking', 'distance'), ('within', 'walk', 'distance'),
       ('pleasantly', 'surprised', 'by'), ('fire', 'alarm', 'go'),
       ('hilton', 'garden', 'inn'), ('pull', 'out', 'couch'),
       ('easy', 'walking', 'distance'),
       ('conveniently', 'locate', 'near'), ('of', 'times', 'square'),
       ('old', 'town', 'alexandria')], dtype=object)

### 3. t-test

In [116]:
# Score every n-gram still held by the finders with Student's t, highest first.
bigramTtable = (
    pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.student_t)), columns=['bigram','t'])
    .sort_values(by='t', ascending=False)
)
trigramTtable = (
    pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.student_t)), columns=['trigram','t'])
    .sort_values(by='t', ascending=False)
)


In [117]:
bigramTtable.head()

Unnamed: 0,bigram,t
0,"( , -pron-)",112.199471
1,"( , the)",92.17071
2,"(-pron-, be)",89.878098
3,"(the, room)",79.387235
4,"(be, very)",78.010205


In [118]:
trigramTtable.head()

Unnamed: 0,trigram,t
0,"( , -pron-, be)",72.497451
1,"(the, room, be)",65.289115
2,"( , the, room)",55.802215
3,"(the, staff, be)",50.822793
4,"( , -pron-, have)",49.544729


In [119]:

# Apply the adjective/noun structure filters to the t-scored tables.
filteredT_bi = bigramTtable[bigramTtable['bigram'].map(rightTypes)]
filteredT_tri = trigramTtable[trigramTtable['trigram'].map(rightTypesTri)]

In [120]:
#Results are similar to the frequency count technique from 1.

In [123]:
filteredT_bi.head()

Unnamed: 0,bigram,t
21,"(front, desk)",51.566725
108,"(great, location)",27.429738
118,"(friendly, staff)",26.735369
136,"(hot, tub)",25.13237
161,"(continental, breakfast)",22.919996


In [124]:
filteredT_tri.head()

Unnamed: 0,trigram,t
143,"(front, desk, staff)",19.593922
313,"(non, smoking, room)",15.198511
615,"(holiday, inn, express)",11.747337
726,"(front, desk, clerk)",11.045156
1249,"(flat, screen, tv)",8.888193


In [125]:
# Top 20 filtered bigrams by t-score, as an array of tuples.
t_bi = filteredT_bi['bigram'].head(20).values
t_bi

array([('front', 'desk'), ('great', 'location'), ('friendly', 'staff'),
       ('hot', 'tub'), ('continental', 'breakfast'),
       ('free', 'breakfast'), ('walk', 'distance'), ('great', 'place'),
       ('parking', 'lot'), ('customer', 'service'), ('desk', 'staff'),
       ('comfortable', 'bed'), ('nice', 'hotel'), ('clean', 'room'),
       ('next', 'door'), ('hotel', 'staff'), ('smoking', 'room'),
       ('pool', 'area'), ('desk', 'clerk'), ('good', 'location')],
      dtype=object)

In [126]:
# Top 20 filtered trigrams by t-score, as an array of tuples.
t_tri = filteredT_tri['trigram'].head(20).values
t_tri

array([('front', 'desk', 'staff'), ('non', 'smoking', 'room'),
       ('holiday', 'inn', 'express'), ('front', 'desk', 'clerk'),
       ('flat', 'screen', 'tv'), ('smell', 'like', 'smoke'),
       ('old', 'town', 'alexandria'), ('front', 'desk', 'person'),
       ('free', 'wi', 'fi'), ('great', 'customer', 'service'),
       ('good', 'night', 'sleep'), ('red', 'roof', 'inn'),
       ('front', 'desk', 'people'), ('easy', 'walking', 'distance'),
       ('first', 'time', 'stay'), ('elk', 'springs', 'resort'),
       ('second', 'time', 'stay'), ('restaurant', 'within', 'walk'),
       ('call', 'front', 'desk'), ('pet', 'friendly', 'hotel')],
      dtype=object)

The t-test has been criticized because it assumes the data are normally distributed. Therefore, we will also look at the chi-square test.

### 4. Chi-square test

The null hypothesis of the chi-square test is that the words are independent, just as in the t-test.

In [127]:
# Score every bigram still held by the finder with the chi-square statistic, highest first.
bigramChiTable = (
    pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.chi_sq)), columns=['bigram','chi-sq'])
    .sort_values(by='chi-sq', ascending=False)
)

In [128]:
bigramChiTable.head()

Unnamed: 0,bigram,chi-sq
0,"(wi, fi)",1651669.0
1,"(cracker, barrel)",1322360.0
2,"(howard, johnson)",1206642.0
3,"(la, quinta)",1075724.0
4,"(front, desk)",1027472.0


In [130]:
# Score every trigram still held by the finder with the chi-square statistic, highest first.
trigramChiTable = (
    pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.chi_sq)), columns=['trigram','chi-sq'])
    .sort_values(by='chi-sq', ascending=False)
)

trigramChiTable.head(10)

Unnamed: 0,trigram,chi-sq
0,"(elk, springs, resort)",638565200.0
1,"(flat, screen, tv)",504262300.0
2,"(holiday, inn, express)",454542600.0
3,"(zion, national, park)",288854900.0
4,"(within, walk, distance)",187048400.0
5,"(red, roof, inn)",134803100.0
6,"(free, wi, fi)",101769000.0
7,"(hard, boil, egg)",93468060.0
8,"(simpson, house, inn)",45745010.0
9,"(slide, glass, door)",30993590.0


In [131]:
# Top 20 bigrams by chi-square (no structure filter applied), as an array of tuples.
chi_bi = bigramChiTable['bigram'].head(20).values
chi_bi

array([('wi', 'fi'), ('cracker', 'barrel'), ('howard', 'johnson'),
       ('la', 'quinta'), ('front', 'desk'), ('universal', 'studios'),
       ('santa', 'barbara'), ('santana', 'row'),
       ('pleasantly', 'surprised'), ('   ', 'more'), ('flat', 'screen'),
       ('non', 'smoking'), ('french', 'quarter'), ('elk', 'springs'),
       ('didn', 't'), ('red', 'roof'), ('times', 'square'),
       ('ironing', 'board'), ('air', 'conditioning'),
       ('walking', 'distance')], dtype=object)

In [134]:
# Top 20 trigrams by chi-square (no structure filter applied), as an array of tuples.
chi_tri = trigramChiTable['trigram'].head(20).values
chi_tri

array([('elk', 'springs', 'resort'), ('flat', 'screen', 'tv'),
       ('holiday', 'inn', 'express'), ('zion', 'national', 'park'),
       ('within', 'walk', 'distance'), ('red', 'roof', 'inn'),
       ('free', 'wi', 'fi'), ('hard', 'boil', 'egg'),
       ('simpson', 'house', 'inn'), ('slide', 'glass', 'door'),
       ('within', 'walking', 'distance'), ('old', 'town', 'alexandria'),
       ('non', 'smoking', 'room'), ('pleasantly', 'surprised', 'by'),
       ('easy', 'walking', 'distance'), ('air', 'conditioning', 'unit'),
       ('pull', 'out', 'couch'), ('king', 'size', 'bed'),
       ('fire', 'alarm', 'go'), ('biscuit', 'and', 'gravy')], dtype=object)

### 5. Likelihood

In [135]:
# Score every bigram still held by the finder with the likelihood ratio, highest first.
bigramLikTable = (
    pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.likelihood_ratio)), columns=['bigram','likelihood ratio'])
    .sort_values(by='likelihood ratio', ascending=False)
)

bigramLikTable.head()

Unnamed: 0,bigram,likelihood ratio
0,"( , more)",48445.57347
1,"(front, desk)",32450.306703
2,"( , -pron-)",30840.928186
3,"(didn, t)",26671.722928
4,"(be, very)",24360.903067


In [136]:
# Apply the adjective/noun bigram filter to the likelihood-ratio ranking.
filteredLik_bi = bigramLikTable[bigramLikTable['bigram'].map(rightTypes)]

filteredLik_bi.head(10)

Unnamed: 0,bigram,likelihood ratio
1,"(front, desk)",32450.306703
52,"(hot, tub)",7391.537626
74,"(continental, breakfast)",5282.846246
75,"(walk, distance)",5279.258783
94,"(customer, service)",4510.025897
100,"(wi, fi)",4395.341231
101,"(great, location)",4347.051497
112,"(parking, lot)",3979.719987
123,"(friendly, staff)",3679.985965
124,"(air, conditioner)",3656.038455


In [137]:
# Score every trigram still held by the finder with the likelihood ratio, highest first.
trigramLikTable = (
    pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.likelihood_ratio)), columns=['trigram','likelihood ratio'])
    .sort_values(by='likelihood ratio', ascending=False)
)

trigramLikTable.head()

Unnamed: 0,trigram,likelihood ratio
0,"(the, room, be)",94863.997671
1,"( , -pron-, be)",85466.232801
2,"(the, staff, be)",77612.789481
3,"( , more, -pron-)",77201.272685
4,"( , -pron-, have)",76047.322288


In [138]:
# Apply the adjective/noun trigram filter to the likelihood-ratio ranking.
filteredLik_tri = trigramLikTable[trigramLikTable['trigram'].map(rightTypesTri)]

filteredLik_tri.head(20)

Unnamed: 0,trigram,likelihood ratio
81,"(front, desk, clerk)",53383.310531
91,"(front, desk, staff)",51749.978683
113,"(front, desk, person)",49644.344093
121,"(front, desk, attendant)",49190.763451
125,"(front, desk, personnel)",49099.594095
128,"(call, front, desk)",49002.184649
129,"(front, desk, people)",48985.372694
130,"(front, desk, guy)",48969.748516
131,"(front, desk, lady)",48957.497454
155,"(hotel, front, desk)",48685.258845


In [140]:
# Top 20 filtered bigrams by likelihood ratio, as an array of tuples.
lik_bi = filteredLik_bi['bigram'].head(20).values
lik_bi

array([('front', 'desk'), ('hot', 'tub'), ('continental', 'breakfast'),
       ('walk', 'distance'), ('customer', 'service'), ('wi', 'fi'),
       ('great', 'location'), ('parking', 'lot'), ('friendly', 'staff'),
       ('air', 'conditioner'), ('holiday', 'inn'),
       ('air', 'conditioning'), ('free', 'breakfast'), ('next', 'door'),
       ('desk', 'clerk'), ('easy', 'access'), ('hampton', 'inn'),
       ('la', 'quinta'), ('coffee', 'maker'), ('smoking', 'room')],
      dtype=object)

In [141]:
# Top 20 filtered trigrams by likelihood ratio, as an array of tuples.
lik_tri = filteredLik_tri['trigram'].head(20).values
lik_tri

array([('front', 'desk', 'clerk'), ('front', 'desk', 'staff'),
       ('front', 'desk', 'person'), ('front', 'desk', 'attendant'),
       ('front', 'desk', 'personnel'), ('call', 'front', 'desk'),
       ('front', 'desk', 'people'), ('front', 'desk', 'guy'),
       ('front', 'desk', 'lady'), ('hotel', 'front', 'desk'),
       ('pool', 'hot', 'tub'), ('non', 'smoking', 'room'),
       ('holiday', 'inn', 'express'),
       ('free', 'continental', 'breakfast'), ('free', 'wi', 'fi'),
       ('great', 'customer', 'service'),
       ('excellent', 'customer', 'service'),
       ('good', 'customer', 'service'), ('super', 'friendly', 'staff'),
       ('easy', 'walking', 'distance')], dtype=object)

### Bigram Comparison

In [142]:
# Side-by-side comparison: one column per ranking method, one row per rank.
# Each input is the top-20 array produced by the corresponding method above.
bigramsCompare = pd.DataFrame([freq_bi, pmi_bi, t_bi, chi_bi, lik_bi]).T

# Column labels follow the order of the arrays passed above
# ("Likelihood" spelling corrected in the last label).
bigramsCompare.columns = [
    'Frequency With Filter',
    'PMI',
    'T-test With Filter',
    'Chi-Sq Test',
    'Likelihood Ratio Test With Filter',
]

bigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(front, desk)","(universal, studios)","(front, desk)","(wi, fi)","(front, desk)"
1,"(great, location)","(howard, johnson)","(great, location)","(cracker, barrel)","(hot, tub)"
2,"(friendly, staff)","(cracker, barrel)","(friendly, staff)","(howard, johnson)","(continental, breakfast)"
3,"(hot, tub)","(santa, barbara)","(hot, tub)","(la, quinta)","(walk, distance)"
4,"(clean, room)","(sub, par)","(continental, breakfast)","(front, desk)","(customer, service)"
5,"(hotel, staff)","(santana, row)","(free, breakfast)","(universal, studios)","(wi, fi)"
6,"(continental, breakfast)","(ironing, board)","(walk, distance)","(santa, barbara)","(great, location)"
7,"(nice, hotel)","(e, g)","(great, place)","(santana, row)","(parking, lot)"
8,"(free, breakfast)","(elk, springs)","(parking, lot)","(pleasantly, surprised)","(friendly, staff)"
9,"(great, place)","(times, square)","(customer, service)","( , more)","(air, conditioner)"


### Trigram Comparison


In [143]:
# Side-by-side comparison: one column per ranking method, one row per rank.
# Each input is the top-20 array produced by the corresponding method above.
trigramsCompare = pd.DataFrame([freq_tri, pmi_tri, t_tri, chi_tri, lik_tri]).T

# Column labels follow the order of the arrays passed above
# ("Likelihood" spelling corrected in the last label).
trigramsCompare.columns = [
    'Frequency With Filter',
    'PMI',
    'T-test With Filter',
    'Chi-Sq Test',
    'Likelihood Ratio Test With Filter',
]

trigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(front, desk, staff)","(elk, springs, resort)","(front, desk, staff)","(elk, springs, resort)","(front, desk, clerk)"
1,"(non, smoking, room)","(zion, national, park)","(non, smoking, room)","(flat, screen, tv)","(front, desk, staff)"
2,"(holiday, inn, express)","(flat, screen, tv)","(holiday, inn, express)","(holiday, inn, express)","(front, desk, person)"
3,"(front, desk, clerk)","(hard, boil, egg)","(front, desk, clerk)","(zion, national, park)","(front, desk, attendant)"
4,"(flat, screen, tv)","(holiday, inn, express)","(flat, screen, tv)","(within, walk, distance)","(front, desk, personnel)"
5,"(smell, like, smoke)","(red, roof, inn)","(smell, like, smoke)","(red, roof, inn)","(call, front, desk)"
6,"(old, town, alexandria)","(simpson, house, inn)","(old, town, alexandria)","(free, wi, fi)","(front, desk, people)"
7,"(front, desk, person)","(free, wi, fi)","(front, desk, person)","(hard, boil, egg)","(front, desk, guy)"
8,"(free, wi, fi)","(slide, glass, door)","(free, wi, fi)","(simpson, house, inn)","(front, desk, lady)"
9,"(great, customer, service)","(air, conditioning, unit)","(great, customer, service)","(slide, glass, door)","(hotel, front, desk)"


We can see that the PMI and chi-square methods give pretty good results even without applying filters, and their results are quite similar to each other. The frequency and t-test methods are also similar to each other. In real applications, we can eyeball the list and set a threshold at the point where the list stops making sense. We can also run different tests to see which list makes the most sense for a given dataset. Alternatively, we can combine the results from multiple lists. Personally, I find it effective to multiply PMI by frequency, which takes into account both the probability lift and the frequency of occurrence.