## In this notebook

We implement pointwise mutual information based n-grams extraction to pick out key terms and phrases from reviews

**TODO**: We need more data to test this out correctly

In [1]:
import pandas as pd

In [3]:
import nltk

### load the data

In [8]:
df = pd.read_excel("../data/test_data.xlsx", index_col=0)

In [9]:
df.head()

Unnamed: 0,title,review,review_date,will_recommend
0,Simply the best,Outstanding picture color and brightness. I ch...,"April 8, 2022",Yes
1,"65"" Bravia XR A90J Smart TV & JBL 501 Soundbar","Excellent installation job of my 65"" Sony XR A...","October 30, 2021",Yes
2,The Best of All,Best color of any that TV I have see. Even the...,"December 12, 2021",Yes
3,"A JAW DROPPING, STUNNING MASTERPIECE!",I've owned several oleds from LG and Sony and ...,"May 11, 2021",Yes
4,Absolutely AMAZING!,Ive been an Oled fan since they became availab...,"April 22, 2022",Yes


In [10]:
df['contents'] = df['title'] + '.\n\n' + df['review']

In [12]:
df.head()

Unnamed: 0,title,review,review_date,will_recommend,contents
0,Simply the best,Outstanding picture color and brightness. I ch...,"April 8, 2022",Yes,Simply the best.\n\nOutstanding picture color ...
1,"65"" Bravia XR A90J Smart TV & JBL 501 Soundbar","Excellent installation job of my 65"" Sony XR A...","October 30, 2021",Yes,"65"" Bravia XR A90J Smart TV & JBL 501 Soundbar..."
2,The Best of All,Best color of any that TV I have see. Even the...,"December 12, 2021",Yes,The Best of All.\n\nBest color of any that TV ...
3,"A JAW DROPPING, STUNNING MASTERPIECE!",I've owned several oleds from LG and Sony and ...,"May 11, 2021",Yes,"A JAW DROPPING, STUNNING MASTERPIECE!.\n\nI've..."
4,Absolutely AMAZING!,Ive been an Oled fan since they became availab...,"April 22, 2022",Yes,Absolutely AMAZING!.\n\nIve been an Oled fan s...


### tokenization

In [18]:
import spacy

In [19]:
nlp = spacy.load('en_core_web_sm')

In [57]:
df['review_tokens'] = df['contents'].apply(lambda x: [t.text.lower() for t in nlp(x)])

In [58]:
df['review_tokens'].isna().sum()

0

### Bigram collocations for PMI

In [59]:
from nltk import BigramCollocationFinder, TrigramCollocationFinder
from nltk.collocations import BigramAssocMeasures, TrigramAssocMeasures

In [60]:
corpus = df['review_tokens'].sum()

In [61]:
len(corpus)

35811

In [92]:
bigram_measures = BigramAssocMeasures()
bigrams = BigramCollocationFinder.from_words(corpus)
scored_bigrams = bigrams.score_ngrams(bigram_measures.mi_like)

In [93]:
trigram_measures = TrigramAssocMeasures()
trigrams = TrigramCollocationFinder.from_words(corpus)
scored_trigrams = trigrams.score_ngrams(trigram_measures.mi_like)

In [94]:
ngram_df = pd.DataFrame({
    'phrase': [' '.join(p) for p, score in (scored_bigrams + scored_trigrams)],
    'score': [score for p, score in (scored_bigrams + scored_trigrams)]
})

In [95]:
ngram_df

Unnamed: 0,phrase,score
0,. \n\n,5.239973e+01
1,4 k,2.982353e+01
2,.,2.456458e+01
3,dolby vision,1.581028e+01
4,picture quality,1.331303e+01
...,...,...
47821,to . the,4.064620e-10
47822,. is the,3.531979e-10
47823,. and the,3.165491e-10
47824,. the the,1.555258e-10


### filtering ngrams with stopwords and punctuations

remove ngrams with any punctuations

trying to filter:
- ngrams with any stopwords
- ngrams with >1 stopwords 

and then comparing results

In [96]:
def is_alpha_numeric(doc):
    # checking if every token in the doc is not a string of punctuations
    if all([t.isalnum() for t in doc.split()]):
        return True
    return False
    

In [97]:
ngram_no_punct_df = ngram_df[ngram_df['phrase'].apply(lambda p: is_alpha_numeric(p))]

In [98]:
ngram_no_punct_df.shape

(34750, 2)

In [99]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [100]:
def contains_stopword(doc):
    if any([t in spacy_stopwords for t in doc.split()]):
        return True
    return False

def contains_more_than_n_stopwords(doc, n=1):
    if sum([t in spacy_stopwords for t in doc.split()]) > n:
        return True
    return False

In [103]:
ngram_no_punct_df[ngram_no_punct_df['phrase'].apply(lambda p: not contains_stopword(p))].head(25)

Unnamed: 0,phrase,score
1,4 k,29.823529
3,dolby vision,15.810277
4,picture quality,13.313034
8,itv hub,7.363636
9,master series,7.062069
10,sound bar,6.828387
11,geek squad,6.0
15,bbc iplayer,4.524887
19,best buy,2.723178
25,weeks ago,2.414286


In [104]:
ngram_no_punct_df[ngram_no_punct_df['phrase'].apply(lambda p: not contains_more_than_n_stopwords(p))].head(25)

Unnamed: 0,phrase,score
1,4 k,29.823529
3,dolby vision,15.810277
4,picture quality,13.313034
5,this tv,12.732903
8,itv hub,7.363636
9,master series,7.062069
10,sound bar,6.828387
11,geek squad,6.0
14,the best,4.942757
15,bbc iplayer,4.524887


In [105]:
import plotly.express as px

In [110]:
px.histogram(ngram_no_punct_df[ngram_no_punct_df['phrase'].apply(lambda p: not contains_stopword(p))], 'score')