## **3.2 Extract Noun-Adj Pairs**

### **Download necessary libraries**

In [None]:
!python -m spacy download en_core_web_lg
!python -m spacy download en

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


### **Import necessary libraries**

In [None]:
# imports
import re
import random
import numpy as np
import pandas as pd
# None (instead of the deprecated -1) tells pandas not to truncate cell contents.
pd.set_option('display.max_colwidth', None)
# Allow up to 1000 rows to be rendered before pandas truncates the display.
pd.set_option('display.max_rows', 1000)

# spacy
import spacy
from spacy.lang.en import English # updated
from spacy import displacy

import en_core_web_lg
nlp = en_core_web_lg.load()

# nltk
import nltk
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')

# tf-idf
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

from collections import Counter



Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### **Visualisation functions**

We use the function below to visualise the dependency relations within the given example sentences.

In [None]:
def display_dependency_relation(input_string, token_index=2):
  """Parse a sentence, print one token's dependency label and draw the parse.

  Args:
      input_string: Text passed to the notebook-level ``nlp`` pipeline.
      token_index: Position of the token whose dependency label is printed.
          Defaults to 2 (the third token), which is what the function
          always printed before this parameter existed.
  """
  doc = nlp(input_string)
  # Guard the lookup so that inputs with too few tokens are still rendered
  # instead of failing with IndexError before the diagram is drawn.
  if -len(doc) <= token_index < len(doc):
    print(doc[token_index].dep_)
  displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

In [None]:
# example 1: a three-word phrase; the printed label belongs to its third token
display_dependency_relation("Very pretty flower")

ROOT


In [None]:
# example 2: a full sentence with a repeated modifier ("so so") before the noun
display_dependency_relation("It was a so so dish")

det


In [None]:
# example 3: the subject noun is followed by a prepositional phrase ("in the soup")
display_dependency_relation("The beef in the soup was not right")

prep


In [None]:
# example 4: an adjective in front of a two-word noun ("customer service")
display_dependency_relation("This restaurant has horrible customer service")

ROOT


# **Functions**




## 1. Sample data according to star rating

In [None]:
from random import sample

# Seeds Python's `random` module only; the pandas draws below are seeded
# through the `random_state` argument of sample_data instead.
random.seed(42)

def sample_data(stars, sample_size, reviews=None, random_state=42):
    """Draw a repeatable random sample of reviews with a given star rating.

    One review is picked at random per business, then ``sample_size`` of
    those reviews are drawn.

    Args:
        stars: Star rating to keep (compared for equality with ``stars``).
        sample_size: Number of reviews to return.
        reviews: DataFrame with ``stars`` and ``business_id`` columns.
            Defaults to the notebook-level ``data`` frame.
        random_state: Seed for all draws. ``DataFrame.sample`` does not use
            Python's ``random`` module, so ``random.seed`` alone does not
            make the result repeatable. Pass ``None`` for a fresh draw.

    Returns:
        DataFrame holding the sampled reviews (one per business at most).

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when fewer than
            ``sample_size`` businesses have a review with this rating.
    """
    print("Sampling {} reviews ".format(sample_size))

    if reviews is None:
        reviews = data

    # A single generator shared by every draw, so the per-business draws do
    # not all restart from the same seed.
    rng = np.random.RandomState(random_state)

    rating_data = reviews[reviews['stars'] == stars]

    ## take one random review from each business
    one_per_business = rating_data.groupby('business_id').apply(
        lambda group: group.sample(1, random_state=rng)
    )

    ## keep `sample_size` of the per-business reviews
    return one_per_business.sample(n=sample_size, random_state=rng)

## 2. Split reviews into segments and sentences
To better handle the large volume of data and the long reviews, we split each review into manageable sentences and segments.

In [None]:
def split_reviews(df):
    """Split every review into sentences and punctuation-delimited segments.

    Args:
        df: DataFrame with a ``stars`` and a ``text`` column, one review per row.

    Returns:
        DataFrame with one row per non-empty segment and the columns
        ``review_idx`` (index label of the source row), ``rating``,
        ``review``, ``sentence`` and ``segment``. When no segment is found
        the frame is empty but still carries these columns.
    """
    columns = ["review_idx", "rating", "review", "sentence", "segment"]

    sent_splitter = English()
    sent_splitter.add_pipe(sent_splitter.create_pipe('sentencizer')) # updated

    # splitting sentence into segments according to punctuation
    punc_split = re.compile(',|!')

    # Rows are collected as plain dicts and turned into a frame once at the
    # end: appending to a DataFrame inside the loop copies the whole frame on
    # every iteration, and DataFrame.append is gone from recent pandas.
    records = []
    for index, row in df.iterrows():
        rating = row['stars']
        review = row['text']
        doc = sent_splitter(review)
        for sent in doc.sents:
            sentence = sent.text.strip()
            for piece in punc_split.split(sentence):
                segment = piece.strip()
                if not segment:
                    # drop fragments that are empty after trimming
                    continue
                records.append({
                    "review_idx": index,
                    "rating": rating,
                    "review": review,
                    "sentence": sentence,
                    "segment": segment,
                })

    return pd.DataFrame(records, columns=columns)


## 3. Get Noun-Adj Pairs from a Segment

###### Noun-adjective pairs come in two forms: the noun can be either a single word (e.g. food) or a combination of words (e.g. quality of service). We handle the two cases separately.

###### A combination of POS tagging and dependency relations is used to extract the noun-adjective pairs.

In [None]:
def get_pairs(document):
    """Extract ``noun-adjective`` strings for single-word subject nouns.

    The text is parsed with the notebook-level ``nlp`` pipeline. Each token
    tagged ``nsubj`` (pronouns and determiners excluded) is paired with the
    first ``amod``/``acomp`` child of its head, looking at the head's left
    children before its right children.

    Args:
        document: Raw text of one segment.

    Returns:
        List of lower-cased ``"noun-adjective"`` strings; empty when no
        subject has a matching adjective.
    """
    parsed = nlp(document)
    adjective_deps = ('amod', 'acomp')
    pairs = []

    for token in parsed:
        # Keep nominal subjects only, skipping pronouns and determiners.
        if token.dep_ != 'nsubj' or token.pos_ in ('PRON', 'DET'):
            continue

        # Adjectives hang off the subject's head: left children first, then right.
        head = token.head
        candidates = [child for child in head.lefts if child.dep_ in adjective_deps]
        candidates += [child for child in head.rights if child.dep_ in adjective_deps]
        if not candidates:
            continue

        # Only the first adjective found is paired with the noun.
        pairs.append(token.text.lower() + "-" + candidates[0].text.lower())

    return pairs

In [None]:
def get_compound_pairs(document):
    """Extract ``compound noun-adjective`` strings from a segment.

    The text is parsed with the notebook-level ``nlp`` pipeline. Every token
    tagged ``compound`` that does not directly follow another ``compound``
    token starts a compound noun spanning that token and its head. The noun
    is paired with the first ``amod``/``acomp`` child of the head noun's own
    head (left children are considered before right children).

    Args:
        document: Raw text of one segment.

    Returns:
        List of lower-cased ``"compound noun-adjective"`` strings; empty when
        nothing matches.
    """
    document = nlp(document)
    adjective_deps = ('amod', 'acomp')

    # Compound modifiers, keeping only the first token of a run of
    # consecutive compounds to prevent compound-compound noun pairs.
    compound_nouns = [
        token for token in document
        if token.dep_ == 'compound'
        and (token.i == 0 or document[token.i - 1].dep_ != 'compound')
    ]

    noun_adj_pairs = []
    for token in compound_nouns:
        head_noun = token.head

        # The modifier usually precedes its head noun, but nothing here
        # guarantees it. Ordering the bounds keeps both tokens in the span;
        # a reversed slice would otherwise leave the noun part empty.
        first = min(token.i, head_noun.i)
        last = max(token.i, head_noun.i)
        compound_noun = document[first: last + 1]

        # find adj for this noun among the children of the head noun's head
        governor = head_noun.head
        adjective_list = [adj for adj in governor.lefts if adj.dep_ in adjective_deps]
        adjective_list += [adj for adj in governor.rights if adj.dep_ in adjective_deps]

        if adjective_list:
            ## create a compound noun-adj string pair
            string_pair = str(compound_noun).lower() + "-" + str(adjective_list[0]).lower()
            noun_adj_pairs.append(string_pair)

    return noun_adj_pairs

## 4. Get Frequent Noun-Adj Pairs of Reviews of X Star Rating


In [None]:
def get_frequent_pairs(star, sample_size = 20):
    """Count noun-adjective pairs in a sample of reviews with one star rating.

    Args:
        star: Star rating whose reviews are sampled.
        sample_size: Number of reviews to sample. Ignored for 1-star
            reviews, which always use 50.

    Returns:
        DataFrame indexed by the pair string with a single ``Frequency``
        column, sorted by descending frequency.
    """
    # 1-star reviews always use a larger sample, overriding the argument.
    if(star == 1.0):
        sample_size = 50

    ## get sample data
    filtered_data = sample_data(star, sample_size)

    ## split reviews into segments for easier analysis
    new_data = split_reviews(filtered_data)

    ## find adj-noun pairs
    new_data["singular_pairs"] = new_data['segment'].apply(get_pairs)
    new_data["compound_pairs"] = new_data['segment'].apply(get_compound_pairs)

    ## per segment: prefer the compound pairs, fall back to the singular ones.
    ## Segments without any pair contribute nothing. The chosen pairs are
    ## gathered directly so no column is written onto a filtered slice.
    global_list = []
    for singular, compound in zip(new_data['singular_pairs'], new_data['compound_pairs']):
        chosen = compound if len(compound) > 0 else singular
        global_list.extend(chosen)

    ## count each distinct pair
    unique_pair_dict = dict(Counter(global_list))

    pair_df = pd.DataFrame.from_dict(unique_pair_dict, orient='index', columns=['Frequency']).sort_values(by=['Frequency'], ascending=False)

    return pair_df





## Data Preparation 

In [None]:

# Location of the review file. This is an absolute path (presumably a Colab
# upload location - adjust it when running the notebook elsewhere).
json_file_path = "/content/reviewSelected100 (1).json"

# Load the reviews into the notebook-level `data` frame used by sample_data.
data = pd.read_json(json_file_path)
data.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,8aoJJdKEO3ypoZNszpPu7Q,bGgAL09pxLnV_FFgR4ZADg,ZBE-H_aUlicix_9vUGQPIQ,5,0,0,0,"We had my Mother's Birthday Party here on 10/29/16. What a Great time we all had. The food, music and waiters were Great!!! Thanks Lyles!!!",2016-11-09 20:07:25
1,J5NOCLdhuhor7USRhtYZ8w,pFCb-1j6oI3TDjr26h2cJQ,e-YnECeZNt8ngm0tu4X9mQ,4,0,0,0,"Good Korean grill near Eaton Centre. The marinate is good. We got beef, ox liver, salmon, fish fillet, chicken, pork, pork belly. The fish fillet was bland and liver was meh. Salmon and chicken was really flavourable. Such a fun place to eat at for a date or group of friends. Even alone. No judgments here. \nThe staff is attentive, nice and considerate. Bigger groups will most likely be seated on the second floor which is way bigger.\nCaution: will smell like BBQ grill after.",2015-12-05 05:06:43
2,PXiLWAYRt3xnHaJ8MB4rzw,mEzc6LeTNiQgIVsq3poMbg,j7HO1YeMQGYo3KibMXZ5vg,5,2,1,3,"Was recommended to try this place by few people and today was my first time here. All I can say is, I am coming back very soon.\n\nSERVICE\nWasn't sure if the guy was the owner but he was friendly and talked story while we waited for our food. Loved it!! Food came out within 10 min. \n\nFOOD\nTried hamburger steak and it was so delicious. Gravy/sauce they put on the hamburger steak was perfect! Also came with onion rings on top which I love. Chicken katsu was amazing! Chicken katsu here is crunchy and surprisingly has a flavor by itself that you really don't need a sauce for it. Best chicken katsu I had. \n\nOVERALL\nIt was a journey to get to this place as it took about 30min from my house but the service and food here made it worth the drive. I also love how they had a poster of Keali'i Reichel. (They had other posters but Keali'i Reichel happens to be my favorite). Place is clean, service is fast and friendly and food is delicious. What more could you ask for?",2014-10-11 05:16:15
3,VrLarvxZYJm74yAqtpe9PQ,o-zUN2WEZgjQS7jnNsec0g,7e3PZzUpG5FYOTGt3O3ePA,3,0,0,0,"Ambience: Would not expect something this nice at Cannery Hotel but it is the nicest looking restaurant there. More for couples than group gatherings.\n\nService: The ambience & food make up for this, which unfortunately for us, the service has been terrible. We have come fairly close to restaurant closing both times (within the hour), but they do close very early for Vegas. The staff makes it VERY clear that they want to go home right from the start in hurrying orders and are more aggressive as time goes on. Unfortunate.\n\nFood: Very good. A little salty on some items during our first visit but good overall and again, warrants the overall 3 stars. Steak. Scallops wrapped in bacon. Calamari. Cobb salad. etc.",2016-07-25 03:45:26
4,C1CUpidlVFprUCkApqzCmA,Wlx0iBXJvk4x0EeOt2Bz1Q,vuHzLZ7nAeT-EiecOkS5Og,1,11,0,3,"Absolutely the WORST pool company that I have EVER had to deal with. The customer service is horrible. After leaving many messages over the course of a few weeks I was only able to contact them when I called them AGAIN. I asked to speak with the actual pool tech who initially came to my house. The RUDE lady on the phone told me that she was more than capable to answer my questions - about a pump that SHE HAS NOT SEEN, and about a conversation I had with the tech THAT SHE DID NOT HEAR. \n\nI was assigned to them by my home warranty company, and I will be filing a serious complaint with them and the BBB. I was told to take the cash out option from the warranty company for the part and then they would do the work and I could just pay them directly. After I received the cash out and called to schedule the appointment I was told that I need to replace the entire pool pump system and that would cost an additional $400 and that there was an electrical problem and that it would cost additional money as well. \n\nI was then told that the salt cell that I need is no longer on the market, even though I was able to find them on Amazon. This is the reason that I am STILL trying to discuss my options with the tech who ACTUALLY SAW MY POOL AND THE PUMP. \n\nIf you don't want to be scammed or jacked around with USE ANOTHER POOL COMPANY.",2016-04-11 18:49:11


## Clean data

In [None]:
def clean(x):
    """Collapse every run of whitespace, newlines included, into one space.

    Args:
        x: Text to normalise.

    Returns:
        ``x`` with each whitespace run replaced by a single space. Leading
        and trailing whitespace is reduced to one space, not removed.
    """
    # `\s+` already covers newlines. A separate `\n` alternative would match
    # one newline at a time and leave several spaces behind.
    return re.sub(r'\s+', ' ', x)

In [None]:
def word_count(x):
    r"""Return how many ``\w+`` runs (word-character sequences) ``x`` contains."""
    return sum(1 for _ in re.finditer(r'\w+', x))
    

In [None]:
# Replace newlines/whitespace in every review; this overwrites the raw `text` column.
data['text'] = data['text'].apply(clean)
# Store the number of words of each cleaned review in a new `word count` column.
data['word count'] = data['text'].apply(word_count)


In [None]:
# Call the method: without the parentheses the cell only shows the bound method object.
data['word count'].value_counts()

<bound method IndexOpsMixin.value_counts of 0        28 
1        85 
2        190
3        122
4        264
        ... 
15295    52 
15296    52 
15297    618
15298    27 
15299    70 
Name: word count, Length: 15300, dtype: int64>

In [None]:
# Frequency table with one row per distinct review length: `word length` is
# the number of words, `count` the number of reviews of that length.
# Naming the axis and the value column explicitly avoids relying on the
# column names value_counts/reset_index produce, which differ between
# pandas versions.
word_count_df = (
    data['word count']
    .value_counts()
    .rename_axis('word length')
    .reset_index(name='count')
)

In [None]:
import plotly.express as px
# Bar chart of the `count` column against the `word length` column of word_count_df.
fig = px.bar(word_count_df, x='word length', y='count')
fig.show()

### Compound Vs Singular Pairs

In [None]:
star = 1.0
## get sample data
filtered_data = sample_data(star, 50)

## split reviews into segments for easier analysis
new_data = split_reviews(filtered_data)

## find adj-noun pairs
new_data["singular_pairs"] = new_data['segment'].apply(get_pairs)
new_data["compound_pairs"] = new_data['segment'].apply(get_compound_pairs)

## filter segments with detected noun-adj pairs
## (.copy() makes the result an independent frame, so columns can be added
## to it later without writing to a slice of the unfiltered frame)
new_data = new_data[(new_data['singular_pairs'].map(len) > 0) | (new_data['compound_pairs'].map(len) > 0)].copy()

new_data.shape

Sampling 50 reviews 


(65, 7)

In [None]:
# Show each remaining segment's source sentence next to its singular and compound pairs.
new_data[['sentence', 'singular_pairs', 'compound_pairs']]

Unnamed: 0,sentence,singular_pairs,compound_pairs
5,"The gazpacho was good but not extraordinary and to my surprise, the waiter didn't even know what kind of olives I was being served.",[gazpacho-good],[]
7,The sangria was good but... honestly... my glass was more ICE than it was DRINK.,[sangria-good],[]
31,"I asked if most people get regular polish, and I was told yes, that regular polish would be just fine.",[polish-fine],[]
49,The girl at the front desk was extremely sweet.,[girl-sweet],[]
50,The d�cor and atmosphere was decent.,[cor-decent],[�cor-decent]
60,"This place is overpriced, management is super unfriendly, not happy about the food either.",[place-overpriced],[]
61,"This place is overpriced, management is super unfriendly, not happy about the food either.",[management-unfriendly],[]
69,The service was rude.,[service-rude],[]
79,And then when I arrived at the new time made to wait until I was told the Nurse Practitioner was too busy to see me.,[practitioner-busy],[nurse practitioner-busy]
82,I am still a bit shocked by how unprofessional their office procedure was.,[procedure-unprofessional],[office procedure-unprofessional]


In [None]:

# Keep only the segments for which both extractors returned at least one pair.
segments_w_both_pairs_data = new_data[(new_data['singular_pairs'].map(len) > 0) & (new_data['compound_pairs'].map(len) > 0)]
segments_w_both_pairs_data[['sentence', 'singular_pairs', 'compound_pairs']]

Unnamed: 0,sentence,singular_pairs,compound_pairs
50,The d�cor and atmosphere was decent.,[cor-decent],[�cor-decent]
79,And then when I arrived at the new time made to wait until I was told the Nurse Practitioner was too busy to see me.,[practitioner-busy],[nurse practitioner-busy]
82,I am still a bit shocked by how unprofessional their office procedure was.,[procedure-unprofessional],[office procedure-unprofessional]
215,The grilled meat platter was suspect.,[platter-suspect],[meat platter-suspect]
285,Food quality is NOT good.,[quality-good],[food quality-good]


Compared with singular pairs, compound pairs capture a more specific noun, such as "bubble tea" instead of "tea".

In [None]:
# Per segment, prefer the compound pairs and fall back to the singular ones.
# One entry is produced for every row, so the list always lines up with the
# frame, even for a row in which neither extractor found a pair.
noun_adj_pair = [
    compound if len(compound) > 0 else singular
    for singular, compound in zip(new_data['singular_pairs'], new_data['compound_pairs'])
]

new_data['noun_adj_pair'] = noun_adj_pair



# **Results**

## **Frequent Noun-adj Pairs of Reviews from 1-5 Stars Rating**

---




In [None]:
# Collect the per-rating tables first and concatenate them once, instead of
# growing results_df (starting from an empty frame) on every iteration.
per_star_frames = []
for star in range(1,6):
  print("Calculating top 10 noun-adj pairs for reviews with {} star rating ...... ".format(star))
  df_per_star= get_frequent_pairs(star)
  col_name = " {} Star Reviews".format(star)
  # Keep the 10 most frequent pairs; the pair strings move from the index
  # into a column named after the rating.
  df_per_star = df_per_star.head(10).rename_axis(col_name).reset_index()
  per_star_frames.append(df_per_star)

# Side by side: one (pairs, Frequency) column pair per star rating.
results_df = pd.concat(per_star_frames, axis = 1)

Calculating top 10 noun-adj pairs for reviews with 1 star rating ...... 
Sampling 50 reviews 
Calculating top 10 noun-adj pairs for reviews with 2 star rating ...... 
Sampling 20 reviews 
Calculating top 10 noun-adj pairs for reviews with 3 star rating ...... 
Sampling 20 reviews 
Calculating top 10 noun-adj pairs for reviews with 4 star rating ...... 
Sampling 20 reviews 
Calculating top 10 noun-adj pairs for reviews with 5 star rating ...... 
Sampling 20 reviews 


In [None]:
# Up to 10 most frequent pairs per star rating, each with its Frequency column.
results_df

Unnamed: 0,1 Star Reviews,Frequency,2 Star Reviews,Frequency.1,3 Star Reviews,Frequency.2,4 Star Reviews,Frequency.3,5 Star Reviews,Frequency.4
0,donut-hard,1,place-clean,2,beer selection-fine,1,place-clean,2,food-amazing,3
1,meat-fatty,1,place-busy,1,seafood pancake-good,1,place-tasty,1,food-good,2
2,attitude-horrendous,1,saturday mornings-busy,1,dish-amazing,1,fries-chunky,1,service-great,1
3,money-better,1,beer selection-good,1,scallops-great,1,employees-friendly,1,atmosphere-cool,1
4,waitress-nonexistent,1,barmaid-busy,1,waitstaff-friendly,1,ambiance-festive,1,stay-longer,1
5,things-batter,1,bartender-busy,1,location-professional,1,cheat day-favorite,1,pool-open,1
6,pho-okay,1,server-nice,1,sandwich-ok,1,broth-awesome,1,staff-nice,1
7,folks-able,1,place-alright,1,food-ok,1,service-nice,1,property-quiet,1
8,wait-long,1,namesake-sure,1,chicken-flavorless,1,atmosphere-nice,1,driver-nice,1
9,credit card-unsanitary,1,perk-cup-empty,1,sauce-spicy,1,service-friendly,1,kitchenette - mini-mini,1


-------------------------------------------------------------------------------------------------------------END--------------------------------------------------------------------------------------------------------------