In [1]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

# spaCy pipeline used by the parsing helpers defined further down in this notebook
spacyModel = spacy.load('en_core_web_sm')

# let pandas show up to 200 characters of a text column before truncating it
pd.set_option('display.max_colwidth', 200)
# IPython magic: render matplotlib figures inside the notebook output
%matplotlib inline

In [2]:
# read the tab-separated review file and keep only the review text column
sentences = (
    pd.read_csv('amazon_alexa.tsv', sep='\t')
    .loc[:, 'verified_reviews']
)
# peek at a handful of random reviews
sentences.sample(5)

1799                                           So far we have enjoyed it.  The only problem we have is I have not been able to call one of my sons with it (the sad part is he is the one who gave it to me:( )
2091    This is a great tool (toy?), room-filling music, very convenient to use, just fun to have. Why only 4 stars? Supposed to work with Logitech Harmony remote: nope. Supposed to work with Denon AV rec...
3021                                           LOVE it. I got this for my boyfriend for his birthday and his kids have a blast asking Alexa all kinds of questions and having her tell silly jokes. We love it.
114                                                       Still learning all the ways to use it.  It is a real hit with my grandkids who ask it to play music and then dance to it.  (5 and 7 yrs).  Very cute.
56                                                                                                                                    Works as advertised. Very easy to 

In [7]:
def _join_words(*words):
  """Join the non-empty words with single spaces."""
  return " ".join(word for word in words if word)


def get_entities(sent, nlp=None):
  """Extract one (subject, object) entity pair from a sentence.

  The sentence is parsed and its tokens are scanned left to right. Compound
  and modifier tokens are remembered so that they can be attached to the next
  subject or object token that is found.

  Args:
    sent: Text of a single sentence/review.
    nlp: Optional callable that turns ``sent`` into an iterable of tokens
      exposing ``.text`` and ``.dep_``. Defaults to the notebook-level
      ``spacyModel`` pipeline.

  Returns:
    A two-item list ``[subject, object]``. An item is ``""`` when no token
    with a matching dependency label was seen; when several subjects or
    objects occur, the last one wins.
  """
  if nlp is None:
    nlp = spacyModel

  ent1 = ""           # subject entity
  ent2 = ""           # object entity

  prv_tok_dep = ""    # dependency tag of the previous non-punctuation token
  prv_tok_text = ""   # text of the previous non-punctuation token

  prefix = ""         # pending compound word(s)
  modifier = ""       # pending modifier word(s)

  for tok in nlp(sent):
    # punctuation is ignored entirely, so it does not interrupt a compound chain
    if tok.dep_ == "punct":
      continue

    # compound word: remember it, glued to a directly preceding compound
    if tok.dep_ == "compound":
      if prv_tok_dep == "compound":
        prefix = prv_tok_text + " " + tok.text
      else:
        prefix = tok.text

    # modifier (amod, nmod, advmod, ...): remember it the same way
    if tok.dep_.endswith("mod"):
      if prv_tok_dep == "compound":
        modifier = prv_tok_text + " " + tok.text
      else:
        modifier = tok.text

    # Substring tests replace `find(...) == True`, which only held when the
    # match started at index 1 and therefore never recognised a bare "obj".
    if "subj" in tok.dep_:
      ent1 = _join_words(modifier, prefix, tok.text)
      # the collected words belong to the subject; start afresh for the object
      prefix = ""
      modifier = ""

    if "obj" in tok.dep_:
      ent2 = _join_words(modifier, prefix, tok.text)

    # remember this token for the compound checks of the next iteration
    prv_tok_dep = tok.dep_
    prv_tok_text = tok.text

  return [ent1.strip(), ent2.strip()]

In [8]:
# one [subject, object] pair per review, in the same order as `sentences`
entity_pairs = [get_entities(review) for review in tqdm(sentences)]

100%|██████████████████████████████████████| 3150/3150 [00:21<00:00, 144.27it/s]


In [20]:
def get_relation(sent):
  """Return the text of the relation (predicate) span of a sentence.

  The sentence is parsed with ``spacyModel`` and matched against a pattern
  made of the ROOT token optionally followed by a preposition, an agent and
  an adjective. The span of the last match reported by the matcher is used.

  Args:
    sent: Text of a single sentence/review.

  Returns:
    The matched span's text, or ``""`` when the matcher reports no match
    (presumably only for an empty document), so that the list of relations
    stays aligned with the list of inputs.
  """
  doc = spacyModel(sent)

  # Matcher class object
  matcher = Matcher(spacyModel.vocab)

  # define the pattern: ROOT, then optional prep / agent / adjective
  pattern = [{'DEP':'ROOT'},
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},
            {'POS':'ADJ','OP':"?"}]

  # the second argument is a list of patterns, hence the extra brackets
  matcher.add("matching_1", [pattern])

  matches = matcher(doc)
  # without this guard an empty result would raise IndexError below
  if not matches:
    return ""

  # each match is (match_id, start, end); keep the last one reported
  _, start, end = matches[-1]

  return doc[start:end].text

In [21]:
# one relation string per review, in the same order as `sentences`
relations = [get_relation(review) for review in tqdm(sentences)]

  0%|                                                  | 0/3150 [00:00<?, ?it/s]

ValueError: [E178] Each pattern should be a list of dicts, but got: {'DEP': 'ROOT'}. Maybe you accidentally passed a single pattern to Matcher.add instead of a list of patterns? If you only want to add one pattern, make sure to wrap it in a list. For example: `matcher.add('matching_1', [pattern])`

In [None]:
# first item of every pair is the subject -> edge source
source = [pair[0] for pair in entity_pairs]

# second item of every pair is the object -> edge target
target = [pair[1] for pair in entity_pairs]

# one row per review: subject, object and the relation linking them
kg_df = pd.DataFrame({'source': source, 'target': target, 'edge': relations})

In [None]:
# create a directed multigraph from the dataframe; the remaining column
# ('edge') is attached to every edge as an attribute
G = nx.from_pandas_edgelist(kg_df, "source", "target",
                            edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12, 12))

# fixed seed so the layout, and therefore the figure, is the same on every run
pos = nx.spring_layout(G, seed=42)
nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos=pos)
plt.show()