In [52]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import json

In [53]:
# Load the Yelp review sample (tab-separated file) into a DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the
# file carries a saved index -- consider index_col=0 if that column is not wanted.
D = pd.read_csv('../data/yelp_example_1_small.tsv', sep='\t')

In [54]:
# Preview the first two rows to check which columns were parsed.
D.head(2)

Unnamed: 0,content,score,business,avgstars
0,This place is WAAAY over priced for the generi...,1,Lee's Buffet,2.0
1,Our taxi driver had told us to go to this plac...,5,Village Pub and Cafe,3.5


## spaCy

In [4]:
import spacy

In [5]:
# Load the "en_core_web_sm" spaCy pipeline; it is applied to the example review below.
nlp = spacy.load("en_core_web_sm")

In [33]:
# Use the 'content' field of the row labelled 1200 as the running example review.
text = D.loc[1200, 'content']

In [34]:
# Show the selected review; print() renders its line breaks instead of the escaped repr.
print(text)

YESS!  

A legit,  24 hour, Pho spot in Vegas that is just a cab ride away from the strip!

Me and my friends came here at 4am after a night out and we were starving!  On a scale of 1 to 10 my level of excitement was 11.  Upon entering, one thing turned me off.  The smell!!  It smelled like they've used the same old, dirty, moldy, rag to wipe down the tables for 100000 years without washing it. The whole restaurant had an old wet towel smell to it.  Regardless, I was still excited to eat. 

I ordered the Rare Steak and Brisket Pho  (http:\/\/www.yelp.com\/biz_photos\/pho-kim-long-las-vegas?select=9hI7mkZ4PHU7SjC5hxZqLA#Q1aVULe4TLE41w5R-r7gWg).  The broth was flavorful, refreshing, and not too oily.  The brisket and steak were perfectly cooked and tender.  They gave me a generous amount of noodles too!  Don't forget to ask for the side of sliced onions!  Maybe it's just  a Korean thing and we're trying to make ghetto-kimchi, haha.  (http:\/\/www.yelp.com\/biz_photos\/pho-kim-long-las-ve

In [35]:
# Run the spaCy pipeline on the review and list every detected entity as:
# entity text, start character offset, end character offset, entity label.
for ent in nlp(text).ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

24 hour 19 26 TIME
Pho 28 31 GPE
Vegas 40 45 GPE
4am after a night 123 140 TIME
1 182 183 CARDINAL
10 187 189 CARDINAL
11 217 219 CARDINAL
one 237 240 CARDINAL
100000 years 365 377 DATE
the Rare Steak 506 520 WORK_OF_ART
Korean 885 891 NORP


## TextRazor

In [41]:
import textrazor

In [42]:
def topics_entities(doc_id, text, client):
    """Collect the entities and topics found in one document as plain dicts.

    Parameters
    ----------
    doc_id
        Identifier copied unchanged into every returned record under
        ``'doc_id'`` so records can be traced back to their document.
    text
        The text handed to ``client.analyze``.
    client
        Any object exposing ``analyze(text)`` whose result provides
        ``entities()`` and ``topics()`` (in this notebook, a
        ``textrazor.TextRazor`` instance).

    Returns
    -------
    tuple(list[dict], list[dict])
        ``(entities, topics)``. Entity records have the keys ``id``,
        ``relevance``, ``confidence``, ``dbpedia_types``, ``wikipedia``,
        ``matched_text``, ``matched_words`` and ``doc_id``; topic records have
        ``wikidata``, ``wikipedia``, ``score``, ``label`` and ``doc_id``.

    Notes
    -----
    The call is best-effort: if the analysis (or reading its result) fails,
    a ``RuntimeWarning`` describing the failure is issued and whatever was
    collected up to that point is returned. ``KeyboardInterrupt`` and
    ``SystemExit`` are *not* swallowed, so a long batch can still be stopped.
    """
    import warnings  # local import: keeps this cell runnable on its own

    entities, topics = [], []
    try:
        response = client.analyze(text)
        for entity in response.entities():
            entities.append({
                'id': entity.id,
                'relevance': entity.relevance_score,
                'confidence': entity.confidence_score,
                'dbpedia_types': entity.dbpedia_types,
                'wikipedia': entity.wikipedia_link,
                'matched_text': entity.matched_text,
                'matched_words': entity.matched_words,
                'doc_id': doc_id,
            })
        for topic in response.topics():
            topics.append({
                'wikidata': topic.wikidata_id,
                'wikipedia': topic.wikipedia_link,
                'score': topic.score,
                'label': topic.label,
                'doc_id': doc_id,
            })
    except Exception as exc:
        # Keep the best-effort contract (return what we have), but make the
        # failure visible instead of silently producing empty results.
        warnings.warn(
            f"topics_entities failed for doc_id={doc_id!r}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
    return entities, topics

In [48]:
# Read the API credentials from a local JSON file so that no key is written in the notebook.
with open('../data/keys.json', 'r') as infile:
    k = json.load(infile)

In [49]:
# Set the TextRazor API key (the 'razor' entry loaded into k) on the textrazor module.
textrazor.api_key = k['razor']

# Client configured with the "entities" and "topics" extractors.
client = textrazor.TextRazor(extractors=["entities", "topics"])

# Analyse the example review; the doc_id (1 here) is copied into every returned record.
e, t = topics_entities(1, text, client)

In [55]:
# Display the topic records returned for the example review.
t

[{'wikidata': 'Q420646',
  'wikipedia': 'http://en.wikipedia.org/Pho',
  'score': 1,
  'label': 'Pho',
  'doc_id': 1},
 {'wikidata': 'Q1778821',
  'wikipedia': 'http://en.wikipedia.org/Category:Cuisine',
  'score': 1,
  'label': 'Cuisine',
  'doc_id': 1},
 {'wikidata': None,
  'wikipedia': 'http://en.wikipedia.org/Category:Ancient_dishes',
  'score': 1,
  'label': 'Ancient dishes',
  'doc_id': 1},
 {'wikidata': 'Q10675206',
  'wikipedia': 'http://en.wikipedia.org/Category:Food_ingredients',
  'score': 1,
  'label': 'Food ingredients',
  'doc_id': 1},
 {'wikidata': 'Q192628',
  'wikipedia': 'http://en.wikipedia.org/Category:Beef',
  'score': 1,
  'label': 'Beef',
  'doc_id': 1},
 {'wikidata': 'Q1427844',
  'wikipedia': 'http://en.wikipedia.org/Category:Meat_dishes',
  'score': 1,
  'label': 'Meat dishes',
  'doc_id': 1},
 {'wikidata': None,
  'wikipedia': 'http://en.wikipedia.org/Category:Prepared_foods_by_main_ingredient',
  'score': 1,
  'label': 'Prepared foods',
  'doc_id': 1},
 {'w

## IBM Watson

In [36]:
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_watson.natural_language_understanding_v1 import Features, EntitiesOptions
from ibm_watson.natural_language_understanding_v1 import ConceptsOptions, SentimentOptions
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

In [37]:
# Load the API credentials for this section from the local JSON keys file.
with open('../data/keys.json', 'r') as infile:
    k = json.load(infile)

In [38]:
# Authenticate with the 'watson' entry of the keys file and create the Natural
# Language Understanding client, pinned to API version '2018-03-16'.
authenticator = IAMAuthenticator(k['watson'])
service = NaturalLanguageUnderstandingV1(version='2018-03-16', authenticator=authenticator)
# NOTE(review): the service URL is hard-coded to one specific instance -- confirm it
# matches the account the key belongs to before re-running.
service.set_service_url('https://api.eu-gb.natural-language-understanding.watson.cloud.ibm.com/instances/b09de8e9-644f-45d5-9f53-65c51aed0524')

In [39]:
# Ask Watson NLU for document sentiment, concepts, and entities (with
# per-entity sentiment) on the example review; keep the parsed result.
response = service.analyze(
    text=text,
    features=Features(
        sentiment=SentimentOptions(),
        concepts=ConceptsOptions(),
        entities=EntitiesOptions(sentiment=True),
    ),
).get_result()

In [40]:
# Display the result returned by the Watson analysis call.
response

{'usage': {'text_units': 1, 'text_characters': 1202, 'features': 3},
 'sentiment': {'document': {'score': 0.458612, 'label': 'positive'}},
 'language': 'en',
 'entities': [{'type': 'Location',
   'text': 'Vegas',
   'sentiment': {'score': 0.0, 'label': 'neutral'},
   'relevance': 0.790891,
   'disambiguation': {'subtype': ['City']},
   'count': 1},
  {'type': 'Quantity',
   'text': '100000 years',
   'sentiment': {'score': 0.0, 'label': 'neutral'},
   'relevance': 0.790891,
   'count': 1},
  {'type': 'Quantity',
   'text': '24 hour',
   'sentiment': {'score': 0.0, 'label': 'neutral'},
   'relevance': 0.790891,
   'count': 1}],
 'concepts': [{'text': 'Taste',
   'relevance': 0.942114,
   'dbpedia_resource': 'http://dbpedia.org/resource/Taste'},
  {'text': 'Water',
   'relevance': 0.922209,
   'dbpedia_resource': 'http://dbpedia.org/resource/Water'},
  {'text': 'Steak',
   'relevance': 0.808267,
   'dbpedia_resource': 'http://dbpedia.org/resource/Steak'},
  {'text': '2005 singles',
   'r