In [2]:
%pip install cohere

Collecting cohere
  Using cached cohere-1.3.9.tar.gz (8.9 kB)
Building wheels for collected packages: cohere
  Building wheel for cohere (setup.py): started
  Building wheel for cohere (setup.py): finished with status 'done'
  Created wheel for cohere: filename=cohere-1.3.9-cp38-cp38-win_amd64.whl size=9459 sha256=5f4925dea6c685683738bec81d52741b948ffaa61291602b8844c8672dde5c6c
  Stored in directory: c:\users\acer\appdata\local\pip\cache\wheels\ff\e7\08\7262268671cab26185389b733777907b447bba38db4232196c
Successfully built cohere
Installing collected packages: cohere
Successfully installed cohere-1.3.9
Note: you may need to restart the kernel to use updated packages.


In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cohere
from cohere.classify import Example
import json

LIMIT = 10000

In [49]:
def load_data(filepath, nrows=None):
    with open(filepath, encoding="utf8") as jsonfile:
        count = 0
        items = []
        line = jsonfile.readline()
        while (nrows is None or count < nrows) and line:
            count += 1
            obj = json.loads(line)
            items.append(obj)
            line = jsonfile.readline()
        return pd.DataFrame(items)

reviews = load_data('Dataset/yelp_academic_dataset_review.json', LIMIT)
print('Review dataset size = {}'.format(reviews.shape[0]))

business = load_data('../Dataset/yelp_academic_dataset_business.json')
print('Business dataset size = {}'.format(business.shape[0]))

Review dataset size = 10000


In [50]:
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4.0,3,1,1,Apparently Prides Osteria had a rough summer a...,2014-10-11 03:34:02
1,8bFej1QE5LXp4O05qjGqXA,YoVfDbnISlW0f7abNQACIg,RA4V8pr014UyUbDvI-LW2A,4.0,1,0,0,This store is pretty good. Not as great as Wal...,2015-07-03 20:38:25
2,NDhkzczKjLshODbqDoNLSg,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,5.0,0,0,0,I called WVM on the recommendation of a couple...,2013-05-28 20:38:06
3,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2.0,1,1,1,I've stayed at many Marriott and Renaissance M...,2010-01-08 02:29:15
4,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,4.0,0,0,0,The food is always great here. The service fro...,2011-07-28 18:05:01


In [51]:
reviews.shape

(10000, 9)

In [52]:
reviews.isnull().sum()

review_id      0
user_id        0
business_id    0
stars          0
useful         0
funny          0
cool           0
text           0
date           0
dtype: int64

In [53]:
review_1 = reviews[['business_id', 'stars', 'text']]
review_2 = review_1.reset_index(drop=True)

In [54]:
review_2.head(50)

Unnamed: 0,business_id,stars,text
0,buF9druCkbuXLX526sGELQ,4.0,Apparently Prides Osteria had a rough summer a...
1,RA4V8pr014UyUbDvI-LW2A,4.0,This store is pretty good. Not as great as Wal...
2,_sS2LBIGNT5NQb6PD1Vtjw,5.0,I called WVM on the recommendation of a couple...
3,0AzLzHfOJgL7ROwhdww2ew,2.0,I've stayed at many Marriott and Renaissance M...
4,8zehGz9jnxPqXtOc7KaJxA,4.0,The food is always great here. The service fro...
5,xGXzsc-hzam-VArK6eTvtw,1.0,"This place used to be a cool, chill place. Now..."
6,EXOsmAB1s71WePlQk0WZrA,2.0,"The setting is perfectly adequate, and the foo..."
7,DbXHNl890xSXNiyRczLWAg,5.0,Probably one of the better breakfast sandwiche...
8,mD-A9KOWADXvfrZfwDs-jw,4.0,I am definitely a fan of Sports Authority. Th...
9,EEHhKSxUvJkoPSzeGKkpVg,5.0,I work in the Pru and this is the most afforda...


In [55]:
import nltk
import re
from nltk import pos_tag, pos_tag_sents
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import sent_tokenize, word_tokenize, pos_tag

In [56]:
# 1. The text data in review dataset is cleaned and tokenized to sentences and then words
# 2. Tag all words with position tags and convert to wordnet tags
# 3. Lemmatize the words and create synsets
# 4. Compute sentiment score (positive score - negative score)
# 5. Return a sentiment polarity score: 1 = positive, 0 = negative

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('sentiwordnet')
 
lemmatizer = WordNetLemmatizer()
  
def convert_tag(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None
 
def clean_text(text):
    text = text.replace("<br />", " ")
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))
    #text = text.decode("utf-8")
    return text

def compute_swn_polarity_score(text):
    sentiment_score = 0.0
    num_tokens = 0
    text = clean_text(text)
    sentences = sent_tokenize(text)
    for sentence in sentences:
        tagged_sentence = pos_tag(word_tokenize(sentence))
        for word, tag in tagged_sentence:
            wn_tag = convert_tag(tag)
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue
            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue
            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())
            sentiment_score += swn_synset.pos_score() - swn_synset.neg_score()
            num_tokens += 1
    # sum greater than 0 => positive sentiment
    if sentiment_score >= 0:
        return 1
    # All other scenarios => negative sentiment
    else:
        return 0

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


In [57]:
# Create a new column 'polarity' with sentiment polarity as 1 (positive) or 0 (negative)
review_2['polarity'] = review_2['text'].apply(compute_swn_polarity_score)
review_2.head(50)

Unnamed: 0,business_id,stars,text,polarity
0,buF9druCkbuXLX526sGELQ,4.0,Apparently Prides Osteria had a rough summer a...,1
1,RA4V8pr014UyUbDvI-LW2A,4.0,This store is pretty good. Not as great as Wal...,1
2,_sS2LBIGNT5NQb6PD1Vtjw,5.0,I called WVM on the recommendation of a couple...,1
3,0AzLzHfOJgL7ROwhdww2ew,2.0,I've stayed at many Marriott and Renaissance M...,1
4,8zehGz9jnxPqXtOc7KaJxA,4.0,The food is always great here. The service fro...,1
5,xGXzsc-hzam-VArK6eTvtw,1.0,"This place used to be a cool, chill place. Now...",0
6,EXOsmAB1s71WePlQk0WZrA,2.0,"The setting is perfectly adequate, and the foo...",1
7,DbXHNl890xSXNiyRczLWAg,5.0,Probably one of the better breakfast sandwiche...,1
8,mD-A9KOWADXvfrZfwDs-jw,4.0,I am definitely a fan of Sports Authority. Th...,1
9,EEHhKSxUvJkoPSzeGKkpVg,5.0,I work in the Pru and this is the most afforda...,1


In [58]:
def transform_review(review_2):
    review_3 = review_2.reset_index()
    review_3['stars'] = review_3['stars'].apply(lambda x: round(x,2))
    review_3['polarity'] = review_3['polarity'].apply(lambda x: round(x,2))
    review_3.rename(columns={'polarity':'net_positive_sentiment_score'}, inplace=True)
    review_3.rename(columns={'stars':'review_stars'}, inplace=True)
    return review_3
review_3 = transform_review(review_2)
review_3.head(50)

Unnamed: 0,index,business_id,review_stars,text,net_positive_sentiment_score
0,0,buF9druCkbuXLX526sGELQ,4.0,Apparently Prides Osteria had a rough summer a...,1
1,1,RA4V8pr014UyUbDvI-LW2A,4.0,This store is pretty good. Not as great as Wal...,1
2,2,_sS2LBIGNT5NQb6PD1Vtjw,5.0,I called WVM on the recommendation of a couple...,1
3,3,0AzLzHfOJgL7ROwhdww2ew,2.0,I've stayed at many Marriott and Renaissance M...,1
4,4,8zehGz9jnxPqXtOc7KaJxA,4.0,The food is always great here. The service fro...,1
5,5,xGXzsc-hzam-VArK6eTvtw,1.0,"This place used to be a cool, chill place. Now...",0
6,6,EXOsmAB1s71WePlQk0WZrA,2.0,"The setting is perfectly adequate, and the foo...",1
7,7,DbXHNl890xSXNiyRczLWAg,5.0,Probably one of the better breakfast sandwiche...,1
8,8,mD-A9KOWADXvfrZfwDs-jw,4.0,I am definitely a fan of Sports Authority. Th...,1
9,9,EEHhKSxUvJkoPSzeGKkpVg,5.0,I work in the Pru and this is the most afforda...,1


In [28]:
review_2.count()

text        10000
polarity    10000
dtype: int64

In [43]:
review_sample = review_2.sample(10000)

In [44]:
from sklearn.model_selection import train_test_split

In [45]:
sentences_train, sentences_test, labels_train, labels_test = train_test_split(
    list(review_sample['text']), list(review_sample['polarity']), test_size=0.25, random_state=42)

In [38]:
from sklearn import tree

In [39]:
clf = tree.DecisionTreeClassifier(max_depth = 5) 
clf.fit(sentences_train, labels_train)

ValueError: could not convert string to float: 'On my recent visit to Boston, I heard no less than three people raaave about Parish Cafe\'s gourmet sandwiches, invented by the chefs of some of the top restaurants in Beantown. Since I always have a place in my heart for a good sandwich, I had to check it out. \n\nThe outdoor seating area is obviously quite popular, as suggested by the sign near the entrance suggesting that anyone who wants to sit indoors does not need to stand in queue. You can\'t tell from the outside, but there is plenty of seating inside (though mostly for smaller parties) and the place has a somewhat pub-like feel. I took a seat at the bar, contemplated one of their many beers, and decided to ask for a suggestion from the bartender/server on what sandwich to try.\n\n"Well, it depends on what you\'re feeling like." And she was right, there is a type of sandwich for almost any type of appetite. Hearty meatloaf? The indulgent daily special of beer-battered, deep-fried mussels? Since I didn\'t feel like spending the rest of my day fighting a food coma, I settled on the "Blue Ginger" sandwich (created by Ming Tsai) - a grilled yet perfectly rare (and it is RED inside) tuna steak that is brushed with a teriyaki glaze. The condiments (lettuce, tomato, avocado slices, wasabi mayo) were the right combination, but the wasabi mayo wasn\'t strong enough to leave an impression or contrast. Overall, it was quite fresh, delicious, and well-seasoned...BUT (always a but w/ me, sorry that\'s how I roll) I think that focaccia is not structurally appropriate to hold together this surprisingly girthy sandwich. The bread, while perfectly toasted, herbed, and not too oily, inevitably gets weighed down and soggy from the fillings...leaving you with more sandwich on your hands than in your mouth. I took took to eating the rest of this with a fork and knife, which sorta defeats the whole point of it being a sandwich, doncha think?\n\nIn my opinion, the Asian slaw served w/ the Blue Ginger was sort of an afterthought. Although it had a nice sesame flavor, there was a missing acidic component, and the square-cut napa cabbage just looked sloppy (a thin julienne would have been much more appetizing to the eye).\n\nAfter I was done with my sandwich and fully satisfied, I was already thinking of what I wanted to try the next time I am in town.. But one question lingered: if the concept is to get the best chefs in Boston to create the menu...then why are so many of the sandwiches made by the Parish Cafe chefs???'

In [40]:
score = clf.score(sentences_test, labels_test)
print(f"Validation accuracy on medium is {100*score}%!")


NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.