In [None]:
import numpy as np
import json
import pandas as pd

def load_yelp_review(path='yelp_academic_dataset_review.json'):
    """Read a line-delimited JSON review file into a text -> stars mapping.

    Args:
        path: File to read; every line is parsed as one JSON object that
            must provide the keys 'text' and 'stars'.

    Returns:
        dict mapping each review's 'text' to its 'stars' value. Because the
        text itself is the key, reviews with identical text collapse into a
        single entry that holds the stars of the last such line.
    """
    with open(path, "r", encoding="utf-8") as handle:
        records = map(json.loads, handle)
        return {record['text']: record['stars'] for record in records}

def load_yelp_business(path='yelp_academic_dataset_business.json'):
    """Read a line-delimited JSON business file into a business_id -> stars mapping.

    Args:
        path: File to read; every line is parsed as one JSON object that
            must provide the keys 'business_id' and 'stars'.

    Returns:
        dict mapping each 'business_id' to the 'stars' value on its line
        (a later line with the same id overwrites an earlier one).
    """
    stars_by_business = {}
    with open(path, "r", encoding="utf-8") as handle:
        for record in map(json.loads, handle):
            stars_by_business[record['business_id']] = record['stars']
    return stars_by_business

In [None]:
# Map each review's text to its star rating, read from the loader's default path.
reviews_stars = load_yelp_review()

In [None]:
# Map each business_id to its recorded star rating, read from the loader's default path.
business_stars = load_yelp_business()

In [None]:
!pip install rarfile
!pip install unrar

In [None]:
#http://www.developintelligence.com/blog/2017/03/predicting-yelp-star-ratings-review-text-python/
from collections import Counter
def balance_classes(xs, ys):
    """Downsample so that every class keeps as many samples as the rarest class.

    Samples are scanned in their original order and the first occurrences of
    each class are kept until that class reaches the size of the smallest one.

    Args:
        xs: Indexable sequence of samples, aligned with ``ys``.
        ys: Sequence of hashable class labels, one per sample.

    Returns:
        (new_xs, new_ys): two lists holding the retained samples and their
        labels in original order. Both are empty when ``ys`` is empty.
    """
    freqs = Counter(ys)
    if not freqs:
        # No labels at all: there is no rarest class to look up.
        return [], []

    # Size of the rarest class caps how many samples each class may keep.
    max_allowable = min(freqs.values())
    num_added = dict.fromkeys(freqs, 0)
    new_xs = []
    new_ys = []
    for i, y in enumerate(ys):
        if num_added[y] < max_allowable:
            new_xs.append(xs[i])
            new_ys.append(y)
            num_added[y] += 1
    return new_xs, new_ys

In [None]:
# Reduce size because of memory issues: only the first MAX_REVIEWS entries are
# used. islice stops after MAX_REVIEWS items, so the full key and value lists
# of the review dict are never built just to be sliced.
from itertools import islice

MAX_REVIEWS = 10000
sample_texts = list(islice(reviews_stars.keys(), MAX_REVIEWS))
sample_stars = list(islice(reviews_stars.values(), MAX_REVIEWS))
balanced_x, balanced_y = balance_classes(sample_texts, sample_stars)

In [None]:
# Display how many samples each star class holds after balancing.
Counter(balanced_y)

In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Baseline features: TF-IDF over unigrams and bigrams, with NLTK's English
# stop-word list passed as stop_words.
# NOTE(review): the vectorizer is fitted on all of balanced_x before the
# train/test split, so its IDF weights also see documents that end up in the
# test set.
english_stop_words = nltk.corpus.stopwords.words('english')
tfidVectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words=english_stop_words,
)
vectors = tfidVectorizer.fit_transform(balanced_x)

In [None]:
#prediction

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 33% of the baseline TF-IDF features, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    vectors, balanced_y, test_size=0.33, random_state=42
)

# Linear SVM baseline; `classifier` is also used later for the per-business
# predictions.
classifier = LinearSVC()
classifier.fit(X_train, y_train)
svc_score = accuracy_score(y_test, classifier.predict(X_test))

# Multinomial Naive Bayes on the same split.
multinomialNBclassifier = MultinomialNB(alpha=0.3)
multinomialNBclassifier.fit(X_train, y_train)
preds = multinomialNBclassifier.predict(X_test)
nb_score = accuracy_score(y_test, preds)

In [None]:
#Stemming
from nltk.stem.porter import *

def stem_tokens(tokens, stemmer):
    """Return a list with ``stemmer.stem(token)`` applied to every token, in order.

    Args:
        tokens: Iterable of tokens to stem.
        stemmer: Object exposing a ``stem(token)`` method.
    """
    return [stemmer.stem(token) for token in tokens]

# One shared Porter stemmer instance, reused for every document.
stemmer = PorterStemmer()

def tokenize(text):
    """Split ``text`` with NLTK's word tokenizer and stem each resulting token."""
    return stem_tokens(nltk.word_tokenize(text), stemmer)

# TF-IDF over unigrams and bigrams of the stemmed tokens produced by `tokenize`.
tfidVectorizerstemmed = TfidfVectorizer(
    tokenizer=tokenize,
    ngram_range=(1, 2),
    stop_words=nltk.corpus.stopwords.words('english'),
)
vectorsstemmed = tfidVectorizerstemmed.fit_transform(balanced_x)

In [None]:
# Hold out 33% of the stemmed features, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    vectorsstemmed, balanced_y, test_size=0.33, random_state=42
)

# Multinomial Naive Bayes on the stemmed features.
multinomialNBclassifier = MultinomialNB(alpha=0.3)
multinomialNBclassifier.fit(X_train, y_train)
preds = multinomialNBclassifier.predict(X_test)
nb_score_stemmed = accuracy_score(y_test, preds)

In [None]:
#Pos Tagging using adjectives
def pos_tokens(tokens):
    """Keep only the tokens that ``nltk.pos_tag`` labels exactly 'JJ'.

    The tag is compared for equality, so any other tag (including longer tags
    that merely start with 'JJ') is dropped.
    """
    return [word for word, tag in nltk.pos_tag(tokens) if tag == 'JJ']

def pos_tokenize(text):
    """Word-tokenize ``text`` with NLTK and return only the 'JJ'-tagged tokens."""
    return pos_tokens(nltk.word_tokenize(text))

# TF-IDF over unigrams and bigrams built only from the tokens kept by `pos_tokenize`.
tfidVectorizeradj = TfidfVectorizer(
    tokenizer=pos_tokenize,
    ngram_range=(1, 2),
    stop_words=nltk.corpus.stopwords.words('english'),
)
vectorsadj = tfidVectorizeradj.fit_transform(balanced_x)

In [None]:
# Hold out 33% of the adjective-only features, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    vectorsadj, balanced_y, test_size=0.33, random_state=42
)

# Multinomial Naive Bayes on the adjective-only features.
multinomialNBclassifier = MultinomialNB(alpha=0.3)
multinomialNBclassifier.fit(X_train, y_train)
preds = multinomialNBclassifier.predict(X_test)
nb_score_pos = accuracy_score(y_test, preds)

In [None]:
# Download and unpack the opinion-lexicon archive.
# install unrar on o/s: sudo apt install unrar
dl_url = 'http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar'
local_name = 'opinion-lexicon-English.rar'

# Set to True to download the .rar archive
download_archive = True
# Set to True to extract files from the archive
extract_archive = True

if download_archive:
    import urllib.request, urllib.parse, urllib.error
    # urlretrieve replaces FancyURLopener().retrieve(): FancyURLopener has been
    # deprecated since Python 3.3 and is no longer available in recent releases.
    urllib.request.urlretrieve(dl_url, local_name)

if extract_archive:
    import rarfile
    # The context manager closes the archive once the listing and extraction
    # are done; files are extracted into the current working directory.
    with rarfile.RarFile(local_name) as rar:
        rar.printdir()
        rar.extractall()

In [None]:
#Sentiment using positive and negative words
def read_wordlist(fname):
    """Load a word list file into a ``{word: True}`` lookup dict.

    The file is read as latin1. Lines whose very first character is ';' are
    skipped, as are lines that are empty once surrounding whitespace is
    stripped; every other line contributes its stripped text as a key.

    Args:
        fname: Path of the word list file.

    Returns:
        dict whose keys are the retained words, each mapped to True.
    """
    words = []
    with open(fname, encoding='latin1') as handle:
        for raw_line in handle:
            if raw_line[0] == ';':
                continue
            entry = raw_line.strip()
            if entry:
                words.append(entry)
    return dict.fromkeys(words, True)

# Sentiment lookups ({word: True}) read from files in the working directory.
# NOTE(review): presumably these two files come from the opinion-lexicon
# archive extracted earlier — confirm.
positive_words = read_wordlist('positive-words.txt')
negative_words = read_wordlist('negative-words.txt')

def senti_tokens(tokens):
    """Return, in order, the tokens found in ``positive_words`` or ``negative_words``."""
    return [
        token
        for token in tokens
        if token in positive_words or token in negative_words
    ]

def senti_tokenize(text):
    """Word-tokenize ``text`` with NLTK and keep only the sentiment-lexicon tokens."""
    return senti_tokens(nltk.word_tokenize(text))

# TF-IDF over unigrams and bigrams built only from the tokens kept by `senti_tokenize`.
tfidVectorizersenti = TfidfVectorizer(
    tokenizer=senti_tokenize,
    ngram_range=(1, 2),
    stop_words=nltk.corpus.stopwords.words('english'),
)
vectorssenti = tfidVectorizersenti.fit_transform(balanced_x)

In [None]:
# Hold out 33% of the sentiment-word features, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    vectorssenti, balanced_y, test_size=0.33, random_state=42
)

# NOTE(review): MultinomialNB() keeps its default alpha here, whereas the
# earlier Naive Bayes runs pass alpha=0.3 — confirm the difference is intended.
multinomialNBclassifier = MultinomialNB()
multinomialNBclassifier.fit(X_train, y_train)
preds = multinomialNBclassifier.predict(X_test)
nb_score_senti = accuracy_score(y_test, preds)

In [None]:
def combined_pos_senti_tokens(tokens):
    """Keep tokens that are tagged 'JJ' or appear in either sentiment word list.

    Tokens are tagged with ``nltk.pos_tag``; a token is kept once, in its
    original position, if its tag equals 'JJ' or it is a key of
    ``positive_words`` or ``negative_words``.
    """
    return [
        word
        for word, tag in nltk.pos_tag(tokens)
        if tag == 'JJ' or word in positive_words or word in negative_words
    ]

def combined_pos_senti_tokenize(text):
    """Word-tokenize ``text`` with NLTK and apply the combined 'JJ'/sentiment filter."""
    return combined_pos_senti_tokens(nltk.word_tokenize(text))

# TF-IDF over unigrams and bigrams built from the tokens kept by
# `combined_pos_senti_tokenize`.
tfidVectorizercombo = TfidfVectorizer(
    tokenizer=combined_pos_senti_tokenize,
    ngram_range=(1, 2),
    stop_words=nltk.corpus.stopwords.words('english'),
)
vectorscombo = tfidVectorizercombo.fit_transform(balanced_x)

# Hold out 33% of the combined features, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    vectorscombo, balanced_y, test_size=0.33, random_state=42
)

# NOTE(review): MultinomialNB() keeps its default alpha here, whereas the
# earlier Naive Bayes runs pass alpha=0.3 — confirm the difference is intended.
multinomialNBclassifier = MultinomialNB()
multinomialNBclassifier.fit(X_train, y_train)
preds = multinomialNBclassifier.predict(X_test)
nb_score_combo = accuracy_score(y_test, preds)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Test-set accuracy, in percent, of every model / feature-extraction variant.
D = {
    'Multinomial_NB': nb_score * 100,
    'SVC': svc_score * 100,
    'Positive_Neg': nb_score_senti * 100,
    'Stemmed': nb_score_stemmed * 100,
    'Adjective_POS': nb_score_pos * 100,
    'Adjective_Senti': nb_score_combo * 100,
}
positions = range(len(D))
plt.figure(figsize=(12, 9))
# Dict views are turned into lists before being handed to matplotlib, which
# expects sequences for bar heights and tick labels.
plt.bar(positions, list(D.values()), align='center')
plt.xticks(positions, list(D.keys()))
plt.ylabel('Accuracy (%)')
plt.title('Test accuracy by model / feature set')
plt.show()

In [None]:
def load_yelp_business_review(path='yelp_academic_dataset_review.json'):
    """Group review texts by business from a line-delimited JSON review file.

    Args:
        path: File to read; every line is parsed as one JSON object that
            must provide the keys 'business_id' and 'text'.

    Returns:
        dict mapping each 'business_id' to the list of its review texts, in
        the order the reviews appear in the file.
    """
    texts_by_business = {}
    with open(path, "r", encoding="utf-8") as handle:
        for record in map(json.loads, handle):
            # setdefault creates the list on a business's first review.
            texts_by_business.setdefault(record['business_id'], []).append(record['text'])
    return texts_by_business

In [None]:
# Group review texts by business_id, read from the loader's default path.
business_reviews = load_yelp_business_review()

In [None]:
# Predict an average star rating for the first `limit` businesses by averaging
# the SVM's predictions over each business's reviews.
limit = 100
preds = {}
for count, (key, reviews) in enumerate(business_reviews.items(), start=1):
    if count > limit:
        break
    # Vectorize and classify all reviews of this business in a single batch
    # rather than issuing one transform/predict call per review.
    review_scores = classifier.predict(tfidVectorizer.transform(reviews))
    # Kept as a one-element array so that `preds[key][0]` lookups keep working.
    preds[key] = np.array([review_scores.sum() / len(reviews)])


In [None]:
# Report the recorded star rating next to the predicted average for every
# business that has a prediction.
for business_id, predicted in preds.items():
    actual = business_stars[business_id]
    print('{} actual :{} predicted: {}'.format(business_id, actual, predicted[0]))