In [1]:
# load data
data_dir_path = '/Users/sahilgandhi/Datasets/6120-project/data'
yelp_data_path = '/Users/sahilgandhi/Datasets/yelp_dataset'

tips_data_path = yelp_data_path + '/yelp_academic_dataset_tip.json'

#reviews_data_path = 'C:/Users/Aditya/Documents/GitHub/neu/nlp/LDA_Explore/output/useful_reviews_4.json'

restaurant_data_path = data_dir_path + '/restaurants.json'

stemmed_restaurant_tips_data_path = data_dir_path + '/stemmed_restaurant_tips.json'
stemmed_restaurant_reviews_data_path = data_dir_path + '/stemmed_restaurant_reviews.json'
pos_tagged_restaurant_reviews_data_path = data_dir_path + '/pos_tagged_restaurant_reviews.json'
pos_tagged_restaurant_tips_data_path = data_dir_path + '/pos_tagged_restaurant_tips.json'

# Load Data

In [137]:
import json
import collections
import nltk
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingRegressor

from sklearn import preprocessing

import sklearn

def load_reviews_by_predicate(filename, predicate_per_row):
    X = []
    Y = []
    with open(filename,'r',encoding='utf-8') as f:
        for line in f:
            row = json.loads(line)
            if predicate_per_row(row):
                X.append(row['text'])
                Y.append(str(int(row['stars'])))
    return X, Y

In [138]:
from time import time

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
    
def predict(model, doc_vector):
    return model.transform(doc_vector)

def extract_topics(list_of_docs, vectorizer, vectorizer_params, clf, clf_params):
    doc_vectorizer = vectorizer(**vectorizer_params)
    t0 = time()
    doc_vector = doc_vectorizer.fit_transform(list_of_docs)
    print("vectorizing done in %0.3fs." % (time() - t0))
    topic_model = clf(**clf_params)
    
    
    t0 = time()
    topic_model.fit(doc_vector)
    print("topic modelling done in %0.3fs." % (time() - t0))
#     print(f"\nTopics in {str(clf)} model:")
#     feature_names = doc_vectorizer.get_feature_names()
#     print_top_words(topic_model, feature_names, 10)
    return doc_vectorizer, topic_model

def sentiment_pos_distribution(list_of_docs):
    """ Look at only adjectives and noun POS-word pairs for the document
        Ref:https://link.springer.com/content/pdf/10.1007/978-3-642-22606-9_33.pdf
        Ref:https://www.researchgate.net/publication/272863616_Sentiment_Analysis_of_Movie_Reviews_using_POS_tags_and_Term_Frequencies
    """
    t0 = time()
    word_pairs = set()
    word_pair_features = [defaultdict(lambda: 0)] * len(list_of_docs)
    for idx, doc in enumerate(list_of_docs):
        for sent in nltk.sent_tokenize(doc):
            if sent.strip().count(' ') <= 3: continue  # make sure sent has 3 words atleast
            tagged_sent = nltk.word_tokenize(sent)
            postags = nltk.pos_tag(tagged_sent)
            for word_tag_pairs in nltk.ngrams(postags, 3):
                wtp1, wtp2, wtp3 = word_tag_pairs
                if ((wtp1[1] == 'JJ' and (wtp2[1] == 'NN' or wtp2[1] == 'NNS')) or
                    ((wtp1[1] == 'RB' or wtp1[1] == 'RBR' or wtp1[1] == 'RBS') and (wtp2[1] == 'JJ') and (wtp3[1] != 'NN' or wtp3[1] != 'NNS')) or
                    ((wtp1[1] == 'JJ') and (wtp2[1] == 'JJ') and (wtp3[1] != 'NN' or wtp3[1] != 'NNS')) or
                    ((wtp1[1] == 'NN' or wtp1[1] == 'NNS') and (wtp2[1] == 'JJ') and (wtp3[1] != 'NN' or wtp3[1] != 'NNS')) or
                    ((wtp1[1] == 'RB' or wtp1[1] == 'RBR' or wtp1[1] == 'RBS') and (wtp2[1] == 'VB' or wtp2[1] == 'VBD' or wtp2[1] == 'VBN' or wtp2[1] == 'VBG'))):

                    pair_key = f"{wtp1[0]}-{wtp2[0]}"
                    word_pairs.add(pair_key)
                    word_pair_features[idx][pair_key] += 1

    print("sent features done in %0.3fs." % (time() - t0))
    return (word_pair_features, list(word_pairs))

def generate_pos_distribution_vector(idx, word_pair_features, word_pairs):
    x = []
    for key in word_pairs:
        x.append(word_pair_features[idx][key])
    return np.array(x)

def topic_features_for_docs(list_of_docs, vectorizer, vectorizer_params, clf, clf_params):
    doc_vectorizer, topic_model = extract_topics(list_of_docs, vectorizer, vectorizer_params, clf, clf_params)
    X = predict(topic_model, predict(doc_vectorizer, list_of_docs))
    return preprocessing.scale(X)

def topic_and_sent_features_for_docs(list_of_docs, vectorizer, vectorizer_params, clf, clf_params):
    doc_vectorizer, topic_model = extract_topics(list_of_docs, vectorizer, vectorizer_params, clf, clf_params)
    word_pair_features, word_pairs = sentiment_pos_distribution(list_of_docs)
    X = []
    for idx, doc in enumerate(list_of_docs):
        topic_vector = predict(topic_model, predict(doc_vectorizer, [doc]))
        sent_vector = generate_pos_distribution_vector(idx, word_pair_features, word_pairs)
        X.append(np.concatenate((topic_vector.ravel(), sent_vector)))
    return preprocessing.scale(np.array(X))

In [139]:
%%time
docs, stars = load_reviews_by_predicate(stemmed_restaurant_reviews_data_path, lambda r: r['business_id'] in ['faPVqws-x-5k2CQKDNtHxw', 'DkYS3arLOhA8si5uUEmHOw', 'fL-b760btOaGa85OJ9ut3w', 'K7lWdNUhCbcnEvI0NhGewg', '5shgJB7a-2_gdnzc0gsOtg', 'ujHiaprwCQ5ewziu0Vi9rw', 'XXW_OFaYQkkGOGniujZFHg'])
len(docs), len(stars)

CPU times: user 2.7 s, sys: 201 ms, total: 2.9 s
Wall time: 2.94 s


In [142]:
no_of_topics = 50

X_topic_nmf = topic_and_sent_features_for_docs(
                docs, CountVectorizer, {'stop_words': 'english'},
                NMF, {'n_components': no_of_topics})

X_topic_lda = topic_and_sent_features_for_docs(
                docs, CountVectorizer, {'stop_words': 'english'},
                LatentDirichletAllocation, {'n_components': no_of_topics,
                                        'learning_method': 'online',
                                        'learning_offset': 50.,
                                        'random_state': 0})

print(X_topic_nmf.shape, len(stars))
print(X_topic_lda.shape, len(stars))

vectorizing done in 0.240s.
topic modelling done in 7.345s.
sent features done in 21.555s.
vectorizing done in 0.252s.
topic modelling done in 12.569s.
sent features done in 22.219s.
(2696, 16312) 2696
(2696, 16312) 2696


In [None]:
def score(clf, X_topic, stars, cv):
    print(f"{clf[1]}" + "***"*20)
    scores = cross_val_score(clf[0], X_topic, stars, cv=5, scoring='accuracy')
    print(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print()


clfs = [
#     (sklearn.svm.SVR(),                     'svr   '),
    (sklearn.svm.SVC(kernel='rbf'), 'smv rbf'),
    (sklearn.svm.SVC(kernel='poly'), 'smv poly'),
    (sklearn.linear_model.LogisticRegression(solver='lbfgs', multi_class='multinomial'), 'lg reg'),
#     (sklearn.linear_model.LinearRegression(),            'lr    '),
#     (sklearn.ensemble.AdaBoostRegressor(),  'en adr'),
#     (sklearn.ensemble.BaggingRegressor(),   'en br ')
    ]

for clf in clfs:
    score(clf, X_topic_nmf, stars, 5)

print('--'*30)
for clf in clfs:
    score(clf, X_topic_lda, stars, 5)

smv rbf************************************************************


