In [4]:
# load data
tips_data_path = 'C:/Users/Aditya/Documents/GitHub/neu/nlp/LDA_Explore/input/yelp_academic_dataset_tip.json'
reviews_data_path = 'C:/Users/Aditya/Documents/GitHub/neu/nlp/LDA_Explore/output/useful_reviews_4.json'
restaurant_data_path = 'C:/Users/Aditya/Documents/GitHub/neu/nlp/LDA_Explore/output/restaurants.json'
stemmed_restaurant_tips_data_path = 'output/stemmed_restaurant_tips.json'
stemmed_restaurant_reviews_data_path = 'output/stemmed_restaurant_reviews.json'
pos_tagged_restaurant_reviews_data_path = 'output/pos_tagged_restaurant_reviews.json'
pos_tagged_restaurant_tips_data_path = 'output/pos_tagged_restaurant_tips.json'

In [5]:
import json

import nltk
ps = nltk.PorterStemmer()
def get_stemmed_text(text):
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [ps.stem(x) for x in tokens]
    return ' '.join(stemmed_tokens)


def get_restaurant_ids(filename):
    restaurant_ids = []
    with open(filename,'r',encoding='utf-8') as f:
        for line in f:
            row = json.loads(line)
            business_id = row['business_id']
            restaurant_ids.append(business_id)
    
    return set(restaurant_ids)

def get_restaurants_tips_reviews_data(filename,restaurant_ids,keys):
    tip_review_data_list = []
    with open(filename,'r',encoding='utf-8') as f:
        i = 0
        for line in f:
            if i % 10000 == 0:
                print(i)
            i+=1
            row = json.loads(line)
            business_id = row['business_id']
            if business_id in restaurant_ids:                
                row['text'] = get_stemmed_text(row['text'])
                required_row = {key:row[key] for key in keys}
                tip_review_data_list.append(required_row)
    
    return tip_review_data_list

def write_stemmed_text_for_tips(output_file_path,tip_review_data_list):
    with open(output_file_path,'w',encoding='utf-8') as f:
        for tip_review_data in tip_review_data_list:
            f.write(json.dumps(tip_review_data) + '\n')

In [6]:
restaurant_ids = get_restaurant_ids(restaurant_data_path)
# business_tip_text = get_restaurants_tips_reviews_data(tips_data_path,restaurant_ids,['business_id','text'])
# write_stemmed_text_for_tips(stemmed_restaurant_tips_data_path,business_tip_text)
print('hello')

hello


In [7]:
#business_review_text = get_restaurants_tips_reviews_data(reviews_data_path,restaurant_ids,['business_id','review_id','text','stars'])
# write_stemmed_text_for_tips(stemmed_restaurant_reviews_data_path,business_review_text)

In [8]:
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

def extract_topics_using_lda(list_of_text,vectorizer,n_components,max_iter):
    doc_vectorizer = vectorizer(stop_words='english')
    t0 = time()
    doc_vector = doc_vectorizer.fit_transform(list_of_text)
    print("done in %0.3fs." % (time() - t0))
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=max_iter,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
    
    
    t0 = time()
    lda.fit(doc_vector)
    print("done in %0.3fs." % (time() - t0))
    print("\nTopics in LDA model:")
    feature_names = doc_vectorizer.get_feature_names()
    print_top_words(lda, feature_names, 10)
    return doc_vectorizer,lda

def extract_topics_using_nfm(list_of_text,vectorizer,n_components,max_iter):
    doc_vectorizer = vectorizer(stop_words='english')
    t0 = time()
    doc_vector = doc_vectorizer.fit_transform(list_of_text)
    print("done in %0.3fs." % (time() - t0))
    nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=max_iter, alpha=.1,
          l1_ratio=.5)
    nmf.fit(doc_vector)
    print("done in %0.3fs." % (time() - t0))
    print("\nTopics in NMF model:")
    feature_names = doc_vectorizer.get_feature_names()
    print_top_words(lda, feature_names, 10)
    return doc_vectorizer,lda

In [9]:
from collections import defaultdict
from nltk import pos_tag

def load_stemmed_user_reviews_by_business(filename):
    business_reviews = defaultdict(list)
    with open(filename,'r',encoding='utf-8') as f:
        for line in f:
            row = json.loads(line)
            business_id = row['business_id']
            business_reviews[business_id].append(row['text'])
    
    return business_reviews

def transform_review_text_pos_tags(input_filename, output_filename):
    business_reviews = []
    with open(input_filename, 'r', encoding='utf-8') as f:
        i = 0
        for line in f:
            if i % 100 == 0:
                print(i)

            i += 1
            row = json.loads(line)
            pos_tags = pos_tag(row['text'].split(' '))
            joined_pos_tags = [x[0] + '/' + x[1] for x in pos_tags]
            row['text'] = ' '.join(joined_pos_tags)
            business_reviews.append(row)

    with open(output_filename, 'w', encoding='utf-8') as f:
        for row in business_reviews:
            f.write(json.dumps(row) + '\n')

In [10]:
#transform_review_text_pos_tags(stemmed_restaurant_reviews_data_path,pos_tagged_restaurant_reviews_data_path)

In [11]:
business_reviews = load_stemmed_user_reviews_by_business(stemmed_restaurant_reviews_data_path)
reviews_for_one_business = business_reviews['YTbKmjGTdn4YzoJXTC1u7g']
doc_vectorizer,lda = extract_topics_using_lda(reviews_for_one_business,CountVectorizer,10,5)
review_doc_vector = doc_vectorizer.transform(reviews_for_one_business)
review_topic_vector = lda.transform(review_doc_vector)
print(review_topic_vector)

done in 0.016s.
done in 0.350s.

Topics in LDA model:
Topic #0: make time good did manag thi use like great flipsid
Topic #1: wa play card kid parti laser tag game room time
Topic #2: wa 50 think 12 food ball anoth spouse tool weekend
Topic #3: room attract son line wa teenag packag includ chao wife
Topic #4: time lane employe anoth told bowl minut refund wa thi
Topic #5: kid accord delici like longer confus highli child sell ok
Topic #6: wa kid game bowl thi laser place like lane time
Topic #7: adult wait tag lack thi experi laser ha child wa
Topic #8: bowl wa thi time good kid unlimit place old say
Topic #9: ve bring confront fun dure burger surfac prize care area

[[  3.57208869e-03   3.57208820e-03   3.57214001e-03   3.57209823e-03
    3.57216974e-03   3.57213634e-03   9.67850859e-01   3.57212745e-03
    3.57216619e-03   3.57212630e-03]
 [  1.02060850e-03   1.02057859e-03   1.02055187e-03   1.02054904e-03
    1.02059978e-03   1.02055303e-03   1.02072891e-03   1.02057446e-03
    9.9

In [14]:
business_reviews = load_stemmed_user_reviews_by_business(stemmed_restaurant_reviews_data_path)
reviews_for_one_business = business_reviews['YTbKmjGTdn4YzoJXTC1u7g']
doc_vectorizer,nfm = extract_topics_using_nfm(reviews_for_one_business,TfidfVectorizer,10,5)
review_doc_vector = doc_vectorizer.transform(reviews_for_one_business)
review_topic_vector = nfm.transform(review_doc_vector)
print(review_topic_vector)

done in 0.015s.
done in 0.029s.

Topics in NMF model:
Topic #0: make time good did manag thi use like great flipsid
Topic #1: wa play card kid parti laser tag game room time
Topic #2: wa 50 think 12 food ball anoth spouse tool weekend
Topic #3: room attract son line wa teenag packag includ chao wife
Topic #4: time lane employe anoth told bowl minut refund wa thi
Topic #5: kid accord delici like longer confus highli child sell ok
Topic #6: wa kid game bowl thi laser place like lane time
Topic #7: adult wait tag lack thi experi laser ha child wa
Topic #8: bowl wa thi time good kid unlimit place old say
Topic #9: ve bring confront fun dure burger surfac prize care area

[[ 0.01658713  0.01658715  0.01658751  0.01658726  0.01658763  0.01658745
   0.85071369  0.01658729  0.01658745  0.01658744]
 [ 0.01219573  0.01219542  0.0121952   0.01219512  0.01219566  0.0121952
   0.0121966   0.01219541  0.89024049  0.01219517]
 [ 0.0110339   0.01103378  0.01103355  0.01103348  0.90069453  0.01103351
 



In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def extract_topics_using_lda(list_of_text,vectorizer,no_of_topics):
    pass