<a href='https://ai.meng.duke.edu'><img align="left" style="padding-top:10px;" src="https://storage.googleapis.com/aipi_datasets/Duke-AIPI-Logo.png" alt="Duke AIPI logo"></a>

# Topic Modeling using Defined Topics

In [15]:
import requests
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import numpy as np
import pandas as pd

## Embed topics and documents and find closest matching topics

In [16]:
def model_topics(documents, candidates, num_topics=1, model_name='all-MiniLM-L6-v2'):
    """Label each document with its closest predefined topics.

    Documents and candidate topic labels are embedded with a
    SentenceTransformer model, and every document is compared with every
    candidate by cosine similarity.

    Parameters
    ----------
    documents : sequence of str
        Texts to label.
    candidates : sequence of str
        Candidate topic labels to choose from.
    num_topics : int, default 1
        Number of topics to keep per document. Must be at least 1; a value
        larger than ``len(candidates)`` simply returns every candidate.
    model_name : str, default 'all-MiniLM-L6-v2'
        Name of the SentenceTransformer model used for the embeddings.

    Returns
    -------
    topics : list of list
        For each document, its ``num_topics`` best-matching candidates,
        ordered from least to most similar (the best match is last).
    all_scores : list of array
        For each document, a ``(1, len(candidates))`` array holding the
        cosine similarity to every candidate, in ``candidates`` order.

    Raises
    ------
    ValueError
        If ``num_topics`` is smaller than 1, or if there are documents to
        label but ``candidates`` is empty.
    """
    if num_topics < 1:
        # A slice of [-0:] would silently return *every* candidate, and a
        # negative count would drop the weakest matches instead of keeping
        # the strongest ones.
        raise ValueError('num_topics must be at least 1, got {}'.format(num_topics))

    documents = list(documents)
    candidates = list(candidates)
    if not documents:
        # Nothing to label; skip loading the model altogether.
        return [], []
    if not candidates:
        raise ValueError('candidates must contain at least one topic')

    model = SentenceTransformer(model_name)

    # Encode everything in batched calls rather than one document at a time.
    doc_embeddings = model.encode(documents)
    candidate_embeddings = model.encode(candidates)

    # Row i holds the similarity of document i to every candidate topic.
    similarity = cosine_similarity(doc_embeddings, candidate_embeddings)

    # argsort is ascending, so the last `num_topics` columns are the best matches.
    top_indices = similarity.argsort(axis=1)[:, -num_topics:]
    topics = [[candidates[index] for index in row] for row in top_indices]

    # Keep the per-document (1, n_candidates) shape that callers print.
    all_scores = [similarity[i:i + 1] for i in range(len(documents))]

    return topics, all_scores

In [17]:
# Load the preprocessed review DataFrame; the preview below shows it carries a
# `cleaned_reviews` text column next to the raw review, rating and sentiment.
# NOTE(review): unpickling can execute arbitrary code, so "data.pkl" should
# only be loaded when it comes from a trusted source.
data = pd.read_pickle("data.pkl")
data.head()

Unnamed: 0,Review,Rating,Sentiment,cleaned_reviews
0,"unique, great stay, wonderful time hotel monac...",5,1.0,unique great stay wonderful time hotel monaco ...
1,"great stay great stay, went seahawk game aweso...",5,1.0,great stay great stay go seahawk game awesome ...
2,love monaco staff husband stayed hotel crazy w...,5,1.0,love monaco staff husband stay hotel crazy wee...
3,"cozy stay rainy city, husband spent 7 nights m...",5,1.0,cozy stay rainy city husband spend night monac...
4,"hotel stayed hotel monaco cruise, rooms genero...",5,1.0,hotel stay hotel monaco cruise room generous d...


In [18]:
# Predefined candidate topics that every review is matched against.
topic_list = [
    'Location',
    'Cleanliness',
    'Service',
    'Food',
    'Value',
    'Restaurant',
    'Room',
    'Friendly staff',
    'Room service',
    'Walking distance',
]

In [19]:
# Match every cleaned review against the candidate topics, keeping the three
# closest topics per review together with the full similarity scores.
topics, all_scores = model_topics(
    documents=data["cleaned_reviews"].values.tolist(),
    candidates=topic_list,
    num_topics=3,
)

In [20]:
# Preview the first ten reviews with the candidate topics, their similarity
# scores and the topics finally selected for each review.
reviews = data.cleaned_reviews.values.tolist()
preview_count = len(topics[:10])
for position in range(preview_count):
    review_text = reviews[position]
    print('Review {}:\n {}'.format(position, review_text), end='\n\n')
    print('Topics: {}'.format(topic_list))
    print('Topic Scores: {}'.format(all_scores[position]))
    print('Final Topic: {}'.format(topics[position]), end='\n\n')

Review 0:
 unique great stay wonderful time hotel monaco location excellent short stroll main downtown shopping area pet friendly room show sign animal hair smell monaco suite sleep area big striped curtain pull closed nice touch feel cosy goldfish name brandi enjoyed n partake free wine coffee tea service lobby think great feature great staff friendly free wireless internet hotel work suite laptop decor lovely eclectic mix patten color palatte animal print bathrobe feel like rock star nice n look like sterile chain hotel hotel personality excellent stay

Topics: ['Location', 'Cleanliness', 'Service', 'Food', 'Value', 'Restaurant', 'Room', 'Friendly staff', 'Room service', 'Walking distance']
Topic Scores: [[0.25144234 0.26877913 0.27726492 0.18494865 0.14613524 0.41625085
  0.3921326  0.25681615 0.44320637 0.03223705]]
Final Topic: ['Room', 'Restaurant', 'Room service']

Review 1:
 great stay great stay go seahawk game awesome downfall view building n complain room huge staff helpful 

In [21]:
# Attach each review's matched topics as a new "Topics" column (one list of
# topic labels per row). Note that this mutates `data` in place.
data["Topics"] = topics

In [22]:
# Persist the labelled reviews to CSV without writing the DataFrame index.
# NOTE(review): the "Topics" cells hold Python lists, which CSV can only store
# as text — presumably they need parsing (e.g. ast.literal_eval) when the file
# is read back; confirm against downstream usage.
data.to_csv("topic_modeling_predefined_topics.csv", index=False)