<a href='https://ai.meng.duke.edu'> = <img align="left" style="padding-top:10px;" src=https://storage.googleapis.com/aipi_datasets/Duke-AIPI-Logo.png>

In [7]:
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import numpy as np
import pandas as pd

In [8]:
data = pd.read_pickle("data.pkl")
review_list = data.Review.values.tolist()
data.head()

Unnamed: 0,Review,Rating,Sentiment,cleaned_reviews
0,"unique, great stay, wonderful time hotel monac...",5,1.0,unique great stay wonderful time hotel monaco ...
1,"great stay great stay, went seahawk game aweso...",5,1.0,great stay great stay go seahawk game awesome ...
2,love monaco staff husband stayed hotel crazy w...,5,1.0,love monaco staff husband stay hotel crazy wee...
3,"cozy stay rainy city, husband spent 7 nights m...",5,1.0,cozy stay rainy city husband spend night monac...
4,"hotel stayed hotel monaco cruise, rooms genero...",5,1.0,hotel stay hotel monaco cruise room generous d...


## Find candidate topics (Nouns)

In [11]:
# Extract candidate 1-grams and 2-grams 
n_gram_range = (1, 2)
vectorizer = CountVectorizer(ngram_range=n_gram_range, stop_words=stopwords.words('english'))
vectorizer.fit(review_list)
candidates = vectorizer.get_feature_names_out()

# Get noun phrases and nouns from articles
nlp = spacy.load('en_core_web_sm')
all_nouns = set()
for doc in review_list:
    doc_processed = nlp(doc)
    # Add noun chunks
    all_nouns.add(chunk.text.strip().lower() for chunk in doc_processed.noun_chunks)
    # Add nouns
    for token in doc_processed:
            if token.pos_ == "NOUN":
                all_nouns.add(token.text)

# Filter candidate topics to only those in the nouns set
candidates = [c for c in candidates if c in all_nouns]

## Embed candidates and documents and find matching topics

In [12]:
def model_topics(documents, candidates, num_topics):
    #model = SentenceTransformer('all-MiniLM-L6-v2')
    model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    # Encode each of the reviews
    doc_embeddings = [model.encode([doc]) for doc in documents]
    # Encode the candidate topics
    candidate_embeddings = model.encode(candidates)

    # Calculate cosine similarity between each document and candidate topics
    # Take the top candidate topics as keywords for each document
    review_keywords = []
    for doc in doc_embeddings:
        scores = cosine_similarity(doc, candidate_embeddings)
        keywords = [candidates[index] for index in scores.argsort()[0][-num_topics:]]
        review_keywords.append(keywords)
    
    return review_keywords

In [13]:
topics = model_topics(data.Review.values.tolist(), candidates, num_topics=5)

data["Topic Keywords"] = topics

In [15]:
for i,keywords in enumerate(topics[:10]):
    print(review_list[i])
    print('Topic keywords: {}'.format(keywords))
    print("\n")

unique, great stay, wonderful time hotel monaco, location excellent short stroll main downtown shopping area, pet friendly room showed no signs animal hair smells, monaco suite sleeping area big striped curtains pulled closed nice touch felt cosy, goldfish named brandi enjoyed, did n't partake free wine coffee/tea service lobby thought great feature, great staff friendly, free wireless internet hotel worked suite 2 laptops, decor lovely eclectic mix pattens color palatte, animal print bathrobes feel like rock stars, nice did n't look like sterile chain hotel hotel personality excellent stay,  
Topic keywords: ['villas', 'hotelthe', 'luxurious', 'hotels', 'hotel']


great stay great stay, went seahawk game awesome, downfall view building did n't complain, room huge staff helpful, booked hotels website seahawk package, no charge parking got voucher taxi, problem taxi driver did n't want accept voucher barely spoke english, funny thing speak arabic called started making comments girlfrien

In [16]:
data.to_csv("topics_from_transformer_nouns.csv", index=False)