In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the review export from disk.
df = pd.read_csv('UA_reviews.csv')
# Keep only the columns whose label does not start with "Unnamed".
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed]
df

In [None]:
import re 
def review_cleaner(reviews):
    """Normalise one raw review string for downstream text processing.

    Steps, in order:
      1. Delete every character that is neither a word character nor
         whitespace. This single pass removes punctuation, the check-mark
         and cross emojis, and the ``|`` separator.
      2. Delete the phrases ``Trip Verified``, ``Not Verified`` (together
         with any whitespace that follows it) and ``Verified Review``.
         Matching is case-sensitive, so it happens before lowercasing.
      3. Lowercase the result.

    Leading/trailing whitespace left behind by the removals is not trimmed.

    Args:
        reviews: Raw review text. Any non-string value (for example the
            NaN pandas produces for a missing CSV cell) is treated as an
            empty review instead of raising inside ``re.sub``.

    Returns:
        The cleaned, lowercased string; ``''`` for non-string input.
    """
    if not isinstance(reviews, str):
        return ''
    # Raw-string patterns: '\s' inside a normal string literal is an
    # invalid escape sequence and triggers a warning on current Python.
    text = re.sub(r'[^\w\s]', '', reviews)
    text = re.sub(r'Trip Verified', '', text)
    text = re.sub(r'Not Verified\s*', '', text)
    text = re.sub(r'Verified Review', '', text)
    return text.lower()

# Run the cleaner over every raw review and keep the result in its own column.
df['clean_reviews'] = df['reviews'].map(review_cleaner)
df

In [None]:
## Removing Stopwords
import nltk 
# Make sure the NLTK stop-word corpus is available, then load the English list.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Membership is tested once per word of every review; a set makes each
# test constant-time. The `stopwords` list itself is left unchanged.
stopword_set = set(stopwords)

# Drop every whitespace-separated word whose lowercase form is a stop word
# and re-join the remaining words with single spaces.
df['reviews_wo_stopwords'] = df['clean_reviews'].apply(
    lambda x: ' '.join(w for w in x.split() if w.lower() not in stopword_set)
)
df

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# The notebook only downloads the 'stopwords' corpus, but WordNetLemmatizer
# needs 'wordnet' and word_tokenize needs the Punkt tokenizer models;
# without them a fresh environment raises LookupError. Newer NLTK releases
# look for 'punkt_tab', older ones for 'punkt', so both are requested.
for _resource in ('wordnet', 'punkt', 'punkt_tab'):
    nltk.download(_resource)

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize ``text`` with ``word_tokenize``, lemmatize each token with
    the module-level ``lemmatizer``, and return the tokens joined by single
    spaces."""
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

# Lemmatize the stop-word-free text of every review.
df['reviews_lemmatized'] = df['reviews_wo_stopwords'].map(lemmatize_text)
df

In [None]:
import gensim
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# Split every lemmatized review on whitespace into a list of tokens.
reviews_lemmatized = [review.split() for review in df['reviews_lemmatized'].tolist()]

def preprocess_text(text):
    """Return the token list that ``gensim.utils.simple_preprocess``
    produces for ``text`` (called with its default settings)."""
    tokens = gensim.utils.simple_preprocess(text)
    return tokens

# Tokenize the stop-word-free, lemmatized reviews. Using 'clean_reviews'
# here would bypass the stop-word removal and lemmatization done earlier.
processed_data = df['reviews_lemmatized'].apply(preprocess_text)

# Create dictionary and corpus
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(doc) for doc in processed_data]

# Build LDA model; random_state is fixed so re-running the notebook
# reproduces the same topics.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=10,
    chunksize=500,
    random_state=42,
)

# Print top topics and top words in each topic
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}\n')

# Assign topics to documents
for i, doc in enumerate(corpus):
    topics = lda_model.get_document_topics(doc)
    print(f"Document {i}: {topics}")

In [None]:
import pandas as pd
import gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary

# Preprocess text data
processed_data = df['reviews_lemmatized'].apply(preprocess_text)

# Build the dictionary and bag-of-words corpus from these tokens *before*
# training. The model must be trained with the same dictionary that is later
# used for doc2bow and coherence scoring, otherwise the token ids passed to
# the model refer to the wrong words.
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(doc) for doc in processed_data]

# Train the LDA model on that corpus; random_state is fixed so re-running
# the notebook reproduces the same topics.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=10,
    chunksize=500,
    random_state=42,
)

def get_top_topics(doc):
    """Return at most five topic entries for a tokenized document.

    ``doc`` is converted to bag-of-words with the module-level
    ``dictionary`` and scored by the module-level ``lda_model``. The
    entries from ``get_document_topics`` are ordered by their second
    element, largest first.
    """
    doc_topics = lda_model.get_document_topics(dictionary.doc2bow(doc))
    ranked = sorted(doc_topics, key=lambda entry: entry[1], reverse=True)
    return ranked[:5]

# Rank the topics of every tokenized review and store them in a new column.
df['top_topics'] = processed_data.map(get_top_topics)

def get_top_words(topics):
    """For each entry in ``topics``, look up
    ``lda_model.show_topic(entry[0], topn=5)`` and return the results as a
    list in the same order."""
    words_per_topic = []
    for entry in topics:
        words_per_topic.append(lda_model.show_topic(entry[0], topn=5))
    return words_per_topic

# Attach the top words of each ranked topic to every review.
df['top_words'] = df['top_topics'].map(get_top_words)

# Print the lemmatized text alongside its topics and their top words.
summary_columns = ['reviews_lemmatized', 'top_topics', 'top_words']
print(df[summary_columns])


In [None]:
# NOTE: this cell used to start with the bare expression `topics - 10`.
# `topics` holds the topic list of the last document from the earlier
# per-document loop, so the subtraction raised an error and aborted the cell.
# It looked like a stray note, so it is kept only as this comment.
import gensim
from gensim.models import CoherenceModel

# Coherence score ('c_v' measure) of the current lda_model, computed from
# the tokenized reviews in processed_data and the current dictionary.
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')