In [None]:
# Importing libraries

import nltk
import json
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models.phrases import Phrases, Phraser
import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import warnings

# Fetch every NLTK resource the notebook uses so it runs on a fresh environment:
# the stopword list, WordNet (for the lemmatizer) and the tokenizer models
# that nltk.word_tokenize needs.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')      # tokenizer models used by nltk.word_tokenize
nltk.download('punkt_tab')  # newer NLTK releases look this package up instead of 'punkt'
# Silence DeprecationWarning noise in the notebook output.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Domain-specific stopwords used in addition to NLTK's English list.
# NOTE: the multi-word entries ('bin dump', 'definitely use', 'highly recommend')
# are kept for backward compatibility, but they can never match a single token
# when used for per-token filtering.
custom_stopwords = {
    'around', 'drop', 'absolutely', 'dropped', 'container', 'recommended',
    'came', 'everything', 'say', 'phone', 'home', 'issue', 'fantastic',
    'next', 'dumpsters', 'bins', 'project', 'call', 'called', 'pick',
    'picked', 'definitely', 'guy', 'would', 'business', 'bin dump',
    'dumpster',
    # These two were previously written without a separating comma, which
    # silently merged them into the single string 'companythank'.
    'company', 'thank',
    'bin', 'get', 'got', 'use', 'great', 'always', 'definitely use', 'also',
    'need', 'one', 'needed', 'work', 'nice', 'u', 'said', 'well', 'amazing',
    'awesome', 'highly recommend', 'excellent', 'using', 'highly',
    'companies', 'dump', 'able', 'really', 'thanks', 'day', 'best',
    'recommend', 'used', 'put', 'rental', 'could', 'happy', 'even', 'told',
    'way', 'good',
}

# Full stopword set: NLTK's English list merged with the custom additions
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords) | custom_stopwords

In [3]:
# Initialize a lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(document):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    The text is lowercased and tokenized with ``nltk.word_tokenize``. Tokens
    that are not purely alphanumeric or that appear in ``stop_words`` are
    dropped, and the survivors are lemmatized with ``lemmatizer``.

    Args:
        document: The raw text of a single document (a ``str``).

    Returns:
        A list of lemmatized tokens in their original order.
    """
    tokens = []
    for word in nltk.word_tokenize(document.lower()):
        if not word.isalnum() or word in stop_words:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Re-check after lemmatization: an inflected form can pass the first
        # check and then reduce to a lemma that is itself a stopword, which
        # would otherwise leak stopwords into the model input.
        if lemma in stop_words:
            continue
        tokens.append(lemma)
    return tokens

In [4]:
# Location of the JSON export to analyse
json_file_path = r"C:\Users\calvi\customer-reviews.json"

# Parse the whole file into memory
with open(json_file_path, 'r', encoding='utf8') as file:
    data = json.load(file)

# The raw documents are stored under the 'texts' key
documents = data['texts']

# Tokenize/clean each document; entries that are not strings are skipped
texts = list(map(preprocess_text, (doc for doc in documents if isinstance(doc, str))))

In [5]:
# Learn bigram phrases from the token lists (a higher threshold yields fewer phrases)
bigram = Phrases(texts, min_count=3, threshold=5)
bigram_mod = Phraser(bigram)

# Learn trigram phrases on top of the bigram-transformed corpus.
# min_count is not passed here, so the library default applies.
trigram = Phrases(bigram[texts], threshold=5)
trigram_mod = Phraser(trigram)

# Rewrite every document with the detected bigrams and trigrams merged in.
# NOTE: this rebinds `texts`, so re-running this cell without first re-running
# the preprocessing cell feeds already-merged tokens back into phrase detection.
texts = [trigram_mod[bigram_mod[tokens]] for tokens in texts]

In [9]:
# Map each unique token to an integer id
dictionary = corpora.Dictionary(texts)

# Bag-of-words representation of every document, built with that mapping
corpus = list(map(dictionary.doc2bow, texts))

# LDA training parameters
num_topics = 5   # how many topics the model should extract
passes = 10      # how many full passes over the corpus during training

In [None]:
# Train the LDA model.
# A fixed random_state seeds the model's random initialisation so that a
# Restart & Run All does not start from a different random state each time.
# NOTE(review): with several worker processes the result may still vary
# slightly between runs; confirm whether exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=passes,
    random_state=42,
)

# Display the topics, showing the 4 highest-weighted words of each
topics = lda_model.print_topics(num_words=4)
for topic in topics:
    print(topic)

In [11]:
# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

# Build the visualisation data; sort_topics=False is passed through so the
# topic order is left as the model produced it
lda_display = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display)