# Steps of LDA (Latent Dirichlet Allocation):
1. Choose the number of topics (k) you want to extract from the corpus.

2. Preprocess the reviews corpus by removing stop words and punctuation, and converting words to their root forms using stemming or
lemmatization.

3. Create a vocabulary list of all unique words in the corpus.

4. Convert each review in the corpus into a bag-of-words representation, where each word is represented by its index in the vocabulary list and
the count of that word in the review.

5. Initialize the model by randomly assigning each word in each review to one of the k topics.

6. For each review 'r' in the corpus, iterate through each word 'w' in the review and calculate the probability distribution over the k topics, given the current topic assignments of all other words in the review and the current topic-word distribution.

7. Sample a new topic assignment for word 'w' based on the probability distribution calculated in step 6.

8. Repeat steps 6 and 7 for all reviews in the corpus until convergence is achieved.

9. Output the topic-word distribution and document-topic distribution as the final result.

In [1]:
!pip install nltk



In [2]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Test\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.5.2-cp310-cp310-win_amd64.whl (12.2 MB)
     ---------------------------------------- 12.2/12.2 MB 3.9 MB/s eta 0:00:00
Collecting pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4
  Downloading pydantic-1.10.7-cp310-cp310-win_amd64.whl (2.1 MB)
     ---------------------------------------- 2.1/2.1 MB 5.2 MB/s eta 0:00:00
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.6-cp310-cp310-win_amd64.whl (480 kB)
     -------------------------------------- 480.9/480.9 kB 6.0 MB/s eta 0:00:00
Collecting thinc<8.2.0,>=8.1.8
  Downloading thinc-8.1.9-cp310-cp310-win_amd64.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 4.2 MB/s eta 0:00:00
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.9-cp310-cp310-win_amd64.whl (18 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.8-cp310-cp310-win_amd64.whl (94 kB)
     ---------------------------------------- 94.7/94.7 kB 5.6 MB/s eta 0:00:00
Collecting spacy-loggers<2.0.0

In [None]:
!python -m spacy download en

In [None]:
!pip install gensim==4.2.0

In [None]:
# Gensim
import gensim, spacy, logging, warnings



In [None]:
import gensim.corpora as corpra
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as pit

In [None]:
import re, numpy as np, pandas as pd
from pprint import pprint


# NLTK stop words
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english' )

In [None]:
import pandas as pd
df=pd.read_csv('McDonald.csv')
df

In [None]:
# Keep only the column holding the review text, still as a one-column DataFrame
# (presumably "wiI7pd" is a scraped field id -- verify against the CSV header).
df = df.loc[:, ["wiI7pd"]]

In [None]:
# Give the review column a readable name.
df = df.rename(columns={'wiI7pd': 'reviews'})

In [None]:
df.info()

In [None]:
def sent_to_words(sentences):
  """Lazily tokenize raw review texts.

  Each item is coerced to ``str``, has every run of whitespace collapsed to
  a single space and apostrophes removed, and is then tokenized with
  ``gensim.utils.simple_preprocess(..., deacc=True)``.

  Args:
    sentences: Iterable of review texts. Non-string items (for example a
      float NaN from an empty CSV cell) are converted with ``str()``, so a
      missing value is tokenized as its string form; drop such rows
      upstream if that is not wanted.

  Yields:
    The token list produced for each input item, in input order.
  """
  for sent in sentences:
    # Coerce first: re.sub raises TypeError when handed a non-string value.
    sent = str(sent)
    # Raw string so that "\s" is a regex class, not an invalid str escape.
    sent = re.sub(r'\s+', ' ', sent)
    sent = sent.replace("'", "")
    yield gensim.utils.simple_preprocess(sent, deacc=True)

In [None]:
# convert to list
All_reviews  = df.reviews.values.tolist()
reviews_words=list(sent_to_words(All_reviews))
print(reviews_words[:1])

In [None]:
!pip install spacy

In [None]:
# Phrase model trained directly on the tokenized reviews, plus its frozen
# (lookup-only) counterpart.
bigram = gensim.models.Phrases(reviews_words, min_count=5, threshold=10)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Second phrase model trained on the output of the first one, so that an
# already-merged pair can be merged with a neighbouring token.
trigram = gensim.models.Phrases(bigram[reviews_words], threshold=10)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
def process_words(texts, stop_words=stop_words, allowed_postages=["NOUN", 'ADJ', 'VERB', 'ADV']):
  """Remove stop words, merge phrases and lemmatize tokenized documents.

  Steps, in order: stop-word removal, bigram merging, trigram merging,
  spaCy lemmatization restricted to the allowed parts of speech, and a
  final stop-word pass over the lemmas.

  Args:
    texts: Iterable of documents, each a list of tokens.
    stop_words: Collection of words to drop. Defaults to the module-level
      ``stop_words`` list.
    allowed_postages: spaCy coarse part-of-speech tags (``token.pos_``);
      only tokens carrying one of these tags contribute a lemma.

  Returns:
    A list with one list of lemma tokens per input document.
  """
  # Sets give O(1) membership tests; the defaults are never mutated.
  stop_set = set(stop_words)
  allowed_pos = set(allowed_postages)

  def _drop_stop_words(docs):
    # simple_preprocess re-tokenizes the string form of each token list.
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set]
            for doc in docs]

  texts = _drop_stop_words(texts)
  texts = [bigram_mod[doc] for doc in texts]
  texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

  # Parser and NER are not needed for lemmas / POS tags, so skip them.
  nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
  texts_out = []
  for sent in texts:
    doc = nlp(" ".join(sent))
    # Honour allowed_postages: previously every lemma was kept and the
    # parameter had no effect.
    texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_pos])

  # Lemmatization can produce stop words again, so filter once more.
  return _drop_stop_words(texts_out)

In [None]:
data_final=process_words(reviews_words)

In [None]:
data_final[:3 ]

In [None]:
from gensim.corpora import Dictionary

# Vocabulary: maps every distinct token in the processed reviews to an id.
id2word = Dictionary(data_final)

# Bag-of-words corpus: one list of (token_id, count) pairs per review.
corpus = list(map(id2word.doc2bow, data_final))

# Train the LDA topic model on the bag-of-words corpus.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=7,          # number of latent topics to extract
    random_state=100,      # fixed seed
    update_every=1,
    chunksize=10,
    passes=10,
    alpha="symmetric",
    iterations=100,
    per_word_topics=True,
)

In [None]:
lda_model.print_topics( )

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

In [None]:
# Colour values of matplotlib's Tableau palette, in definition order.
cols = list(mcolors.TABLEAU_COLORS.values())

# NOTE: color_func looks up the global name `i` at call time, so the cloud
# must be rendered inside a loop that binds `i` to the topic index.
cloud = WordCloud(
    stopwords=stop_words,
    background_color="white",
    width=2500,
    height=1800,
    max_words=10,
    colormap="tab10",
    color_func=lambda *args, **kwargs: cols[i],
    prefer_horizontal=1.0,
)

# Unformatted topics: (topic_id, [(word, weight), ...]) tuples.
topics = lda_model.show_topics(formatted=False)

In [None]:
topics[0]

In [None]:
# One panel per topic: two columns and as many rows as needed (rounded up).
# A fixed 3x2 grid drew only six panels, so a seventh topic was never shown.
n_cols = 2
n_rows = max(1, (len(topics) + n_cols - 1) // n_cols)
# Keep the original 10x10 size for three rows and scale the height with rows.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 10 * n_rows / 3),
                         sharex=True, sharey=True)
panels = axes.flatten()

# The loop variable must be named `i`: the word cloud's color_func reads the
# global `i` to choose the colour for the topic currently being rendered.
for i, ax in enumerate(panels[:len(topics)]):
  topic_words = dict(topics[i][1])
  cloud.generate_from_frequencies(topic_words, max_font_size=300)
  ax.imshow(cloud)
  ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
  ax.axis('off')

# Blank out panels left over when the topic count does not fill the grid.
for ax in panels[len(topics):]:
  ax.axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
# Table of the 20 highest-weighted words per topic, one column per topic.
# The loop bound comes from the fitted model, so the table stays in sync if
# the number of topics is changed where the model is built.
word_dict = {}
for topic_id in range(lda_model.num_topics):
  top_words = lda_model.show_topic(topic_id, topn=20)  # [(word, weight), ...]
  word_dict['Topic # ' + '{:02d}'.format(topic_id)] = [word for word, _ in top_words]

pd.DataFrame(word_dict)