In [None]:
!pip install nltk
import pandas as pd
import re
from nltk.corpus import stopwords

# Load the CSV file
# NOTE(review): the path is blank here — presumably a placeholder to be filled in with the dataset location.
df = pd.read_csv(" ", sep=",")

# Extract the 'content' column
a = df["content"]

# Cleaning the text. The steps run in order; each one operates on the output of the previous one.
a = a.str.replace(r'[^\x01-\x7F]', '', regex=True)  # Remove NUL and non-ASCII characters
a = a.str.replace(r'http\S+\s*', '', regex=True)     # Remove URLs (and the whitespace that follows them)
a = a.str.replace(r'\bRT\b', '', regex=True)         # Remove 'RT'
a = a.str.replace(r'#', '', regex=True)              # Remove the '#' sign (the hashtag word itself is kept)
a = a.str.replace(r'@\S+', '', regex=True)           # Remove mentions
# Control characters include '\n' and '\t'. Deleting them outright would glue the
# neighbouring words together ("first\nsecond" -> "firstsecond"), so they are
# replaced by a space instead.
a = a.str.replace(r'[\x00-\x1F\x7F]', ' ', regex=True)
a = a.str.replace(r'\d', '', regex=True)             # Remove digits
a = a.str.replace(r'[^\w\s]', '', regex=True)        # Remove punctuation
a = a.str.strip()                                    # Remove leading and trailing whitespace

# Convert to lowercase
a = a.str.lower()

# Define specific stopwords
# NOTE: matching below is done per whitespace-separated token, so the multi-word
# entry "covid-19 vaccine" can never match a token; it is kept unchanged here.
custom_stopwords = ["covid-19 vaccine", "covid", "vaccine", "vaccines", "coronavirus", "covidvaccine"]

# Remove specific stopwords
def remove_custom_stopwords(text):
    """Return ``text`` with every token listed in ``custom_stopwords`` removed.

    The text is split on whitespace, tokens found in ``custom_stopwords`` are
    dropped, and the remaining tokens are re-joined with single spaces.

    Parameters
    ----------
    text : str
        The text to filter. Non-string values (for example the float NaN that
        pandas uses for missing cells, or None) are treated as empty text.

    Returns
    -------
    str
        The filtered text; an empty string when nothing remains or when
        ``text`` is not a string.
    """
    # Missing cells arrive as float NaN, which has no .split(); treat them as empty.
    if not isinstance(text, str):
        return ""
    return ' '.join(word for word in text.split() if word not in custom_stopwords)

# Store the stopword-filtered tweets in a new 'cleaned_content' column of df
# (the cleaned Series `a` itself is left without the stopword filter applied).
df['cleaned_content'] = a.map(remove_custom_stopwords)

In [None]:
!pip install gensim
import gensim

# Additional preprocessing of the cleaned tweets
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim's ``simple_preprocess``.

    Every item is converted with ``str()`` first and tokenized with
    ``deacc=True``. This is a generator: one token list is yielded per input
    item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
        
# Pull the cleaned tweets out as a plain list and tokenize each one
data = df["cleaned_content"].values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]

In [None]:
# Set up the NLTK resources and the spaCy pipeline used for lemmatization.
# Note: `stop_words` is loaded here, but the stopword-removal helper below is
# commented out, so no stopwords are actually removed in this cell.
import nltk
import spacy
from nltk.corpus import stopwords

# Download the NLTK corpora that are read just below
nltk.download('words')
nltk.download('stopwords')

words = set(nltk.corpus.words.words())
stop_words = stopwords.words('english')

#def remove_stopwords(texts):
 #   return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# Small English spaCy model; used by lemmatization() for POS tags and lemmas
nlp = spacy.load('en_core_web_sm')
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document (an iterable of token strings) is re-joined with single
    spaces and run through the module-level spaCy pipeline ``nlp``. A token is
    kept when its coarse POS tag is in ``allowed_postags`` and its text is
    longer than 3 characters; the kept tokens are returned as lemmas.

    See https://spacy.io/api/annotation for the tag set.

    Parameters
    ----------
    texts : iterable of iterable of str
        Tokenized documents.
    allowed_postags : collection of str, optional
        POS tags (``token.pos_`` values) to keep. The default is an immutable
        tuple so the shared default can never be modified by accident.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order. A document
        with no surviving tokens yields an empty list.
    """
    # nlp.pipe processes the documents as a batched stream, which avoids the
    # per-call overhead of invoking nlp() once per tweet.
    joined_docs = (" ".join(sent) for sent in texts)
    texts_out = []
    for doc in nlp.pipe(joined_docs):
        texts_out.append(
            [token.lemma_ for token in doc if token.pos_ in allowed_postags and len(token) > 3]
        )
    return texts_out
# Disabled variant that would strip stopwords before lemmatizing:
#data_words_nostops = remove_stopwords(data_words)
#data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Active path: lemmatize the tokenized tweets directly
data_lemmatized = lemmatization(
    data_words,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

# Sanity check: show the first lemmatized tweet
print(data_lemmatized[:1])

In [None]:
# Merge each inner list of lemmas into a single space-separated string
# (any literal commas in the result are removed)
cleaned_data = [" ".join(item).replace(",", "") for item in data_lemmatized]

# Store the lemmatized strings as a new column, then drop tweets that ended up empty.
# .copy() makes the filtered result an independent DataFrame, so adding columns
# to it later does not raise SettingWithCopyWarning.
# Note: the surviving rows keep their original index labels (the index is not reset).
df["lemmatized_text"] = cleaned_data
df = df[df["lemmatized_text"] != ""].copy()

In [None]:
# Parse the raw 'date' column into datetimes, stored as 'date_e'
df["date_e"] = pd.to_datetime(df["date"])

# Derive both month representations from the parsed dates:
# 'month' is the month number, 'month_name' the full month name ("%B")
date_parts = df["date_e"].dt
df["month"] = date_parts.month
df["month_name"] = date_parts.strftime("%B")

# Display the updated DataFrame
print(df.head())

In [None]:
from gensim import corpora, models

# Re-tokenize the lemmatized text of the rows that survived filtering
data = df["lemmatized_text"].values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]

# Create TF-IDF representation of the tweets:
# token <-> id dictionary, then bag-of-words vectors, then TF-IDF weighting
dictionary = corpora.Dictionary(data_words)
bow_corpus = list(map(dictionary.doc2bow, data_words))
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [None]:
import matplotlib.pyplot as plt
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#Generate Coherence plot to find the optimal number of topics
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, random_state=None):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Parameters
    ----------
    dictionary : gensim dictionary
        Passed to the LDA model as ``id2word`` and to the coherence model.
    corpus : iterable
        The corpus each LDA model is trained on.
    texts : list of list of str
        Tokenized documents used by the 'c_v' coherence computation.
    limit : int
        Exclusive upper bound of the topic counts to try.
    start : int, optional
        First topic count to try.
    step : int, optional
        Increment between consecutive topic counts.
    random_state : optional
        Forwarded to ``LdaMulticore``; set it to make the sweep repeatable.
        The default (None) leaves the model's seeding unchanged.

    Returns
    -------
    (model_list, coherence_values) : tuple of two lists
        The trained models and their coherence scores, both in the order of
        ``range(start, limit, step)``.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train on the `corpus` argument and score against the `texts` argument,
        # rather than on whatever notebook globals happen to exist.
        model = gensim.models.LdaMulticore(
            corpus, num_topics=num_topics, id2word=dictionary, random_state=random_state
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

# Topic counts to try. Defined once, before the sweep, so the models that are
# trained and the x-axis of the plot are guaranteed to use the same values.
start = 2
limit = 40
step = 6

model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary,
    corpus=corpus_tfidf,
    texts=data_words,
    start=start,
    limit=limit,
    step=step,
)

x = range(start, limit, step)
# Label the line itself: passing the bare string "coherence_values" to
# plt.legend() is treated as a sequence of characters and labels the line "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [None]:
from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary, bleicorpus
import numpy
from gensim.matutils import hellinger
import warnings

# time_slice only gives the number of documents per slice, so the corpus is
# expected to already be in chronological order. Warn if the tweets are not.
if not df["date_e"].is_monotonic_increasing:
    warnings.warn(
        "df is not sorted by 'date_e'; LdaSeqModel time slices assume the "
        "documents are in chronological order."
    )

# Count tweets per calendar month in chronological order. Sorting the counts by
# month *name* would be alphabetical (December, February, January), which swaps
# the January and February counts and puts the slices out of order.
month_counts = df["date_e"].dt.to_period("M").value_counts().sort_index()
dec, jan, feb = (int(count) for count in month_counts.iloc[:3])
# Create time slices based on the number of tweets in each month. Replace the values with the number of records for each time slice in your dataset
time_slice = [dec, jan, feb]

# Train the LDA model with the optimal number of topics. Replace 'num_topics=8' with the number of optimal topics identified in your dataset
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus_tfidf, id2word=dictionary, time_slice=time_slice, num_topics=8, chain_variance=1)

In [None]:
# Print the first topic (topic=0) across all time slices.
# Replace 'topic=0' with any other topic number to print that topic instead.
ldaseq.print_topic_times(topic=0)

In [None]:
# Print all topics for the first time slice (time=0).
# Replace 'time=0' with any other time slice number to print the topics for that slice.
ldaseq.print_topics(time=0)

In [None]:
from gensim.models import LdaSeqModel

# Persist the trained model to disk under this file name
model_path = 'ldaseq'
ldaseq.save(model_path)

# Read the saved model back in; on later runs this load step can be used
# on its own instead of retraining
dtm_model = LdaSeqModel.load(model_path)

# Evolution of topic 0 across the time slices, shown as the cell output
topics_dtm = dtm_model.print_topic_times(topic=0)
topics_dtm

In [None]:
# Identify topic number for each tweet: for every document position in the
# corpus, take the index of the largest value returned by doc_topics()
# (the first such index when there is a tie).
num_rows = len(bow_corpus)
topic_weights = (dtm_model.doc_topics(j).tolist() for j in range(num_rows))
max_index = [weights.index(max(weights)) for weights in topic_weights]

# One row per document, in corpus order
df3 = pd.DataFrame(max_index, columns=["topic_number"])

In [None]:
# Merge df and df3 by row position.
# df still carries the index labels it had before empty tweets were filtered
# out, while df3 has a fresh 0..n-1 index. A label-aligned
# pd.concat(axis=1) would therefore pair topics with the wrong tweets and add
# NaN-filled rows. Assigning the raw values keeps row i of df3 with row i of df
# (pandas raises ValueError if the two lengths differ), and re-running the cell
# overwrites the column instead of duplicating it.
df = df.assign(topic_number=df3["topic_number"].to_numpy())

In [None]:
# Visualize the distribution of each topic for one time slice. Replace 'time=0'
# with the time slice for which you need to generate the visualization.
import pyLDAvis

# dtm_vis returns the five inputs pyLDAvis needs for the chosen time slice
vis_inputs = ldaseq.dtm_vis(time=0, corpus=bow_corpus)
doc_topic, topic_term, doc_lengths, term_frequency, vocab = vis_inputs

vis_wrapper = pyLDAvis.prepare(
    topic_term_dists=topic_term,
    doc_topic_dists=doc_topic,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency,
)
pyLDAvis.display(vis_wrapper)