In [None]:
!pip install nltk
import pandas as pd
import re
from nltk.corpus import stopwords

# Load the CSV file
# NOTE(review): the path literal is blank here - it has to point at the tweet
# CSV before this cell can run.
df = pd.read_csv(" ", sep=",", encoding="cp1252", engine='python')

# Extract the 'content' column
a = df["content"]

# Cleaning the text: ordered (pattern, replacement) passes applied to every tweet.
cleaning_steps = [
    (r'[^\x01-\x7F]', ''),     # Remove non-ASCII characters
    (r'http\S+\s*', ''),       # Remove URLs (and the whitespace right after them)
    (r'\bRT\b', ''),           # Remove 'RT'
    (r'#', ''),                # Remove the '#' sign (the tag word itself stays)
    (r'@\S+', ''),             # Remove mentions
    # Newlines and tabs are control characters too. Turn every whitespace run
    # into a single space BEFORE the control-character pass; deleting them
    # outright would fuse the words on either side ("get\nvaccinated" ->
    # "getvaccinated").
    (r'\s+', ' '),
    (r'[\x00-\x1F\x7F]', ''),  # Remove the remaining control characters
    (r'\d', ''),               # Remove digits
    (r'[^\w\s]', ''),          # Remove punctuation
]
for pattern, replacement in cleaning_steps:
    a = a.str.replace(pattern, replacement, regex=True)

# Remove leading and trailing whitespace
a = a.str.strip()

# Convert to lowercase
a = a.str.lower()

# Define specific stopwords: single tokens and multi-word phrases to strip from the tweets
custom_stopwords = ["anti flu shot", "antiflushot", "flu shot", "flushot", "flu season", "fluseason", "flu vaccine", "fluvaccine", "flu vaccination", "fluvaccination", "influenza vaccine", "influenzavaccine", "influenza vaccination", "influenzavaccination", "flu", "influenza"]

# Each entry pre-split into a tuple of tokens so that multi-word entries can be
# compared against runs of consecutive tokens. A plain `word in custom_stopwords`
# test can never match an entry such as "flu shot", because text.split() only
# ever yields single words.
_custom_stop_phrases = {tuple(phrase.split()) for phrase in custom_stopwords}
_max_stop_phrase_len = max(len(phrase) for phrase in _custom_stop_phrases)

# Remove specific stopwords
def remove_custom_stopwords(text):
    """Return `text` with every custom stop word / stop phrase removed.

    The text is split on whitespace and scanned left to right. At each
    position the longest run of tokens that equals an entry of
    `custom_stopwords` is dropped; tokens that start no such run are kept.
    Kept tokens are re-joined with single spaces, so whitespace is normalized.

    Parameters
    ----------
    text : str
        One cleaned tweet. Any non-string value (for example a missing cell)
        is treated as an empty tweet.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        # Missing cells have no .split(); treat them as empty instead of crashing.
        return ''

    tokens = text.split()
    kept = []
    i = 0
    while i < len(tokens):
        longest = min(_max_stop_phrase_len, len(tokens) - i)
        # Try the longest candidate first so "anti flu shot" wins over "flu".
        for size in range(longest, 0, -1):
            if tuple(tokens[i:i + size]) in _custom_stop_phrases:
                i += size
                break
        else:
            kept.append(tokens[i])
            i += 1
    return ' '.join(kept)

# Run the custom stop-word filter over every cleaned tweet
a = a.apply(
    remove_custom_stopwords
)

# Wrap the cleaned tweets in a single-column ('content') DataFrame
df1 = pd.DataFrame(data=a, columns=["content"])

In [None]:
!pip install --upgrade --user numpy scipy gensim

In [None]:
import scipy
import gensim

# Additional preprocessing of the cleaned tweets
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's simple_preprocess.

    Every item is converted with str() and passed to
    gensim.utils.simple_preprocess with deacc=True.

    Parameters
    ----------
    sentences : iterable
        The documents to tokenize.

    Yields
    ------
    The simple_preprocess result for each sentence, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
        
# Pull the cleaned tweets out of df1 as a plain Python list ...
data = df1["content"].values.tolist()
# ... and tokenize them, materializing the generator into a list
data_words = [tokens for tokens in sent_to_words(data)]

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
#Remove stopwords and lemmatize the text
import nltk 
# Download the NLTK 'words' corpus and keep its entries as a set.
# NOTE(review): `words` is not referenced again in the cells visible here -
# confirm it is still needed.
nltk.download('words')
words = set(nltk.corpus.words.words())
# Download NLTK's stop-word corpus and load the English list; remove_stopwords()
# below filters tokens against `stop_words`.
nltk.download('stopwords')  
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import spacy 
from gensim.utils import simple_preprocess

def remove_stopwords(texts):
    """Tokenize each document and drop the tokens listed in `stop_words`.

    Each document is converted with str() and run through simple_preprocess;
    tokens found in the module-level `stop_words` are discarded.

    Parameters
    ----------
    texts : iterable
        The documents to filter.

    Returns
    -------
    list of list
        One list of surviving tokens per input document, in input order.
    """
    filtered_docs = []
    for doc in texts:
        surviving = []
        for token in simple_preprocess(str(doc)):
            if token not in stop_words:
                surviving.append(token)
        filtered_docs.append(surviving)
    return filtered_docs

# Load the spaCy 'en_core_web_sm' pipeline once at module level; lemmatization()
# below calls this shared `nlp` object for every document.
nlp = spacy.load('en_core_web_sm')
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with spaces and run through the
    module-level spaCy pipeline `nlp`. A token contributes its lemma when its
    `pos_` is in `allowed_postags` and the token's length is greater than 3.
    POS tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of iterable of str
        Tokenized documents.
    allowed_postags : list of str
        POS tags whose tokens are kept.

    Returns
    -------
    list of list of str
        The kept lemmas for each document, in input order.
    """
    def _keep(token):
        return token.pos_ in allowed_postags and len(token) > 3

    lemmatized_docs = []
    for tokens in texts:
        parsed = nlp(" ".join(tokens))
        lemmatized_docs.append([tok.lemma_ for tok in parsed if _keep(tok)])
    return lemmatized_docs
# Stop-word filtering first, then POS-restricted lemmatization
data_words_nostops = remove_stopwords(data_words)
data_lemmatized = lemmatization(
    data_words_nostops,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

# Peek at the first lemmatized document as a sanity check
print(data_lemmatized[:1])

In [None]:
# Merge each document's lemma list into one space-separated string,
# stripping any commas from the result
cleaned_data = []
for item in data_lemmatized:
    merged = " ".join(item)
    cleaned_data.append(merged.replace(",", ""))

# One row per document in a single 'content' column
df2 = pd.DataFrame(data=cleaned_data, columns=["content"])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Bag-of-words representation of the tweets
vectorizer = CountVectorizer(
    analyzer='word',
    min_df=10,
    stop_words='english',
    lowercase=True,
    token_pattern='[a-zA-Z0-9]{3,}',
)
data_vectorized = vectorizer.fit_transform(df2["content"])
# Dense copy of the document-term matrix, used only for the sparsity figure below
data_dense = data_vectorized.todense()

# Sparsicity = percentage of non-zero cells in the document-term matrix
nonzero_cells = (data_dense > 0).sum()
print("Sparsicity: ", (nonzero_cells/data_dense.size)*100, "%")

In [None]:
from scipy.sparse.linalg import svds
from sklearn.mixture import GaussianMixture

#Calculate AIC scores
def get_gmm_labels(data_vectorized, k):
    """Fit a k-component Gaussian mixture and report its AIC.

    Despite the name, no cluster labels are returned. The AIC is also printed
    as 'k: aic=value'.
    NOTE: a later cell defines a function with this same name that reports
    BIC instead; whichever cell ran last wins.

    Parameters
    ----------
    data_vectorized : array-like
        The samples to fit the mixture on.
    k : int
        Number of mixture components.

    Returns
    -------
    tuple
        (k, aic)
    """
    mixture = GaussianMixture(n_components=k, max_iter=200, random_state=37)
    mixture.fit(data_vectorized)
    score = mixture.aic(data_vectorized)
    print('{}: aic={}'.format(k, score))
    return k, score

# Convert the document-term matrix to a floating-point type before the SVD
data_vectorized = data_vectorized.asfptype()

# Truncated SVD with 20 components; only U is passed on to the mixtures
U, S, V = svds(data_vectorized, k=20)

# One (k, aic) pair for every k from 2 to 50
gmm_scores_aic = [
    get_gmm_labels(U, n_components)
    for n_components in range(2, 51)
]

In [None]:
from scipy.sparse.linalg import svds
from sklearn.mixture import GaussianMixture

#Calculate BIC scores
def get_gmm_labels(data_vectorized, k):
    """Fit a k-component Gaussian mixture and report its BIC.

    Despite the name, no cluster labels are returned. The BIC is also printed
    as 'k: bic=value'.
    NOTE: this replaces the same-named AIC function defined in an earlier cell.

    Parameters
    ----------
    data_vectorized : array-like
        The samples to fit the mixture on.
    k : int
        Number of mixture components.

    Returns
    -------
    tuple
        (k, bic)
    """
    mixture = GaussianMixture(n_components=k, max_iter=200, random_state=37)
    mixture.fit(data_vectorized)
    score = mixture.bic(data_vectorized)
    print('{}: bic={}'.format(k, score))
    return k, score

# Convert the document-term matrix to a floating-point type before the SVD
data_vectorized = data_vectorized.asfptype()

# Truncated SVD with 20 components; only U is passed on to the mixtures
U, S, V = svds(data_vectorized, k=20)

# One (k, bic) pair for every k from 2 to 50
gmm_scores_bic = [
    get_gmm_labels(U, n_components)
    for n_components in range(2, 51)
]

In [None]:
#Plot AIC and BIC scores
def plot_scores(scores, ax, ylabel):
    """Draw one score-versus-k line on the given axes.

    Parameters
    ----------
    scores : sequence
        Items whose element 0 is k and element 1 is the score.
    ax : axes-like
        Object providing plot / set_xlabel / set_ylabel / set_title.
    ylabel : str
        Y-axis label; also used to build the title '<ylabel> vs k'.
    """
    k_values = []
    score_values = []
    for entry in scores:
        k_values.append(entry[0])
        score_values.append(entry[1])

    ax.plot(k_values, score_values, color='tab:blue')
    ax.set_xlabel('k')
    ax.set_ylabel(ylabel)
    ax.set_title('{} vs k'.format(ylabel))

# `plt` is used below but is not imported in any of the cells shown, so a
# fresh kernel would stop here with a NameError; import it in this cell.
import matplotlib.pyplot as plt

# Two side-by-side panels: AIC on the left, BIC on the right
fig, ax = plt.subplots(1,2, figsize=(15, 5))
plot_scores(gmm_scores_aic, ax[0], 'GMM AIC')
plot_scores(gmm_scores_bic, ax[1], 'GMM BIC')
plt.tight_layout()

In [None]:
# Build the gensim dictionary and bag-of-words corpus, then fit the LDA topic model.
# NOTE(review): the original comment here said the optimal number of topics is
# '7', but the model is fitted with num_topics=10 - confirm which is intended.
dictionary = gensim.corpora.Dictionary(data_lemmatized)
bow_corpus = list(map(dictionary.doc2bow, data_lemmatized))
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    random_state=0,
)

# Print every topic with its top weighted words
for idx, topic in lda_model.print_topics(-1):
    line = 'Topic: {} Word: {}'.format(idx, topic)
    print(line)

In [None]:
# Find dominant topic in each sentence of the text
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    """Build a table of the dominant topic for every document.

    For each document in `corpus` the topic with the highest proportion is
    selected (the first one wins on ties) and recorded together with its
    proportion rounded to 4 decimals and the words returned by
    `ldamodel.show_topic`, joined with ', '.

    Rows are collected in a list and turned into a DataFrame once;
    `DataFrame.append` was removed in pandas 2.0 and grew the frame
    quadratically.

    Parameters
    ----------
    ldamodel : optional
        Model supporting `ldamodel[corpus]` and `show_topic(topic_id)`.
        Defaults to the notebook's `lda_model`.
    corpus : optional
        Bag-of-words corpus. Defaults to the notebook's `bow_corpus`.
    texts : optional
        Original documents, appended as the last column. Defaults to the
        notebook's `data_lemmatized`.

    Returns
    -------
    pandas.DataFrame
        Columns 'Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords' and an
        unnamed (0) column holding `texts`, one row per document.
    """
    # Resolve omitted arguments when the function is CALLED rather than when it
    # is defined, so re-fitting the model in another cell is picked up without
    # re-running this definition.
    if ldamodel is None:
        ldamodel = lda_model
    if corpus is None:
        corpus = bow_corpus
    if texts is None:
        texts = data_lemmatized

    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    rows = []
    for doc_topics in ldamodel[corpus]:
        if len(doc_topics) == 0:
            # Keep a placeholder row so later rows stay aligned with `texts`.
            rows.append((float('nan'), float('nan'), ''))
            continue
        # Dominant topic = highest proportion.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        rows.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(
        rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )
    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)
# Build the per-document dominant-topic table
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model,
    corpus=bow_corpus,
    texts=data_lemmatized,
)

# Promote the row index to an explicit document-number column, then give all
# five columns readable names
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]

# Show the first ten documents
df_dominant_topic.head(10)