In [None]:
# Read csv
import pandas as pd
# Load the input CSV. The path is a single space in this notebook -- presumably a
# placeholder to be replaced with the real file location (TODO confirm).
# na_filter=False turns off pandas' missing-value detection for this read.
df = pd.read_csv(
    " ",
    engine='python',
    encoding='cp1252',
    na_filter=False,
)

In [None]:
# Keep only the rows whose 'content' is non-empty once surrounding whitespace is trimmed
df = df.loc[df['content'].str.strip().ne('')]

In [None]:
import string
# Function to check if a string contains only punctuation
def is_only_punctuation(text):
    """Return True when `text` holds no characters other than punctuation.

    Whitespace is ignored, so a value such as "?! ..." counts as
    punctuation-only; otherwise such rows would pass this filter and become
    blank once punctuation is stripped later in the notebook.

    Parameters
    ----------
    text : str
        Value to inspect. Anything that is not a str returns False instead of
        raising while being iterated.

    Returns
    -------
    bool
        True if every non-whitespace character is in string.punctuation.
        An empty or whitespace-only string also returns True (vacuously).
    """
    if not isinstance(text, str):
        return False
    return all(char in string.punctuation for char in text if not char.isspace())
# Discard every row whose 'content' is flagged by is_only_punctuation
df = df.loc[~df['content'].apply(is_only_punctuation)]

In [None]:
#Remove special characters and punctuations
import regex
# Delete every character that is neither a word character (\w) nor whitespace (\s).
# The pattern is a raw string so the backslashes reach the regex engine untouched
# instead of being parsed as (invalid) Python string escape sequences.
df['content'] = df['content'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Remove punctuations, stopwords from 'content' and lemmatize text
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# Pull the cleaned 'content' column out as a plain Python list of documents
data = df['content'].values.tolist()
def sent_to_words(sentences):
    """Lazily tokenize each item of `sentences`, yielding one token list per item.

    Every item is coerced to str and handed to gensim's simple_preprocess.
    """
    for raw_text in sentences:
        # deacc=True asks gensim to strip accent marks from the tokens (per the
        # gensim documentation); tokenization itself is done by simple_preprocess.
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens
# Materialize the tokenized corpus and preview its first document
data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[:1])
import nltk
# Fetch the NLTK corpora this cell reads. 'words' backs the vocabulary set below;
# 'stopwords' backs stopwords.words('english') and must be downloaded as well,
# otherwise that call raises LookupError on a machine that has never fetched it.
nltk.download('words')
nltk.download('stopwords')
words = set(nltk.corpus.words.words())
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
#Add custom stop words in the list; for example stop_words.extend(['student', 'read', 'write'])
stop_words.extend([])
import spacy 
def remove_stopwords(texts):
    """Return `texts` with every token found in the global `stop_words` removed.

    Each document is passed through str() and simple_preprocess again before
    filtering, so a document may be a raw string or an already-tokenized list
    (in the latter case the list's string form is what gets re-tokenized).

    Parameters
    ----------
    texts : iterable
        Documents to filter.

    Returns
    -------
    list of list of str
        One filtered token list per input document, in the same order.
    """
    # Snapshot the stop words into a set once: membership tests against the
    # original list were a linear scan for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

# Load the spaCy English pipeline once; lemmatization() below reuses it.
nlp = spacy.load('en_core_web_sm')
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents; each token list is joined with spaces before
        being parsed by the module-level `nlp` pipeline.
    allowed_postags : iterable of str
        Values of `token.pos_` to keep. The default is an immutable tuple so
        the shared default object can never be modified between calls.

    Returns
    -------
    list of list of str
        For each input document, the lemmas of the tokens whose POS tag is
        allowed and whose length is greater than 3.
    """
    allowed = set(allowed_postags)
    joined_docs = (" ".join(sent) for sent in texts)
    texts_out = []
    # nlp.pipe streams the documents through the pipeline in batches and yields
    # the parsed docs in input order, avoiding one full nlp() call per document.
    for doc in nlp.pipe(joined_docs):
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed and len(token) > 3])
    return texts_out
# Run the cleaning pipeline: drop stop words, then lemmatize and keep content words
data_words_nostops = remove_stopwords(data_words)
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'] )
# Preview the first document only (same as the data_words preview above) rather
# than dumping the entire lemmatized corpus into the notebook output.
print(data_lemmatized[:1])

In [None]:
#Remove empty lists (documents left with no tokens after preprocessing)
filtered_list = [x for x in data_lemmatized if x]
# Create bow corpus from the SAME filtered documents, so that bow_corpus[i] and
# filtered_list[i] always describe the same document. Building it from
# data_lemmatized kept the empty documents in the corpus, which shifted every
# later text/topic pairing whenever an empty document was present.
dictionary = gensim.corpora.Dictionary(filtered_list)
bow_corpus = [dictionary.doc2bow(doc) for doc in filtered_list]

In [None]:
# Find optimal no. of topics using Coherence score
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
    """Train one LDA model per candidate topic count and score its c_v coherence.

    Parameters
    ----------
    dictionary : gensim Dictionary
        Passed as `id2word` to the LDA model and as `dictionary` to the
        coherence model.
    corpus : bag-of-words corpus
        Training corpus for every LDA model.
    texts : list of list of str
        Tokenized documents used by the coherence model.
    limit : int
        Exclusive upper bound of the topic counts to try.
    start : int
        First topic count to try.
    step : int
        Increment between topic counts.

    Returns
    -------
    (model_list, coherence_values)
        The trained models and their coherence scores, both ordered like
        range(start, limit, step).
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Use the arguments that were passed in; reading the notebook globals
        # here made the `corpus` and `texts` parameters silently ignored.
        model = gensim.models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, random_state=0)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
# Topic-count sweep, defined once so the model runs and the x-axis cannot drift apart
limit=40; start=2; step=1
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=bow_corpus, texts=filtered_list, start=start, limit=limit, step=step)
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels argument must be a sequence of strings: ("coherence_values") is just a
# parenthesized string, which is consumed character by character and labels the
# line "c". The trailing comma makes it a one-element tuple.
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
# Report the coherence score obtained for each candidate number of topics
for topic_count, score in zip(x, coherence_values):
    rounded_score = round(score, 4)
    print("Num Topics =", topic_count, " has Coherence Value of", rounded_score)

In [None]:
# Train the final LDA model with the chosen number of topics ('10' in this case)
# and print the word composition of every topic.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    random_state=0,
)
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms))

In [None]:
# Find dominant topic in each sentence of the text
def format_topics_sentences(ldamodel=lda_model, corpus=bow_corpus, texts=filtered_list):
    """Build a table with the dominant topic of every document.

    Parameters
    ----------
    ldamodel : LDA model
        Must support `ldamodel[corpus]` and `show_topic(topic_id)`.
    corpus : bag-of-words corpus
        Documents to score.
    texts : list
        Original documents; texts[i] must describe the same document as
        corpus[i], because the two are joined purely by position.

    The defaults are evaluated once, when this function is defined, so pass the
    arguments explicitly after retraining the model or rebuilding the corpus.

    Returns
    -------
    pandas.DataFrame
        Columns 'Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text'.
        A document for which the model reports no topic gets NaN in the three
        topic columns (which turns 'Dominant_Topic' into a float column).
    """
    # Init output
    sent_topics_list = []
    # Keyword strings depend only on the topic, so build each one a single time
    keywords_by_topic = {}
    missing = float('nan')

    # Get main topic in each document
    for row in ldamodel[corpus]:
        if not row:
            # Keep a placeholder row: skipping it would pair every following
            # text with the topic of a different document.
            sent_topics_list.append([missing, missing, missing])
            continue
        # Highest-probability (topic, probability) pair; on ties the first wins
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        sent_topics_list.append([topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]])

    sent_topics_df = pd.DataFrame(sent_topics_list, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    sent_topics_df = pd.concat([sent_topics_df, pd.Series(texts, name='Text')], axis=1)

    return sent_topics_df

# Score every document, then expose the row position as an explicit document number
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model,
    corpus=bow_corpus,
    texts=filtered_list,
)
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]
df_dominant_topic.head(10)

In [None]:
# Generate word cloud for documents belonging to each topic
import numpy as np
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

# Set the number of topics (should match the num_topics used to train lda_model)
num_topics = 10

# Grid layout: choose the number of columns; the number of rows is derived from
# it so that there is always one panel per topic.
cols = 5  # Adjust as needed
rows = -(-num_topics // cols)  # ceiling division

# Create subplots (squeeze=False keeps `axes` a 2-D array even for a 1x1 grid)
fig, axes = plt.subplots(rows, cols, figsize=(4 * cols, 5 * rows), squeeze=False)
fig.suptitle("Word Clouds for All Topics", fontsize=16)

# Flatten axes for easy iteration
axes = axes.flatten()

for topic_num in range(num_topics):
    ax = axes[topic_num]
    ax.axis("off")
    ax.set_title(f"Topic {topic_num}")

    # Filter the DataFrame for the current topic
    df_topic = df_dominant_topic[df_dominant_topic['Dominant_Topic'] == topic_num]

    # Join the actual tokens. Series.to_string() would feed the printed table
    # (row index numbers, list brackets, possibly shortened long lists) to the
    # word cloud instead of the words themselves.
    docs = [
        " ".join(doc) if isinstance(doc, (list, tuple)) else doc
        for doc in df_topic['Text']
        if isinstance(doc, (list, tuple, str))
    ]
    text = " ".join(docs)

    # WordCloud.generate raises when it is given no words, so a topic without
    # any document is left as an empty, titled panel.
    if not text.strip():
        continue

    # Generate and display word cloud
    wordcloud = WordCloud(max_words=50, background_color='#FFFFFF').generate(text)
    ax.imshow(wordcloud, interpolation='bilinear')

# Hide the panels of the grid that have no topic assigned
for ax in axes[num_topics:]:
    ax.axis("off")

# Adjust layout to prevent overlap
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:
#Find words that provide significant differentiation between the topics selected based on the multinomial logistic regression model using the forward entry method.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.stats import chi2

def compute_chi_square(X, y):
    """Likelihood-ratio chi-square test of every word (column of X) against y.

    For each column, a logistic regression that uses only that column is
    compared with a null model fitted on a constant feature; the statistic is
    2 * (log-likelihood of the word model - log-likelihood of the null model).

    Parameters
    ----------
    X : 2-D numpy array, one row per document and one column per word
        Columns are sliced with X[:, i].reshape(-1, 1), so X must be dense.
    y : 1-D array-like
        Topic label of each document.

    Returns
    -------
    (chi2_values, p_values)
        Two lists with one entry per column of X.

    Notes
    -----
    * LogisticRegression is used with its default settings; NOTE(review): those
      include a penalty term per the scikit-learn docs, so the statistic is an
      approximation of an unpenalized likelihood-ratio test -- confirm this is
      acceptable.
    * The degrees of freedom are (number of classes - 1): one predictor adds one
      coefficient per non-reference class. A fixed value of 1 is only right for
      two classes.
    """
    y = np.asarray(y)
    n_samples = X.shape[0]
    row_idx = np.arange(n_samples)

    def _log_likelihood(model, features):
        # predict_proba columns follow model.classes_ (the sorted labels seen in
        # fit), so translate each label into its column position; using the raw
        # label as the column is wrong whenever a label value is absent from y.
        col_idx = np.searchsorted(model.classes_, y)
        return np.sum(np.log(model.predict_proba(features)[row_idx, col_idx]))

    # Null model (without any word): identical for every column, so fit it once
    constant_feature = np.ones((n_samples, 1))
    null_model = LogisticRegression(solver='lbfgs')
    null_model.fit(constant_feature, y)
    log_likelihood_null = _log_likelihood(null_model, constant_feature)

    dof = max(len(null_model.classes_) - 1, 1)

    chi2_values = []
    p_values = []
    for i in range(X.shape[1]):
        X_word = X[:, i].reshape(-1, 1)  # Single word feature
        model = LogisticRegression(solver='lbfgs')
        model.fit(X_word, y)

        # Chi-square statistic
        chi2_stat = 2 * (_log_likelihood(model, X_word) - log_likelihood_null)
        # Survival function = 1 - cdf, without losing precision for tiny p-values
        p_value = chi2.sf(chi2_stat, df=dof)
        chi2_values.append(chi2_stat)
        p_values.append(p_value)

    return chi2_values, p_values

def select_significant_words(X, y, significance_level=0.05):
    """Pick the columns of X whose chi-square p-value is below `significance_level`.

    Returns a tuple (significant_features, chi2_values, p_values), where
    significant_features lists the selected column indices in ascending order
    and the other two are the per-column results of compute_chi_square.
    """
    chi2_values, p_values = compute_chi_square(X, y)
    significant_features = []
    for column_index, p in enumerate(p_values):
        if p < significance_level:
            significant_features.append(column_index)
    return significant_features, chi2_values, p_values

import ast

# Turn every document into one space-separated string for CountVectorizer.
# A value that is already a string (presumably the printed form of a token list --
# confirm) is parsed with ast.literal_eval, which only accepts Python literals;
# eval() would execute whatever code the cell happens to contain.
df_dominant_topic["Text2"] = df_dominant_topic["Text"].apply(lambda x: " ".join(ast.literal_eval(x)) if isinstance(x, str) else " ".join(x))
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df_dominant_topic['Text2']).toarray()
y = np.array(df_dominant_topic['Dominant_Topic'])

# Perform feature selection
selected_indices, chi2_values, p_values = select_significant_words(X, y, significance_level=0.05)
# Fetch the vocabulary once instead of rebuilding it for every selected index
feature_names = vectorizer.get_feature_names_out()
selected_words = [feature_names[i] for i in selected_indices]

# Print results
for i, word in enumerate(feature_names):
    print(f"Word: {word}, Chi-square: {chi2_values[i]:.4f}, p-value: {p_values[i]:.4f}")

print("Selected words providing significant differentiation:", selected_words)

In [None]:
#Find the percentage of memos within each topic that contained each distinguishing word.
# The vocabulary is fixed to the selected words and binary=True is requested,
# so X is meant to flag word presence rather than count occurrences.
vectorizer = CountVectorizer(vocabulary=selected_words, binary=True)
X = vectorizer.fit_transform(df_dominant_topic['Text2'])

# One column per selected word, one row per document
word_presence = pd.DataFrame(X.toarray(), columns=selected_words)

# Reset the index so the topic column and the presence table line up by position
df_dominant_topic = df_dominant_topic.reset_index(drop=True)
df_word_presence = pd.concat(
    [df_dominant_topic[['Dominant_Topic']], word_presence],
    axis=1,
)

# Per topic, the mean of the presence flags; scaled by 100 and transposed so that
# words are rows and topics are columns
word_proportions = df_word_presence.groupby('Dominant_Topic').mean().mul(100).T

# Display results
print(word_proportions)
