In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

import re

from sklearn.feature_extraction.text import TfidfVectorizer

from collections import Counter

from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [6]:
data = pd.read_csv("data/choiceboard_data.csv")[["CPD_Q1", "CPD_Q3"]]
data = data.dropna()
print('There are ' + str(data.shape[0]) + ' rows.')
data.head()

There are 104 rows.


Unnamed: 0,CPD_Q1,CPD_Q3
0,The class time devoted to health and wellness ...,I believe self-care to be really important in ...
1,The class time devoted to health and wellness ...,The time spent on my choice of self-care allow...
2,The class time has reminded me on the importan...,The time spent on my choice of self care affec...
3,The class time that was devoted to health and ...,It made me more relaxed and less stressed abou...
4,I have learnt to listen to people without inte...,This has been a lifeline because I always felt...


In [7]:
stopwords = set(nltk.corpus.stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess(text):
    # Remove punctuation and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Lowercase
    text = text.lower() 

    # Tokenize
    tokens = word_tokenize(text)  

    # Lemmatize and remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]

    # Stemming
    tokens = [stemmer.stem(word) for word in tokens] 
    
    # Return preprocessed list of tokens
    return tokens

data['CPD_Q1 Clean'] = data['CPD_Q1'].apply(preprocess)
data['CPD_Q3 Clean'] = data['CPD_Q3'].apply(preprocess)

data['CPD_Q1 Word Count'] = data['CPD_Q1'].apply(lambda x: len(x.split()))
data['CPD_Q3 Word Count'] = data['CPD_Q3'].apply(lambda x: len(x.split()))

data['CPD_Q1 Clean Word Count'] = data['CPD_Q1 Clean'].apply(lambda x: len(x))
data['CPD_Q3 Clean Word Count'] = data['CPD_Q3 Clean'].apply(lambda x: len(x))
data.head()

Unnamed: 0,CPD_Q1,CPD_Q3,CPD_Q1 Clean,CPD_Q3 Clean,CPD_Q1 Word Count,CPD_Q3 Word Count,CPD_Q1 Clean Word Count,CPD_Q3 Clean Word Count
0,The class time devoted to health and wellness ...,I believe self-care to be really important in ...,"[class, time, devot, health, well, self, care,...","[believ, self, care, realli, import, live, hea...",88,37,42,20
1,The class time devoted to health and wellness ...,The time spent on my choice of self-care allow...,"[class, time, devot, health, well, self, care,...","[time, spent, choic, self, care, allow, relax,...",78,26,39,15
2,The class time has reminded me on the importan...,The time spent on my choice of self care affec...,"[class, time, remind, import, self, care, also...","[time, spent, choic, self, care, affect, posit...",91,63,39,30
3,The class time that was devoted to health and ...,It made me more relaxed and less stressed abou...,"[class, time, devot, health, well, self, care,...","[made, relax, le, stress, upcom, futur, exam, ...",38,34,16,13
4,I have learnt to listen to people without inte...,This has been a lifeline because I always felt...,"[learnt, listen, peopl, without, interrupt, le...","[lifelin, alway, felt, like, wast, time, walk,...",58,50,25,23


In [8]:
x = data['CPD_Q1'].tolist()
y = data['CPD_Q1'].tolist()

list_of_lists = [string.split() for string in x]
list_of_lists2 = [string.split() for string in y]

In [9]:
data.head()
modelling_data = data[['CPD_Q1', 'CPD_Q3']]
modelling_data.iloc[:,0]

corpus = modelling_data["CPD_Q1"].tolist()

# Initialize a TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the corpus and transform the corpus into a DTM
dtm = vectorizer.fit_transform(corpus)

# Extract the feature names (vocabulary)
feature_names = vectorizer.get_feature_names_out()

# Initialize a list to hold the feature names for each response
features_per_response = []

# Iterate over each response in the DTM
for response_idx in range(dtm.shape[0]):
    # Find the indices of non-zero features for this response
    non_zero_indices = dtm[response_idx].nonzero()[1]
    # Map these indices to the actual feature names
    response_features = [feature_names[idx] for idx in non_zero_indices]
    # Add the feature names for this response to the list
    features_per_response.append(response_features)

# 'features_per_response' now contains the list of feature names for each response

In [10]:
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

# Function to train NMF models for a range of topic numbers
def train_nmf_models(doc_term_matrix, n_topics_list):
    nmf_models = {}
    for n_topics in n_topics_list:
        nmf = NMF(n_components=n_topics, random_state=42)
        W = nmf.fit_transform(doc_term_matrix)  # Document-topic matrix
        nmf_models[n_topics] = (nmf, W)
    return nmf_models

# Function to convert NMF topics to a format compatible with Gensim
def nmf_topics_to_gensim_format(nmf_model, feature_names, n_words=10):
    topics = []
    for topic_idx, topic in enumerate(nmf_model.components_):
        top_feature_indices = topic.argsort()[:-n_words - 1:-1]
        topic_words = [feature_names[i] for i in top_feature_indices]
        topics.append(topic_words)
    return topics

# Function to compute coherence scores for NMF models
def compute_coherence_scores(nmf_models, texts, feature_names):
    coherence_scores = {}
    for n_topics, (nmf_model, _) in nmf_models.items():
        topics = nmf_topics_to_gensim_format(nmf_model, feature_names)
        
        # Create a Gensim dictionary
        dictionary = Dictionary(texts)
        # Convert texts to Gensim format corpus
        corpus = [dictionary.doc2bow(text) for text in texts]
        
        # Compute Coherence Score using c_v measure
        coherence_model = CoherenceModel(topics=topics, texts=texts, corpus=corpus, dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        coherence_scores[n_topics] = coherence_score
    
    return coherence_scores

# Main script to find the optimal number of topics
if __name__ == "__main__":
    # Your document-term matrix and associated vocabulary
    doc_term_matrix = corpus
    feature_names = features_per_response

    # Initialize a TfidfVectorizer
    vectorizer = TfidfVectorizer()

    # Fit the vectorizer to the corpus and transform the corpus into a DTM
    doc_term_matrix = vectorizer.fit_transform(corpus)

    # Extract the feature names (vocabulary) from the vectorizer
    feature_names = vectorizer.get_feature_names_out()

    # Range of topics to evaluate
    n_topics_list = range(2, 21)

    # Train NMF models
    nmf_models = train_nmf_models(doc_term_matrix, n_topics_list)

    # Convert your corpus to a list of lists of words for coherence score calculation
    texts = [doc.split() for doc in corpus]

    # Compute coherence scores
    coherence_scores = compute_coherence_scores(nmf_models, texts, feature_names)

    # Find the number of topics with the highest coherence score
    optimal_n_topics = max(coherence_scores, key=coherence_scores.get)
    print(f"Optimal number of topics: {optimal_n_topics}, Coherence Score: {coherence_scores[optimal_n_topics]}")



Optimal number of topics: 4, Coherence Score: 0.5569449880453113


### Q3 OPTIMAL TOPICS ###

In [11]:
data.head()
modelling_data = data[['CPD_Q1', 'CPD_Q3']]
modelling_data.iloc[:,0]

corpus2 = modelling_data["CPD_Q3"].tolist()

# Initialize a TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the corpus and transform the corpus into a DTM
dtm = vectorizer.fit_transform(corpus2)

# Extract the feature names (vocabulary)
feature_names = vectorizer.get_feature_names_out()

# Initialize a list to hold the feature names for each response
features_per_response = []

# Iterate over each response in the DTM
for response_idx in range(dtm.shape[0]):
    # Find the indices of non-zero features for this response
    non_zero_indices = dtm[response_idx].nonzero()[1]
    # Map these indices to the actual feature names
    response_features = [feature_names[idx] for idx in non_zero_indices]
    # Add the feature names for this response to the list
    features_per_response.append(response_features)

# 'features_per_response' now contains the list of feature names for each response

In [12]:
# Function to train NMF models for a range of topic numbers
def train_nmf_models(doc_term_matrix, n_topics_list):
    nmf_models = {}
    for n_topics in n_topics_list:
        nmf = NMF(n_components=n_topics, random_state=42)
        W = nmf.fit_transform(doc_term_matrix)  # Document-topic matrix
        nmf_models[n_topics] = (nmf, W)
    return nmf_models

# Function to convert NMF topics to a format compatible with Gensim
def nmf_topics_to_gensim_format(nmf_model, feature_names, n_words=10):
    topics = []
    for topic_idx, topic in enumerate(nmf_model.components_):
        top_feature_indices = topic.argsort()[:-n_words - 1:-1]
        topic_words = [feature_names[i] for i in top_feature_indices]
        topics.append(topic_words)
    return topics

# Function to compute coherence scores for NMF models
def compute_coherence_scores(nmf_models, texts, feature_names):
    coherence_scores = {}
    for n_topics, (nmf_model, _) in nmf_models.items():
        topics = nmf_topics_to_gensim_format(nmf_model, feature_names)
        
        # Create a Gensim dictionary
        dictionary = Dictionary(texts)
        # Convert texts to Gensim format corpus
        corpus2 = [dictionary.doc2bow(text) for text in texts]
        
        # Compute Coherence Score using c_v measure
        coherence_model = CoherenceModel(topics=topics, texts=texts, corpus=corpus2, dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        coherence_scores[n_topics] = coherence_score
    
    return coherence_scores

# Main script to find the optimal number of topics
if __name__ == "__main__":
    # Your document-term matrix and associated vocabulary
    doc_term_matrix = corpus2
    feature_names = features_per_response

    # Initialize a TfidfVectorizer
    vectorizer = TfidfVectorizer()

    # Fit the vectorizer to the corpus and transform the corpus into a DTM
    doc_term_matrix = vectorizer.fit_transform(corpus2)

    # Extract the feature names (vocabulary) from the vectorizer
    feature_names = vectorizer.get_feature_names_out()

    # Range of topics to evaluate
    n_topics_list = range(2, 21)

    # Train NMF models
    nmf_models = train_nmf_models(doc_term_matrix, n_topics_list)

    # Convert your corpus to a list of lists of words for coherence score calculation
    texts = [doc.split() for doc in corpus2]

    # Compute coherence scores
    coherence_scores = compute_coherence_scores(nmf_models, texts, feature_names)

    # Find the number of topics with the highest coherence score
    optimal_n_topics = max(coherence_scores, key=coherence_scores.get)
    print(f"Optimal number of topics: {optimal_n_topics}, Coherence Score: {coherence_scores[optimal_n_topics]}")



Optimal number of topics: 6, Coherence Score: 0.5252602135433088
