# TBA 3102 - Text Analytics
## Practical Lab 09 - Text Summarization and Topic Models (II)
### Question 1 - Topic Modeling
Student: Nicky Ng <br>
GitHub User: [ahjimomo](https://github.com/ahjimomo) <br>
Student Number: A0194330L

### Libraries

In [None]:
# Data Wrangling
import numpy as np
import pandas as pd

# Topic Modeling
import nltk
import gensim

# Tokenizer & Feature Engineering
# Tokenizer that emits runs of word characters (regex \w+), dropping punctuation and whitespace
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
# English stopword list from the NLTK corpus (assumes the 'stopwords' corpus is already downloaded)
stop_words = nltk.corpus.stopwords.words('english')
# WordNet-based lemmatizer used to reduce tokens to their lemma
wnl = nltk.stem.wordnet.WordNetLemmatizer()

# Gensim models
from gensim import corpora, models
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.lsimodel import LsiModel

# Parameters Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid

# Display DF
from IPython.core.display import HTML
# Lift pandas' display limits so DataFrames render with all columns, all rows,
# no fixed output width and untruncated cell contents
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('max_colwidth', None)
# Inject CSS that forces <pre> elements to use `white-space: pre`
# NOTE(review): `display` is not imported here; presumably supplied by the IPython/Jupyter namespace
display(HTML("<style>pre { white-space: pre !important; }</style>"))

### Import dataset

In [None]:
# Import cleaned dataset
# Path is relative to the notebook's working directory
raw_df = pd.read_csv('./data/voted-kaggled-dataset-cleaned.csv')
# Print column names, dtypes and non-null counts as a quick sanity check
raw_df.info()

### Feature Engineering to prepare features

In [None]:
def normalize_corpus(descriptions):
    """Turn raw description strings into lists of normalized tokens.

    Each description is tokenized with the module-level ``wtk`` tokenizer,
    then every token is lemmatized with ``wnl`` and kept only if it is
    non-numeric, longer than one character and not a stopword.

    Parameters
    ----------
    descriptions : iterable
        Description texts. Entries that are not strings (e.g. NaN read from
        an empty CSV cell) produce an empty token list, so the output stays
        aligned one-to-one with the input.

    Returns
    -------
    list of list of str
        One token list per input description, in input order.
    """
    # Build the lookup set once: membership tests on the NLTK list are O(n) per token.
    stop_set = set(stop_words)

    norm_description = []

    for description in descriptions:

        if not isinstance(description, str):
            # Keep row alignment for missing values instead of crashing in the tokenizer.
            norm_description.append([])
            continue

        desc_tokens = []
        for token in wtk.tokenize(description):
            # Check stopwords on the surface form first: lemmatizing a stopword can
            # alter it (e.g. the default noun lemmatization turns "was" into "wa"),
            # after which it would no longer match the stopword list.
            if token.isnumeric() or token in stop_set:
                continue

            lemma = wnl.lemmatize(token)

            # Re-check after lemmatization so lemmas that are stopwords are removed too.
            if len(lemma) > 1 and lemma not in stop_set:
                desc_tokens.append(lemma)

        norm_description.append(desc_tokens)

    return norm_description

In [None]:
# Pull the cleaned descriptions out of the dataframe as a plain Python list
processed_desc = raw_df['Cleaned_Description'].tolist()

# Tokenize / lemmatize every description
norm_desc = normalize_corpus(processed_desc)

# Store the token lists alongside the original rows
raw_df['normalized_description'] = norm_desc

In [None]:
# Re-inspect the dataframe to confirm the 'normalized_description' column was added
raw_df.info()

In [None]:
# Feature Cleaning
# NOTE: bigram phrase detection below is disabled; the dictionary and corpus are unigram-only.
#bigram = gensim.models.Phrases(raw_df['normalized_description'], min_count = 20, threshold = 20, delimiter  = '_')
#bigram_model = gensim.models.phrases.Phraser(bigram)

# Creating both unigram & bigram
#norm_corpus = []
#for doc in raw_df['normalized_description']:
#    bigram_doc = bigram_model[doc]
#    norm_corpus.append(bigram_doc)
#print(bigram_doc)

# Create a dictionary representation of the documents (unigrams)
dictionary = gensim.corpora.Dictionary(norm_desc)

# Filter out words that occur in fewer than 20 documents, or in more than 60% of the documents.
dictionary.filter_extremes(no_below = 20, no_above = 0.6)

# Transform the corpus into bag-of-words vectors.
# Uses norm_desc, the same token lists the dictionary was built from; `norm_corpus`
# only exists in the disabled bigram code above and would raise NameError here.
bow_corpus = [dictionary.doc2bow(text) for text in norm_desc]

### fine-tuning & selecting optimal algorithm and model
* Latent Semantic Indexing: [LSI Parameters](https://radimrehurek.com/gensim/models/lsimodel.html)
* Latent Dirichlet Allocation: [LDA parameters](https://radimrehurek.com/gensim/models/ldamodel.html)

In [None]:
def finetuning_model(corpus, dictionary, algor, df):
    """Grid-search a topic model and report coherence for every parameter set.

    A model is trained for each combination in a fixed hyperparameter grid,
    then scored with both the 'u_mass' and the 'c_v' coherence measures.

    Parameters
    ----------
    corpus : list
        Bag-of-words corpus passed to the model and to the UMass coherence.
    dictionary : gensim.corpora.Dictionary
        Dictionary passed to the models as ``id2word`` and to the coherence models.
    algor : str
        Which algorithm to tune: ``'LSI'`` or ``'LDA'``.
    df : pandas.DataFrame
        Dataframe whose ``'normalized_description'`` column holds the tokenized
        documents used as ``texts`` for the Cv coherence.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination with the columns ``algorithm``
        (``"<algor>_<n>"``), ``cv_score``, ``umass_score`` and ``parameters``
        (the parameter dict used).

    Raises
    ------
    ValueError
        If ``algor`` is neither ``'LSI'`` nor ``'LDA'``.
    """
    # Parameters shared by both algorithms
    topic_lst = [5, 6, 7, 8, 9, 10]
    random_seed = [42]

    # Pick the model class and its hyperparameter grid; fail fast on anything else
    # instead of hitting an unbound `hyperparams` further down.
    if algor == 'LSI':
        model_cls = LsiModel
        hyperparams = {'num_topics': topic_lst, 'random_seed': random_seed, 'power_iters': [10, 50, 100]}
    elif algor == 'LDA':
        model_cls = LdaModel
        hyperparams = {'num_topics': topic_lst, 'random_state': random_seed, 'alpha': ['symmetric', 'auto'], 'passes': [1, 10], 'iterations': [50, 200]}
    else:
        raise ValueError(f"Unsupported algorithm {algor!r}; expected 'LSI' or 'LDA'")

    # Materialize the tokenized documents once; they are reused for every Cv computation.
    texts = list(df['normalized_description'])

    # Train and score one model per parameter combination
    rows = []
    for count, params in enumerate(ParameterGrid(hyperparams)):
        model = model_cls(corpus = corpus, id2word = dictionary, **params)

        # Compute coherence scores
        umass_model = CoherenceModel(model = model, corpus = corpus, dictionary = dictionary, coherence = 'u_mass')
        umass_score = umass_model.get_coherence()
        cv_model = CoherenceModel(model = model, texts = texts, dictionary = dictionary, coherence = 'c_v')
        cv_score = cv_model.get_coherence()

        # Store the results
        rows.append({
            'algorithm': f'{algor}_{count}',
            'cv_score': cv_score,
            'umass_score': umass_score,
            'parameters': params,
        })

    # Explicit column list keeps the column order (and the columns themselves) even for an empty grid
    return pd.DataFrame(rows, columns = ['algorithm', 'cv_score', 'umass_score', 'parameters'])

In [None]:
# Perform Topic Modeling with LSI
# Runs the LSI grid defined inside finetuning_model; raw_df supplies the tokenized texts for Cv
LSI_results = finetuning_model(bow_corpus, dictionary, 'LSI', raw_df)
# Bare expression so the notebook renders the result table
LSI_results

In [None]:
# Perform Topic Modeling with LDA
# Runs the LDA grid defined inside finetuning_model; raw_df supplies the tokenized texts for Cv
LDA_results = finetuning_model(bow_corpus, dictionary, 'LDA', raw_df)
# Bare expression so the notebook renders the result table
LDA_results

In [None]:
# Concat results together
# Stacks the LSI rows on top of the LDA rows; each frame keeps its own index labels
full_results = pd.concat([LSI_results, LDA_results])

In [None]:
# Show results
# Bare expression so the notebook renders the combined LSI + LDA table
full_results

In [None]:
# Rearrange results
# Rank by Cv score (highest first) and use the UMass score (lowest first) as the tie-breaker.
# This must be one multi-key sort: chaining two sort_values calls re-sorts from scratch,
# so the earlier UMass ordering would not be kept.
results = full_results.sort_values(['cv_score', 'umass_score'], ascending = [False, True])
results

Using perplexity and coherence scores as measures to evaluate the topic models, a model is considered better when it has:
- a lower UMass score
- a higher Cv score

In [None]:
# First row of the ranked table, i.e. the top-ranked parameter set
results.iloc[0]

In [None]:
# Work on a copy so the topic columns added later do not modify raw_df
final_df = raw_df.copy()

In [None]:
# 4c. Determine the most dominant topic for each document using the best model
# The hyperparameters are hard-coded here rather than read from the tuning results.
best_model = LdaModel(corpus = bow_corpus, id2word = dictionary, alpha = 'symmetric',
                      iterations = 50, num_topics = 5, passes = 10,
                      random_state = 42)

# Per-document topic weights as {topic_id: weight}.
# gensim leaves out topics whose weight is below the model's minimum probability,
# so a document's dict may not contain every topic.
topic_weights = [dict(doc_topics) for doc_topics in best_model[bow_corpus]]

# Wide table of the weights (one column per topic); omitted topics show up as NaN.
topic_weights_df = pd.DataFrame(topic_weights)

# Top keywords are the same for every document sharing a topic, so compute them once per topic.
keywords_by_topic = {}

# Lists to store the dominant topic label and its keywords for each document
dominant_topics = []
topic_keywords = []

for weights in topic_weights:
    # Take the maximum over the topics actually reported for this document.
    # Sorting the NaN-padded dataframe row instead is unreliable, because NaN
    # compares False against everything and can end up ranked first.
    topic_id, weight = max(weights.items(), key = lambda item: item[1])
    dominant_topics.append(f"Topic {topic_id} ({weight:.3f})")

    # Get top keywords for the dominant topic
    if topic_id not in keywords_by_topic:
        keywords_by_topic[topic_id] = [word for (word, prob) in best_model.show_topic(topic_id, topn=10)]
    # Copy so that rows do not share one mutable list object
    topic_keywords.append(list(keywords_by_topic[topic_id]))

# Append topic back to dataframe
final_df['Dominant_Topic'] = dominant_topics
final_df['Topic_Keywords'] = topic_keywords

In [None]:
# Preview the first rows: description text with its dominant topic label and that topic's keywords
final_df[['Cleaned_Description', 'Dominant_Topic', 'Topic_Keywords']].head()

In [None]:
# Persist the annotated dataframe; the dataframe index is written as the first CSV column
# NOTE(review): the file name reads "corupus" (likely meant "corpus") — confirm nothing depends on it before renaming
final_df.to_csv('./data/corupus_topic_best.csv')