### <font color='black'>Topic modeling and text classification</font>

<font color='#404040'>In this notebook, we perform topic modeling and text classification, relying on *gensim*. The main workflow is as follows: first, we create *unigram* and *bigram* tokens. Then, we fit topic models on the unigrams and on the bigrams separately, and interpret the outcome. Finally, features from the topic models serve as input for text classification to explain the ratings.</font>


In [1]:
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize

import gensim
from gensim import corpora
from gensim.models import CoherenceModel



### <font color='black'>Import data</font>

<font color='#404040'>First, we import the data cleaned in the previous notebook, using a relative path.</font>

In [2]:
# Read the training data from a path relative to the notebook's working directory.
# Later cells read the 'University' and 'reviews_lem' columns of this frame.
dat = pd.read_csv('./data/training_data.csv')

### <font color='black'>Topic modeling with unigram</font>

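<font color='#404040'>We tokenize the cleaned comments into unigrams, build a *gensim* dictionary and bag-of-words corpus for each university, and then fit LDA models with different numbers of topics, comparing them by their coherence scores.</font>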

In [3]:
# Create unigram tokens for gensim models
def tokenize_comments(comments):
    """Split every non-missing comment into a list of word tokens.

    Parameters
    ----------
    comments : iterable
        Comments to tokenize. Entries that pandas treats as missing
        (e.g. ``None`` / ``NaN``) are skipped.

    Returns
    -------
    list of list
        One token list per non-missing comment, in input order. Because
        missing entries are dropped, the result may be shorter than
        ``comments`` and is not positionally aligned with it.
    """
    return [
        list(word_tokenize(text))
        for text in comments
        if pd.notnull(text)
    ]

In [4]:
def get_corpus(dat_uni):
    """Build the gensim inputs (corpus, dictionary, tokens) for one subset of reviews.

    Parameters
    ----------
    dat_uni : mapping / DataFrame
        Must support ``dat_uni['reviews_lem']``; that column is passed to
        ``tokenize_comments``.

    Returns
    -------
    dict
        ``'corpus'``: one bag-of-words per tokenized comment, as produced by
        ``Dictionary.doc2bow``;
        ``'dictionary'``: the filtered ``corpora.Dictionary``;
        ``'comments_tokenized'``: the token lists the other two were built from.
    """
    tokens_per_comment = tokenize_comments(dat_uni['reviews_lem'])

    # Vocabulary over all comments. Only `no_above` is overridden here, so
    # tokens present in more than 80% of the comments are removed; the other
    # filter_extremes arguments keep their gensim defaults.
    # NOTE(review): gensim documents a default of no_below=5, which would also
    # drop rare tokens -- confirm against the installed version.
    vocab = corpora.Dictionary(tokens_per_comment)
    vocab.filter_extremes(no_above = 0.8)

    # Bag-of-words for each comment, built against the filtered vocabulary
    bow_corpus = []
    for doc_tokens in tokens_per_comment:
        bow_corpus.append(vocab.doc2bow(doc_tokens))

    # Bundled together because LdaModel and CoherenceModel take them as arguments
    return dict(
        corpus = bow_corpus,
        dictionary = vocab,
        comments_tokenized = tokens_per_comment,
    )

In [5]:
# Build the unigram corpus / dictionary / tokens for each university.
# The rows are filtered on the 'University' column and processed in the
# order oxford, edinburgh, warwick.
unigram_oxford, unigram_edinburgh, unigram_warwick = (
    get_corpus(dat[dat['University'] == university])
    for university in ('oxford', 'edinburgh', 'warwick')
)

In [6]:
# Train one LDA model per (university, number of topics) pair and score each
# model with a 'c_v' coherence score.
# A fixed seed is passed as random_state so that rerunning the cell gives the
# same models; without it the LDA results change between runs.
SEED = 20210526
# One entry per university, in the order oxford, edinburgh, warwick; each entry
# is the list of coherence scores for that university (index i <-> i + 1 topics).
unigram_results = []

# Loop through each university
for unigram in [unigram_oxford, unigram_edinburgh, unigram_warwick]:
    # Candidate topic counts are capped as a reasonable upper bound for a
    # small-to-medium dataset.
    # NOTE: np.arange(1, 15) excludes its stop value, so the models fitted below
    # use 1..14 topics -- a 15-topic model is never trained.
    coherence_scores = []
    
    for n_topics in np.arange(1, 15):
        # Fit an LDA model on this university's bag-of-words corpus
        ldamodel = gensim.models.ldamodel.LdaModel(unigram['corpus'], num_topics = n_topics, id2word = unigram['dictionary'], 
                                                   iterations = 1000, random_state = SEED)
        
        # Build a coherence model for the fitted LDA model; 'c_v' is computed
        # from the tokenized comments, hence `texts` is passed
        ldamodel_coherence = CoherenceModel(model = ldamodel, texts = unigram['comments_tokenized'], dictionary = unigram['dictionary'], 
                                            coherence ='c_v')
        
        # Record the coherence score for this number of topics
        coherence_scores.append(ldamodel_coherence.get_coherence())
    
    # Record all scores for this university
    unigram_results.append(coherence_scores)