
# Implementation of LDA model

This notebook details the preprocessing steps and an implementation of Latent Dirichlet Allocation (LDA) for Question Answer Clustering. 

In [1]:
# Importing necessary packages

import pandas as pd
from pandas import DataFrame
import os
import io
import re
import numpy as np
import tqdm
from nltk.tokenize.treebank import TreebankWordDetokenizer
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import spacy


# Move the working directory up one level. The path is relative, so re-running
# this cell without restarting the kernel moves up one more level each time.
os.chdir('..')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Input and Formatting

In [2]:
# Mounting Google Drive to access files
# The drive is mounted at /content/drive; the last cell of this notebook copies
# the output CSV into a folder under "/content/drive/My Drive".

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Upload the dataset CSV through Colab's upload helper. `uploaded` is indexed by
# file name in the next cell, and the value found there is wrapped in io.BytesIO
# before being handed to pandas.
from google.colab import files
uploaded = files.upload()

Saving Processed_Dataset.csv to Processed_Dataset (1).csv


In [4]:
# Read the uploaded CSV into a DataFrame.
# Colab renames a file that is uploaded again (e.g. "Processed_Dataset (1).csv"),
# so the expected key may be missing from `uploaded`. In that case fall back to
# the first uploaded file instead of failing with a KeyError.
dataset_key = 'Processed_Dataset.csv'
if dataset_key not in uploaded:
    if not uploaded:
        raise ValueError('No file was uploaded; re-run the upload cell.')
    dataset_key = next(iter(uploaded))
questions_og = pd.read_csv(io.BytesIO(uploaded[dataset_key]))

In [5]:
# Dataset formatting

# drop() returns a new DataFrame, so questions_og itself is left unchanged.
#questions = questions.drop(columns=['Answer 1', 'Answer 2', 'Answer 3', 'Answer 4', 'Answer 5', 'Author', 'Explanation'], axis=1)
questions = questions_og.drop(columns=['Q+A', 'Q+E', 'Q+A+E'])

# Lower-case every question and strip commas, full stops, exclamation marks and
# question marks. Missing values become '' and other non-string values are
# converted with str(), instead of raising AttributeError on .lower().
# The pattern is a raw string; '.' needs no escaping inside a character class.
questions['Question_Processed'] = (
    questions['Question']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.replace(r'[,.!?]', '', regex=True)
)


In [6]:
#Tokenising sentences into individual words with no punctuation
#Note: This process tends to remove any mathematical arguments from the questions

def sent_to_words(sentences):
    """Lazily yield one token list per sentence.

    Each item of ``sentences`` is converted with ``str`` and tokenised by
    ``gensim.utils.simple_preprocess``; ``deacc=True`` removes accent marks.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

# Pull the cleaned question strings out of the DataFrame and tokenise all of them.
data = questions["Question_Processed"].values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]

#Slicing reminder: data_words[a:b] selects a range of sentences. Python counts
#from 0, so [1:3] gives sentences 2 and 3, while [:0] gives no sentence at all.
#print(data_words[:1])

###### The following code builds the bigram and trigram models. It also wraps each model in a `Phraser`, a faster way of grouping the tokens of a sentence into bigrams/trigrams. The `Phrases` models built in lines 1 and 2 are required to create these faster models.

In [7]:
# Building bigram and trigram models

# The trigram model is trained on the documents after the bigram model has been
# applied to them.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)

# Phraser wrappers around the trained models; these are what make_bigrams and
# make_trigrams index into.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



#### The following code describes removing stopwords for a given text. This is run on the entire dataset of questions.

In [8]:
# NLTK's English stop words plus a few extra tokens that should also be dropped.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

# Words that must survive stop-word removal even if they are in the list above.
words_keep = ['for', 'if', 'while', 'print', 'Boolean', 'BigO', 'else', 'elseif' ]

# The filtered stop words, once as a set (the one remove_stopwords checks
# against) and once as a list that keeps the order of stop_words.
new_stop_words = set(stop_words) - set(words_keep)
newer_stop_words = list(filter(lambda word: word not in words_keep, stop_words))

#Define functions for stopwords, bigrams, trigrams and lemmatization

def remove_stopwords(texts):
    """Return one token list per document with the stop words filtered out.

    Each document is converted with ``str`` and re-tokenised by
    ``simple_preprocess``; tokens found in ``new_stop_words`` are dropped.
    """
    cleaned = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned.append([tok for tok in tokens if tok not in new_stop_words])
    return cleaned

def make_bigrams(texts):
    """Apply the bigram phraser ``bigram_mod`` to every document in ``texts``."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV','SCONJ','ADP']):
    """Lemmatize tokenised documents, keeping only tokens with an allowed POS tag.

    Each document is joined into one space-separated string and passed to the
    module-level ``nlp`` callable. ``nlp`` is looked up when this function is
    called, so it must be defined before the first call.

    Args:
        texts: iterable of documents, each an iterable of string tokens.
        allowed_postags: ``token.pos_`` values to keep; tokens with any other
            tag are dropped. Tag reference: https://spacy.io/api/annotation

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document,
        in input order.
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in texts
    ]

#### Removing stopwords (NLTK stop-word list) and preprocessing the data using spaCy lemmatization.

In [9]:
# Load the spaCy "en_core_web_sm" pipeline with the parser and NER components
# disabled (for efficiency); lemmatization() reads this global.
nlp = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])

# Drop stop words, then merge detected word pairs into bigram tokens.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Trigram variant (currently unused)
#data_words_trigrams = make_trigrams(data_words_nostops)

# Lemmatize, keeping tokens tagged NOUN, ADJ, VERB, ADV, SCONJ or ADP.
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV','SCONJ','ADP'],
)

# Lemmatization based on the trigram variant (currently unused)
#data_lemmatized2 = lemmatization(data_words_trigrams)

#print(data_lemmatized[:1])
#print(data_lemmatized2)

The process we follow only uses bigrams. We could investigate the use of trigrams (or even higher-order n-grams) to capture more of the 'context' of the questions; however, these would further increase the overall computation time required and won't necessarily lead to greater insights (given the typical length of the questions we'll be using).

#### Calculating the frequency of the words obtained in the previous step, and creating its corpus.

In [10]:
import gensim.corpora as corpora

# Dictionary built from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts the corpus is built from
texts = data_lemmatized

# Bag-of-words corpus: one doc2bow() result per document
corpus = list(map(id2word.doc2bow, texts))

# Show the bag-of-words of the first document
print(corpus[:1])


[[(0, 1), (1, 1), (2, 4), (3, 1), (4, 1)]]


#### The following two cells create and view an LDA model for the given dataset.

In [11]:
# Number of LDA topics; passed as num_topics to the LdaMulticore models built below.
num = 60

In [12]:
# Build LDA model (with per-word topic output enabled)
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

In [13]:
#View the LDA model
#from pprint import pprint

# Print the Keyword in the 10 topics
#pprint(lda_model.print_topics())
#doc_lda = lda_model[corpus]

## Implementing the LDA model after pre-processing.

In [14]:
# Second LDA model on the same corpus, with explicit alpha and eta values.
lda_model2 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num,
    random_state=100,
    chunksize=100,
    passes=10,
    alpha=0.31,
    eta=0.31,
)

In [15]:
#lda_model3 = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num, random_state=100, chunksize=100, passes=10, alpha=0.31, eta=0.41)
#LDAvis_prepared_param = pyLDAvis.gensim.prepare(lda_model3, corpus, id2word)

In [16]:
# Function to format outputs

def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a table holding the dominant topic of every document.

    For each document in ``corpus`` the topic with the highest probability is
    selected (the first one wins on ties) together with that topic's keywords.

    Rows are collected in a list and turned into a DataFrame once, instead of
    calling ``DataFrame.append`` per document: that method no longer exists in
    pandas 2.x and was quadratic in the number of documents. As a result
    ``Dominant_Topic`` holds integers when every document has a topic.

    Args:
        ldamodel: trained LDA model. Required despite the ``None`` default; it
            must support ``ldamodel[corpus]``, ``show_topic`` and expose
            ``per_word_topics``.
        corpus: documents to score. The default is the module-level ``corpus``
            as it was when this function was defined.
        texts: one entry per document, attached as the last column. The default
            is the module-level ``data`` as it was when this function was
            defined.

    Returns:
        DataFrame with the columns ``Dominant_Topic``, ``Perc_Contribution``
        (rounded to 4 decimals) and ``Topic_Keywords``, followed by a column
        labelled ``0`` that holds ``texts``.

    Raises:
        TypeError: if ``ldamodel`` is None.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires a trained ldamodel")

    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []

    # Get main topic in each document
    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled each item is a sequence whose first
        # element is the document's list of (topic, probability) pairs.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep a placeholder row so the table stays aligned with `texts`.
            records.append((None, None, None))
            continue

        # Dominant topic = highest probability; max() keeps the first on ties.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in wp)
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

In [17]:
# Dominant topic, its contribution and its keywords for every document, using lda_model2.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model2,
    corpus=corpus,
    texts=texts,
)

# Format: turn the index into a column, rename all columns, and attach the
# original question text.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]
df_dominant_topic['Question'] = questions['Question']
#df_dominant_topic.head(10)

In [18]:
# Sort the rows in place by topic id, ascending; rows without a topic go last.
df_dominant_topic.sort_values(
    by="Dominant_Topic",
    axis = 0,
    ascending = True,
    na_position ='last',
    inplace = True,
)
#df_dominant_topic.head(20)

In [19]:
# Write the topic table to a CSV file in the current working directory.
df_dominant_topic.to_csv('new_output_test.csv')
# Copy that CSV into the project's data folder on the mounted Google Drive.
!cp new_output_test.csv "/content/drive/My Drive/BSc Project Coding/Data Files"