# Guided LDA
https://www.kaggle.com/code/nvpsani/topic-modelling-using-guided-lda

In [9]:
import numpy as np
import s3fs
import boto3
import pandas as pd
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')

import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer

import guidedlda

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
#import data
model_data = pd.read_csv('s3://book-data-ucb-capstone-s2022/LDA_train.csv')
print('Tokenized Text DF Size:', len(model_data))

Tokenized Text DF Size: 28652


  model_data = pd.read_csv('s3://book-data-ucb-capstone-s2022/LDA_train.csv')


In [27]:
model_data.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,bookId,title,series,author,rating,description,language,...,Fantasy,Erotica,History,Dystopia,Poetry,Biography,Manga,Thriller,Graphic Novels,Romance
0,1000,1000,28797,274618.The_Complete_Stories_of_Theodore_Sturge...,"The Complete Stories of Theodore Sturgeon, Vol...",The Complete Stories of Theodore Sturgeon #2,"Theodore Sturgeon, Paul Williams (Editor), Sam...",4.21,The second of a planned 10 volumes that will r...,English,...,Fantasy,,,,,,,,,
1,1001,1001,6612,14367051-city-of-bones-city-of-ashes-city-of-g...,City of Bones / City of Ashes / City of Glass ...,The Mortal Instruments #1-5,Cassandra Clare (Goodreads Author),4.58,The first five books in the #1 New York Times ...,English,...,Fantasy,,,,,,,,,Romance
2,1002,1002,18812,35156.Clash_of_the_Titans,Clash of the Titans,,Alan Dean Foster,3.88,"PLAYTHING OF THE GODSHe was Perseus, son of Ze...",English,...,Fantasy,,,,,,,,,
3,1003,1003,4134,45791.The_Ballad_of_the_Sad_Caf_and_Other_Stories,The Ballad of the Sad Café and Other Stories,,Carson McCullers,4.0,A classic work that has charmed generations of...,English,...,,,,,,,,,,
4,1004,1004,4052,20603820-the-will,The Will,Magdalene #1,Kristen Ashley (Goodreads Author),4.29,"Early in her life, Josephine Malone learned th...",English,...,,,,,,,,,,Romance


In [28]:
#preprocessing
def preprocess_text(text):

  #lowercase text
  text_preprocessed = text.lower()
  #remove punctuation
  text_preprocessed = re.sub(r'[^a-zA-Z ]+', '', text_preprocessed)
  #tokenize for stopword removal
  text_preprocessed = word_tokenize(text_preprocessed)
  #remove stopwords
  text_preprocessed = [word for word in text_preprocessed if word not in stopwords.words('english')]
  #join to make string again
  #text_preprocessed = (" ").join(text_preprocessed)

  return text_preprocessed

In [29]:
%%time
model_data['tokens'] = model_data['description'].apply(lambda x: preprocess_text(x))

CPU times: user 5min 4s, sys: 21.8 s, total: 5min 26s
Wall time: 5min 26s


In [30]:
#sample model data to 1000 rows
model_data_samp = model_data.sample(n=1000)

## Synsets

In [31]:
def create_synsets(event):
  
  synonym = [] 
    
  for synset in wordnet.synsets(event): 
      for i in synset.lemmas(): 
          synonym.append(i.name()) # add all the synonyms available 
    
  return synonym

In [32]:
#creating library dataframe

#creating library dataframe


life_events = ['university', 'relationships', 'break ups', 'divorce', 'wedding', 
               'death', 'family', 'friendship', 'marriage']

#create synsets for select events where decent synsets exist
relationship_list = create_synsets('go_steady') + ['relationship', 'kinship', 'romance', 'dating']
marriage_list = create_synsets('marriage')
wedding_list = create_synsets('wedding') + ['matrimony']

#replace underscore (_) with space
relationship_list = [i.replace("_", " ") for i in relationship_list]
marriage_list = [i.replace("_", " ") for i in marriage_list]
wedding_list = [i.replace("_", " ") for i in wedding_list]

#remove certain words
wedding_list.remove('tie')
wedding_list.remove('marriage')
relationship_list.remove('see')

synsets = [['college', 'university', 'campus', 'academia', 'professor', 'colleges', 'universities', 'professors'], 
           relationship_list, 
           ['breakup', 'break up', 'split up', 'broken up', 'dumped', 'breaks up', 'splits up', 'dumps', 'dump', 'breaks off', 'break off'], 
           ['divorce', 'divorced', 'divorces'], 
           wedding_list,  
           ['death', 'decease', 'deceased', 'dying'],
           ['family', 'mother', 'father', 'brother', 'sister', 'mom', 'dad'],
           ['friends', 'friend', 'friendship', 'friendships'],
           marriage_list]

# Create the pandas DataFrame with column name is provided explicitly
df_lib = pd.DataFrame(life_events, columns=['life_event'])
df_lib['synsets'] = synsets
 
# print dataframe.
df_lib

Unnamed: 0,life_event,synsets
0,university,"[college, university, campus, academia, profes..."
1,relationships,"[go steady, go out, date, relationship, kinshi..."
2,break ups,"[breakup, break up, split up, broken up, dumpe..."
3,divorce,"[divorce, divorced, divorces]"
4,wedding,"[wedding, wedding ceremony, nuptials, hymeneal..."
5,death,"[death, decease, deceased, dying]"
6,family,"[family, mother, father, brother, sister, mom,..."
7,friendship,"[friends, friend, friendship, friendships]"
8,marriage,"[marriage, matrimony, union, spousal relations..."


## Guided LDA

In [33]:
#create objects required for model training

model_data_samp['liststring'] = [','.join(map(str, l)) for l in model_data_samp['tokens']]
corpus=model_data_samp['liststring'].tolist()
vocab=list(set(word_tokenize(" ".join(model_data_samp['liststring']))))
vectorizer = CountVectorizer(ngram_range=(1,2),vocabulary=vocab)
X = vectorizer.fit_transform(corpus)
word2id=vectorizer.vocabulary_

#synsets unigrams only

synsets = [['college', 'university', 'campus', 'academia', 'professor', 'colleges', 'universities', 'professors'], 
           ['date', 'relationship', 'kinship', 'romance', 'dating'],
           ['breakup', 'dumped', 'dumps', 'dump'], 
           ['divorce', 'divorced', 'divorces'], 
           ['wedding', 'nuptials', 'wedding', 'wedding', 'marry', 'wed', 'marry', 'wed', 'matrimony'], 
           ['death', 'deceased', 'dying'],
           ['family', 'mother', 'father', 'brother', 'sister', 'mom', 'dad'],
           ['friends', 'friend', 'friendship', 'friendships']]


seed_topics = {}
for t_id, st in enumerate(synsets):
    for word in st:
        seed_topics[word2id[word]] = t_id

In [34]:
corpus=model_data_samp['liststring'].tolist()

In [35]:
vocab=list(set(word_tokenize(" ".join(model_data_samp['liststring']))))

In [36]:
vectorizer = CountVectorizer(ngram_range=(1,2),vocabulary=vocab)

In [37]:
X = vectorizer.fit_transform(corpus)

In [38]:
word2id=vectorizer.vocabulary_

In [39]:
#define model
model = guidedlda.GuidedLDA(n_topics=8, n_iter=2000, random_state=7, refresh=20,alpha=0.01,eta=0.01)

In [47]:
#synsets for sampled data

synsets = [['college', 'university', 'campus', 'professor', 'professors'], 
           ['date', 'relationship', 'romance', 'dating'],
           ['breakup', 'dumped'], 
           ['divorce', 'divorced'], 
           ['wedding', 'nuptials', 'wedding', 'wedding', 'marry', 'wed', 'marry', 'wed'], 
           ['death', 'deceased', 'dying'],
           ['family', 'mother', 'father', 'brother', 'sister', 'mom', 'dad'],
           ['friends', 'friend', 'friendship', 'friendships']]

## Set Priors

In [48]:
seed_topics = {}
for t_id, st in enumerate(synsets):
    for word in st:
        seed_topics[word2id[word]] = t_id

In [49]:
seed_topics

{12544: 0,
 8515: 0,
 660: 0,
 3107: 0,
 12667: 0,
 13904: 1,
 10113: 1,
 7720: 1,
 5427: 1,
 6133: 2,
 3306: 2,
 16081: 3,
 16241: 3,
 5122: 4,
 18704: 4,
 8747: 4,
 15908: 4,
 16515: 5,
 3726: 5,
 1390: 5,
 7310: 6,
 17558: 6,
 15480: 6,
 2976: 6,
 18578: 6,
 1067: 6,
 6970: 6,
 4989: 7,
 15029: 7,
 174: 7,
 10276: 7}

## Train Model

In [50]:
%%time
model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)

INFO:guidedlda:n_documents: 1000
INFO:guidedlda:vocab_size: 19010
INFO:guidedlda:n_words: 78073
INFO:guidedlda:n_topics: 8
INFO:guidedlda:n_iter: 2000
INFO:guidedlda:<0> log likelihood: -991195
INFO:guidedlda:<20> log likelihood: -801638
INFO:guidedlda:<40> log likelihood: -792382
INFO:guidedlda:<60> log likelihood: -786931
INFO:guidedlda:<80> log likelihood: -783421
INFO:guidedlda:<100> log likelihood: -780120
INFO:guidedlda:<120> log likelihood: -778403
INFO:guidedlda:<140> log likelihood: -775965
INFO:guidedlda:<160> log likelihood: -774511
INFO:guidedlda:<180> log likelihood: -773430
INFO:guidedlda:<200> log likelihood: -771788
INFO:guidedlda:<220> log likelihood: -771259
INFO:guidedlda:<240> log likelihood: -770549
INFO:guidedlda:<260> log likelihood: -770018
INFO:guidedlda:<280> log likelihood: -769545
INFO:guidedlda:<300> log likelihood: -768748
INFO:guidedlda:<320> log likelihood: -768408
INFO:guidedlda:<340> log likelihood: -768074
INFO:guidedlda:<360> log likelihood: -767553


CPU times: user 13.3 s, sys: 30.9 ms, total: 13.3 s
Wall time: 13.3 s


<guidedlda.guidedlda.GuidedLDA at 0x7f8646f6aee0>

## Seeing the model output topics and up to 10 words per topic

In [51]:
n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: magic king must find powerful magical princess two lost land
Topic 1: school de jane mr like new class best friend high
Topic 2: one life new love family shes even world never man
Topic 3: death killer murder case dead back must crime police american
Topic 4: story short ray bradbury summer lucky used die alice says
Topic 5: world war power time must ancient battle dark death city
Topic 6: new series book author bestselling york readers novel first stories
Topic 7: novel story love world two great book first life years


## Assigning the Topics

In [52]:
topic_num_name = {"Topic 0":"university",
                  "Topic 1":"relationships",
                  "Topic 2":"breakups",
                  "Topic 3":"divorce",
                  "Topic 4":"wedding",
                  "Topic 5": "death",
                  "Topic 6": "family",
                  "Topic 7": "friends"}    

In [53]:
def get_doc_topics(model_glda,X,num_topics,dataframe,col_name):
    """
    A function which creates dataframe with documents, their dominant topic, along with their probabilities
    
    Parameters
    -------------
    model_glda - Guided LDA trained model
    X - Document term frequency table
    num_topics - Number of topics the model was trained for
    dataframe - Dataframe consisting of cleaned text column
    col_name - Column name in dataframe holding cleaned text
    
    Returns
    -------------
    A dataframe with document number, topic, probability of topic
    """
    df_doc_top = pd.DataFrame()
    final_list = []
    for index in range(len(dataframe[col_name])):
        word_id_dict = dict((x,y) for x,y in zip([x for x in range(num_topics)],np.round(model.transform(X[index])*100,1).tolist()[0]))
        word_score_list = []
        for index in range(num_topics):
            try:
                value = word_id_dict[index]
            except:
                value = 0
            word_score_list.append(value)
        final_list.append(word_score_list)

    df_doc_top = pd.DataFrame(final_list)
    df_doc_top.columns = ['Topic ' + str(i) for i in range(num_topics)]
    df_doc_top.index = ['Document ' + str(i) for i in range(len(dataframe[col_name]))]

    df_doc_top["Dominant_Topic"] = df_doc_top.idxmax(axis=1).tolist()
    df_doc_top["Topic_Probability"] = df_doc_top.max(axis=1).tolist()
    #document_df = df_doc_top.reset_index().rename(columns={"index":"Document"})[["Document","Dominant_Topic","Topic_Probability"]]


    #return document_df
    return df_doc_top

In [33]:
X.shape

(1000, 18637)

In [54]:
%%time

#ignore warning
import warnings
warnings.filterwarnings("ignore")

document_df=get_doc_topics(model,X,8,model_data_samp,"liststring")



CPU times: user 2.58 s, sys: 357 ms, total: 2.94 s
Wall time: 2.45 s


In [56]:
document_df['description']= model_data['description']

In [55]:
#output in dictionary format for each row in df
    #key: topic; value: prob
document_df

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Dominant_Topic,Topic_Probability
Document 0,20.6,0.0,22.2,0.0,0.0,0.0,0.0,57.2,Topic 7,57.2
Document 1,0.0,6.5,42.3,34.7,0.0,7.2,0.0,9.3,Topic 2,42.3
Document 2,0.0,9.2,32.3,0.0,0.0,14.8,0.0,43.7,Topic 7,43.7
Document 3,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,Topic 2,100.0
Document 4,0.1,0.0,28.0,0.0,0.0,18.2,22.5,31.2,Topic 7,31.2
...,...,...,...,...,...,...,...,...,...,...
Document 995,0.0,0.0,84.2,0.0,0.0,15.8,0.0,0.0,Topic 2,84.2
Document 996,0.0,0.0,67.6,0.0,0.0,0.0,16.5,15.9,Topic 2,67.6
Document 997,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,Topic 2,100.0
Document 998,0.1,0.0,75.2,4.4,0.0,0.1,0.9,19.3,Topic 2,75.2


In [56]:
document_df.Dominant_Topic.value_counts()

Topic 2    645
Topic 7    106
Topic 5     68
Topic 6     63
Topic 3     38
Topic 0     37
Topic 1     25
Topic 4     18
Name: Dominant_Topic, dtype: int64

## Run the Test Set

In [57]:
#import test set

#import data
model_data_test = pd.read_csv('s3://book-data-ucb-capstone-s2022/LDA_test.csv')
print('Tokenized Text DF Size:', len(model_data_test))

Tokenized Text DF Size: 1000


In [58]:
%%time
model_data_test['tokens'] = model_data_test['description'].apply(lambda x: preprocess_text(x))

CPU times: user 10.6 s, sys: 752 ms, total: 11.3 s
Wall time: 11.3 s


In [65]:
#create objects required for model training

model_data_test['liststring'] = [','.join(map(str, l)) for l in model_data_test['tokens']]
corpus=model_data_test['liststring'].tolist()
vocab=list(set(word_tokenize(" ".join(model_data_test['liststring']))))
vectorizer = CountVectorizer(ngram_range=(1,2),vocabulary=vocab)
X = vectorizer.fit_transform(corpus)
word2id=vectorizer.vocabulary_


In [64]:
#synsets unigrams only

synsets = [['college', 'university', 'campus', 'professor', 'colleges', 'professors'], 
           ['date', 'relationship', 'romance', 'dating'],
           ['breakup', 'dumped', 'dump'], 
           ['divorce', 'divorced', 'divorces'], 
           ['wedding', 'nuptials', 'wedding', 'wedding', 'marry', 'wed', 'marry', 'wed', 'matrimony'], 
           ['death', 'deceased', 'dying'],
           ['family', 'mother', 'father', 'brother', 'sister', 'mom', 'dad'],
           ['friends', 'friend', 'friendship', 'friendships']]


seed_topics = {}
for t_id, st in enumerate(synsets):
    for word in st:
        seed_topics[word2id[word]] = t_id

In [68]:
topic_list = ['universities', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship']


In [66]:
document_df_test=get_doc_topics(model,X,8,model_data_test,"liststring")



In [67]:
document_df_test

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Dominant_Topic,Topic_Probability
Document 0,5.2,15.0,19.3,17.7,8.4,13.5,2.4,18.6,Topic 2,19.3
Document 1,3.5,8.6,18.2,3.5,11.7,7.4,21.5,25.6,Topic 7,25.6
Document 2,8.3,12.0,20.4,7.7,8.6,15.7,7.8,19.4,Topic 2,20.4
Document 3,7.1,10.7,27.7,8.9,10.4,15.1,8.7,11.4,Topic 2,27.7
Document 4,5.8,9.0,22.2,11.7,7.9,11.5,17.7,14.1,Topic 2,22.2
...,...,...,...,...,...,...,...,...,...,...
Document 995,10.6,3.5,14.3,18.8,12.1,12.0,10.7,17.9,Topic 3,18.8
Document 996,7.8,6.1,18.1,11.2,18.8,6.1,20.3,11.6,Topic 6,20.3
Document 997,7.6,11.0,22.9,0.0,22.0,23.6,4.4,8.5,Topic 5,23.6
Document 998,13.8,6.2,17.2,16.9,15.6,15.1,8.7,6.5,Topic 2,17.2


In [71]:
topic_list = ['universities', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship', 'Dominant_Topic', 'Topic_Probability']


In [72]:
document_df_test.columns = topic_list

In [73]:
document_df_test.head()

Unnamed: 0,universities,relationships,break ups,divorce,wedding,death,family,friendship,Dominant_Topic,Topic_Probability
Document 0,5.2,15.0,19.3,17.7,8.4,13.5,2.4,18.6,Topic 2,19.3
Document 1,3.5,8.6,18.2,3.5,11.7,7.4,21.5,25.6,Topic 7,25.6
Document 2,8.3,12.0,20.4,7.7,8.6,15.7,7.8,19.4,Topic 2,20.4
Document 3,7.1,10.7,27.7,8.9,10.4,15.1,8.7,11.4,Topic 2,27.7
Document 4,5.8,9.0,22.2,11.7,7.9,11.5,17.7,14.1,Topic 2,22.2


### Potential Optimizations
* modify synsets
* include bigrams
* seed_confidence
* n_iter