# Guided LDA
https://www.kaggle.com/code/nvpsani/topic-modelling-using-guided-lda

In [1]:
import numpy as np
import s3fs
import boto3
import pandas as pd
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')

import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

from lda import guidedlda

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#import data
model_data = pd.read_csv('LDA_train.csv')
print('Tokenized Text DF Size:', len(model_data))

Tokenized Text DF Size: 28652


  model_data = pd.read_csv('LDA_train.csv')


In [3]:
#sample model data
model_data = model_data.sample(frac=0.25, random_state=42)

In [4]:
model_data.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,bookId,title,series,author,rating,description,language,...,Fantasy,Erotica,History,Dystopia,Poetry,Biography,Manga,Thriller,Graphic Novels,Romance
17885,18885,18885,5956,2272813.Ninth_Grade_Slays,Ninth Grade Slays,The Chronicles of Vladimir Tod #2,"Heather Brewer (Goodreads Author), Z Brewer",4.14,High school totally bites when you’re half hum...,English,...,Fantasy,,,,,,,,,Romance
10186,11186,11186,47684,659546.Promise_Not_to_Tell,Promise Not to Tell,,"Jennifer McMahon (Goodreads Author), Kathleen ...",3.64,Forty-one-year-old school nurse Kate Cypher ha...,English,...,,,,,,,,Thriller,,
17757,18757,18757,27861,248884.Berserk_Vol_6,"Berserk, Vol. 6",ベルセルク / Berserk #6,"Kentaro Miura, Duane Johnson (Translator)",4.56,"Back in the day, Guts the Black Swordsman was ...",English,...,Fantasy,,,,,,Manga,,Graphic Novels,
15936,16936,16936,49756,14740588-notorious-nineteen,Notorious Nineteen,Stephanie Plum #19,Janet Evanovich (Goodreads Author),3.94,New Jersey bounty hunter Stephanie Plum is cer...,English,...,,,,,,,,,,Romance
12340,13340,13340,37392,650584.Shadowdale,Shadowdale,Forgotten Realms: Avatar #1,"Scott Ciencin, Richard Awlinson (Avatar Projec...",3.71,The gods walk the Realms.Banished from the hea...,English,...,Fantasy,,,,,,,,,


In [5]:
#preprocessing
def preprocess_text(text):

  #lowercase text
  text_preprocessed = text.lower()
  #remove punctuation
  text_preprocessed = re.sub(r'[^a-zA-Z ]+', '', text_preprocessed)
  #tokenize for stopword removal
  text_preprocessed = word_tokenize(text_preprocessed)
  #remove stopwords
  text_preprocessed = [word for word in text_preprocessed if word not in stopwords.words('english')]
  #join to make string again
  #text_preprocessed = (" ").join(text_preprocessed)

  return text_preprocessed

In [6]:
%%time
model_data['tokens'] = model_data['description'].apply(lambda x: preprocess_text(x))

CPU times: user 1min 13s, sys: 6.02 s, total: 1min 19s
Wall time: 1min 19s


## Synsets

In [7]:
def create_synsets(event):
  
  synonym = [] 
    
  for synset in wordnet.synsets(event): 
      for i in synset.lemmas(): 
          synonym.append(i.name()) # add all the synonyms available 
    
  return synonym

In [8]:
#creating library dataframe

#creating library dataframe


life_events = ['university', 'relationships', 'break ups', 'divorce', 'wedding', 
               'death', 'family', 'friendship', 'marriage']

#create synsets for select events where decent synsets exist
relationship_list = create_synsets('go_steady') + ['relationship', 'kinship', 'romance', 'dating']
marriage_list = create_synsets('marriage')
wedding_list = create_synsets('wedding') + ['matrimony']

#replace underscore (_) with space
relationship_list = [i.replace("_", " ") for i in relationship_list]
marriage_list = [i.replace("_", " ") for i in marriage_list]
wedding_list = [i.replace("_", " ") for i in wedding_list]

#remove certain words
wedding_list.remove('tie')
wedding_list.remove('marriage')
relationship_list.remove('see')

synsets = [['college', 'university', 'campus', 'academia', 'professor', 'colleges', 'universities', 'professors'], 
           relationship_list, 
           ['breakup', 'break up', 'split up', 'broken up', 'dumped', 'breaks up', 'splits up', 'dumps', 'dump', 'breaks off', 'break off'], 
           ['divorce', 'divorced', 'divorces'], 
           wedding_list,  
           ['death', 'decease', 'deceased', 'dying'],
           ['family', 'mother', 'father', 'brother', 'sister', 'mom', 'dad'],
           ['friends', 'friend', 'friendship', 'friendships'],
           marriage_list]

# Create the pandas DataFrame with column name is provided explicitly
df_lib = pd.DataFrame(life_events, columns=['life_event'])
df_lib['synsets'] = synsets
 
# print dataframe.
df_lib

Unnamed: 0,life_event,synsets
0,university,"[college, university, campus, academia, profes..."
1,relationships,"[go steady, go out, date, relationship, kinshi..."
2,break ups,"[breakup, break up, split up, broken up, dumpe..."
3,divorce,"[divorce, divorced, divorces]"
4,wedding,"[wedding, wedding ceremony, nuptials, hymeneal..."
5,death,"[death, decease, deceased, dying]"
6,family,"[family, mother, father, brother, sister, mom,..."
7,friendship,"[friends, friend, friendship, friendships]"
8,marriage,"[marriage, matrimony, union, spousal relations..."


## Guided LDA

In [9]:
#create objects required for model training

model_data['liststring'] = [','.join(map(str, l)) for l in model_data['tokens']]
corpus=model_data['liststring'].tolist()
vocab=list(set(word_tokenize(" ".join(model_data['liststring']))))
vectorizer = CountVectorizer(ngram_range=(1,2),vocabulary=vocab)
X = vectorizer.fit_transform(corpus)
word2id=vectorizer.vocabulary_

#synsets unigrams only

#define model
model = guidedlda.GuidedLDA(n_topics=8, n_iter=2000, random_state=7, refresh=20,alpha=0.01,eta=0.01)

synsets = [['college', 'university', 'campus', 'professor', 'professors'], 
           ['date', 'relationship', 'romance', 'dating'],
           ['breakup', 'dumped'], 
           ['divorce', 'divorced'], 
           ['wedding', 'nuptials', 'wedding', 'wedding', 'marry', 'wed', 'marry', 'wed'], 
           ['death', 'deceased', 'dying'],
           ['family', 'mother', 'father', 'brother', 'sister', 'mom', 'dad'],
           ['friends', 'friend', 'friendship', 'friendships']]

seed_topics = {}
for t_id, st in enumerate(synsets):
    for word in st:
        seed_topics[word2id[word]] = t_id

In [10]:
topic_num_name = {"Topic 0":"university",
                  "Topic 1":"relationships",
                  "Topic 2":"breakups",
                  "Topic 3":"divorce",
                  "Topic 4":"wedding",
                  "Topic 5": "death",
                  "Topic 6": "family",
                  "Topic 7": "friends"}    

In [11]:
topic_list = ['universities', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship']

In [12]:
def get_doc_topics(model_glda,X,num_topics,dataframe,col_name):
    """
    A function which creates dataframe with documents, their dominant topic, along with their probabilities
    
    Parameters
    -------------
    model_glda - Guided LDA trained model
    X - Document term frequency table
    num_topics - Number of topics the model was trained for
    dataframe - Dataframe consisting of cleaned text column
    col_name - Column name in dataframe holding cleaned text
    
    Returns
    -------------
    A dataframe with document number, topic, probability of topic
    """
    df_doc_top = pd.DataFrame()
    final_list = []
    for index in range(len(dataframe[col_name])):
        word_id_dict = dict((x,y) for x,y in zip([x for x in range(num_topics)],np.round(model.transform(X[index])*100,1).tolist()[0]))
        word_score_list = []
        for index in range(num_topics):
            try:
                value = word_id_dict[index]
            except:
                value = 0
            word_score_list.append(value)
        final_list.append(word_score_list)

    df_doc_top = pd.DataFrame(final_list)
    df_doc_top.columns = ['Topic ' + str(i) for i in range(num_topics)]
    df_doc_top.index = ['Document ' + str(i) for i in range(len(dataframe[col_name]))]

    df_doc_top["Dominant_Topic"] = df_doc_top.idxmax(axis=1).tolist()
    df_doc_top["Topic_Probability"] = df_doc_top.max(axis=1).tolist()
    #document_df = df_doc_top.reset_index().rename(columns={"index":"Document"})[["Document","Dominant_Topic","Topic_Probability"]]


    #return document_df
    return df_doc_top

In [13]:
model_data_test = pd.read_csv('LDA_test.csv')
model_data_test = model_data_test.replace(np.nan, False)

In [14]:
model_data_test['tokens'] = model_data_test['description'].apply(lambda x: preprocess_text(x))

In [15]:
#create test objects
#model_data_test = pd.read_csv('s3://book-data-ucb-capstone-s2022/LDA_test.csv')

#preprocess test set
#model_data_test['tokens'] = model_data_test['description'].apply(lambda x: preprocess_text(x))

#create objects required for model testing

model_data_test['liststring'] = [','.join(map(str, l)) for l in model_data_test['tokens']]
corpus=model_data_test['liststring'].tolist()
vocab=list(set(word_tokenize(" ".join(model_data_test['liststring']))))
vectorizer = CountVectorizer(ngram_range=(1,1),vocabulary=vocab)
X = vectorizer.fit_transform(corpus)
word2id=vectorizer.vocabulary_

## Train Model

In [16]:
%%time

alphas = [0.01, 0.1, 1]
seed_confidences = [0.05, 0.25, 0.5]

for alpha in alphas:
    for seed_confidence in seed_confidences:
        
        print('Alpha:', str(alpha), '& Seed Confidence', str(seed_confidence))
        
        #define model
        model = guidedlda.GuidedLDA(n_topics=8, n_iter=2000, random_state=7, refresh=20,alpha=alpha,eta=0.01)
        
        #fit model
        print('fitting model')
        model.fit(X, seed_topics=seed_topics, seed_confidence=seed_confidence)
        print('model fitted')
        print('')
        
        #get topics
        print('getting topics')
        document_df_test=get_doc_topics(model,X,8,model_data_test,"liststring")
        topic_list = ['university', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship', 'Dominant_Topic', 'Topic_Probability']
        document_df_test.columns = topic_list
        print('got topics')
        
        #convert preds to 
        preds = document_df_test[['university', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship']]
        cols = ['university', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship']

        #turn preds into True/False
        for col_name in cols:
            preds[col_name] =  preds[col_name] > 20
        
        #save both document_df_test and preds to csvs
        probs_csv_name = 'glda_probs_alpha_' + str(alpha) + '_seedconf_' + str(seed_confidence) + '.csv'
        bool_csv_name = 'glda_bool_alpha_' + str(alpha) + '_seedconf_' + str(seed_confidence) + '.csv'
        
        document_df_test.to_csv(probs_csv_name)
        preds.to_csv(bool_csv_name)



INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


Alpha: 0.01 & Seed Confidence 0.05
fitting model


INFO:lda:<0> log likelihood: -983630
INFO:lda:<20> log likelihood: -793124
INFO:lda:<40> log likelihood: -784830
INFO:lda:<60> log likelihood: -779984
INFO:lda:<80> log likelihood: -776822
INFO:lda:<100> log likelihood: -774058
INFO:lda:<120> log likelihood: -772566
INFO:lda:<140> log likelihood: -771014
INFO:lda:<160> log likelihood: -769468
INFO:lda:<180> log likelihood: -767916
INFO:lda:<200> log likelihood: -766540
INFO:lda:<220> log likelihood: -765032
INFO:lda:<240> log likelihood: -763713
INFO:lda:<260> log likelihood: -762564
INFO:lda:<280> log likelihood: -761618
INFO:lda:<300> log likelihood: -761456
INFO:lda:<320> log likelihood: -759926
INFO:lda:<340> log likelihood: -759209
INFO:lda:<360> log likelihood: -758253
INFO:lda:<380> log likelihood: -756858
INFO:lda:<400> log likelihood: -756167
INFO:lda:<420> log likelihood: -755659
INFO:lda:<440> log likelihood: -754816
INFO:lda:<460> log likelihood: -753888
INFO:lda:<480> log likelihood: -753484
INFO:lda:<500> log likelihood: 

model fitted

getting topics


  df_doc_top["Topic_Probability"] = df_doc_top.max(axis=1).tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
Alpha: 0.01 & Seed Confidence 0.25
fitting model


INFO:lda:<0> log likelihood: -983613
INFO:lda:<20> log likelihood: -793083
INFO:lda:<40> log likelihood: -784301
INFO:lda:<60> log likelihood: -779734
INFO:lda:<80> log likelihood: -776307
INFO:lda:<100> log likelihood: -773355
INFO:lda:<120> log likelihood: -771804
INFO:lda:<140> log likelihood: -770199
INFO:lda:<160> log likelihood: -769030
INFO:lda:<180> log likelihood: -767781
INFO:lda:<200> log likelihood: -766391
INFO:lda:<220> log likelihood: -765488
INFO:lda:<240> log likelihood: -764374
INFO:lda:<260> log likelihood: -763563
INFO:lda:<280> log likelihood: -762474
INFO:lda:<300> log likelihood: -761833
INFO:lda:<320> log likelihood: -761443
INFO:lda:<340> log likelihood: -760695
INFO:lda:<360> log likelihood: -759661
INFO:lda:<380> log likelihood: -759271
INFO:lda:<400> log likelihood: -758485
INFO:lda:<420> log likelihood: -757859
INFO:lda:<440> log likelihood: -756664
INFO:lda:<460> log likelihood: -756445
INFO:lda:<480> log likelihood: -755926
INFO:lda:<500> log likelihood: 

model fitted

getting topics


  df_doc_top["Topic_Probability"] = df_doc_top.max(axis=1).tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
Alpha: 0.01 & Seed Confidence 0.5
fitting model


INFO:lda:<0> log likelihood: -983587
INFO:lda:<20> log likelihood: -793206
INFO:lda:<40> log likelihood: -784416
INFO:lda:<60> log likelihood: -779507
INFO:lda:<80> log likelihood: -776037
INFO:lda:<100> log likelihood: -772868
INFO:lda:<120> log likelihood: -770227
INFO:lda:<140> log likelihood: -768654
INFO:lda:<160> log likelihood: -766742
INFO:lda:<180> log likelihood: -765413
INFO:lda:<200> log likelihood: -764263
INFO:lda:<220> log likelihood: -763491
INFO:lda:<240> log likelihood: -762453
INFO:lda:<260> log likelihood: -761759
INFO:lda:<280> log likelihood: -760659
INFO:lda:<300> log likelihood: -759985
INFO:lda:<320> log likelihood: -759195
INFO:lda:<340> log likelihood: -758464
INFO:lda:<360> log likelihood: -757958
INFO:lda:<380> log likelihood: -756793
INFO:lda:<400> log likelihood: -756358
INFO:lda:<420> log likelihood: -755097
INFO:lda:<440> log likelihood: -754614
INFO:lda:<460> log likelihood: -753886
INFO:lda:<480> log likelihood: -754007
INFO:lda:<500> log likelihood: 

model fitted

getting topics


  df_doc_top["Topic_Probability"] = df_doc_top.max(axis=1).tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
Alpha: 0.1 & Seed Confidence 0.05
fitting model


INFO:lda:<0> log likelihood: -968788
INFO:lda:<20> log likelihood: -785727
INFO:lda:<40> log likelihood: -777673
INFO:lda:<60> log likelihood: -771244
INFO:lda:<80> log likelihood: -768950
INFO:lda:<100> log likelihood: -765185
INFO:lda:<120> log likelihood: -763470
INFO:lda:<140> log likelihood: -761770
INFO:lda:<160> log likelihood: -760409
INFO:lda:<180> log likelihood: -759194
INFO:lda:<200> log likelihood: -758861
INFO:lda:<220> log likelihood: -757758
INFO:lda:<240> log likelihood: -757038
INFO:lda:<260> log likelihood: -756796
INFO:lda:<280> log likelihood: -755860
INFO:lda:<300> log likelihood: -755961
INFO:lda:<320> log likelihood: -755096
INFO:lda:<340> log likelihood: -755012
INFO:lda:<360> log likelihood: -754689
INFO:lda:<380> log likelihood: -754295
INFO:lda:<400> log likelihood: -754241
INFO:lda:<420> log likelihood: -754430
INFO:lda:<440> log likelihood: -753931
INFO:lda:<460> log likelihood: -753319
INFO:lda:<480> log likelihood: -753306
INFO:lda:<500> log likelihood: 

model fitted

getting topics


  df_doc_top["Topic_Probability"] = df_doc_top.max(axis=1).tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
Alpha: 0.1 & Seed Confidence 0.25
fitting model


INFO:lda:<0> log likelihood: -968771
INFO:lda:<20> log likelihood: -785496
INFO:lda:<40> log likelihood: -776910
INFO:lda:<60> log likelihood: -771613
INFO:lda:<80> log likelihood: -768323
INFO:lda:<100> log likelihood: -765763
INFO:lda:<120> log likelihood: -764415
INFO:lda:<140> log likelihood: -762478
INFO:lda:<160> log likelihood: -760657
INFO:lda:<180> log likelihood: -759898
INFO:lda:<200> log likelihood: -759403
INFO:lda:<220> log likelihood: -758305
INFO:lda:<240> log likelihood: -757729
INFO:lda:<260> log likelihood: -757003
INFO:lda:<280> log likelihood: -756650
INFO:lda:<300> log likelihood: -756792
INFO:lda:<320> log likelihood: -756076
INFO:lda:<340> log likelihood: -755612
INFO:lda:<360> log likelihood: -755468
INFO:lda:<380> log likelihood: -755345
INFO:lda:<400> log likelihood: -755346
INFO:lda:<420> log likelihood: -754617
INFO:lda:<440> log likelihood: -754021
INFO:lda:<460> log likelihood: -753970
INFO:lda:<480> log likelihood: -754271
INFO:lda:<500> log likelihood: 

model fitted

getting topics


  df_doc_top["Topic_Probability"] = df_doc_top.max(axis=1).tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
Alpha: 0.1 & Seed Confidence 0.5
fitting model


INFO:lda:<0> log likelihood: -968745
INFO:lda:<20> log likelihood: -785527
INFO:lda:<40> log likelihood: -777173
INFO:lda:<60> log likelihood: -772524
INFO:lda:<80> log likelihood: -769144
INFO:lda:<100> log likelihood: -766764
INFO:lda:<120> log likelihood: -765230
INFO:lda:<140> log likelihood: -763965
INFO:lda:<160> log likelihood: -762352
INFO:lda:<180> log likelihood: -761388
INFO:lda:<200> log likelihood: -760618
INFO:lda:<220> log likelihood: -759816
INFO:lda:<240> log likelihood: -759163
INFO:lda:<260> log likelihood: -758819
INFO:lda:<280> log likelihood: -758140
INFO:lda:<300> log likelihood: -758073
INFO:lda:<320> log likelihood: -757500
INFO:lda:<340> log likelihood: -757232
INFO:lda:<360> log likelihood: -757018
INFO:lda:<380> log likelihood: -755800
INFO:lda:<400> log likelihood: -755834
INFO:lda:<420> log likelihood: -755305
INFO:lda:<440> log likelihood: -755374
INFO:lda:<460> log likelihood: -755120
INFO:lda:<480> log likelihood: -755168
INFO:lda:<500> log likelihood: 

model fitted

getting topics


  df_doc_top["Topic_Probability"] = df_doc_top.max(axis=1).tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
Alpha: 1 & Seed Confidence 0.05
fitting model


INFO:lda:<0> log likelihood: -957846
INFO:lda:<20> log likelihood: -779038
INFO:lda:<40> log likelihood: -771142
INFO:lda:<60> log likelihood: -767572
INFO:lda:<80> log likelihood: -765445
INFO:lda:<100> log likelihood: -763337
INFO:lda:<120> log likelihood: -762094
INFO:lda:<140> log likelihood: -761778
INFO:lda:<160> log likelihood: -761411
INFO:lda:<180> log likelihood: -760494
INFO:lda:<200> log likelihood: -759551
INFO:lda:<220> log likelihood: -759002
INFO:lda:<240> log likelihood: -758606
INFO:lda:<260> log likelihood: -759160
INFO:lda:<280> log likelihood: -758686
INFO:lda:<300> log likelihood: -758500
INFO:lda:<320> log likelihood: -757893
INFO:lda:<340> log likelihood: -758781
INFO:lda:<360> log likelihood: -758269
INFO:lda:<380> log likelihood: -758182
INFO:lda:<400> log likelihood: -758009
INFO:lda:<420> log likelihood: -757689
INFO:lda:<440> log likelihood: -757604
INFO:lda:<460> log likelihood: -757182
INFO:lda:<480> log likelihood: -757533
INFO:lda:<500> log likelihood: 

model fitted

getting topics


  df_doc_top["Topic_Probability"] = df_doc_top.max(axis=1).tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
Alpha: 1 & Seed Confidence 0.25
fitting model


INFO:lda:<0> log likelihood: -957829
INFO:lda:<20> log likelihood: -779316
INFO:lda:<40> log likelihood: -772201
INFO:lda:<60> log likelihood: -767962
INFO:lda:<80> log likelihood: -766002
INFO:lda:<100> log likelihood: -764567
INFO:lda:<120> log likelihood: -763426
INFO:lda:<140> log likelihood: -761705
INFO:lda:<160> log likelihood: -761442
INFO:lda:<180> log likelihood: -761311
INFO:lda:<200> log likelihood: -760021
INFO:lda:<220> log likelihood: -759812
INFO:lda:<240> log likelihood: -759143
INFO:lda:<260> log likelihood: -759299
INFO:lda:<280> log likelihood: -757950
INFO:lda:<300> log likelihood: -758346
INFO:lda:<320> log likelihood: -757825
INFO:lda:<340> log likelihood: -757808
INFO:lda:<360> log likelihood: -757249
INFO:lda:<380> log likelihood: -757037
INFO:lda:<400> log likelihood: -757149
INFO:lda:<420> log likelihood: -756954
INFO:lda:<440> log likelihood: -756811
INFO:lda:<460> log likelihood: -756627
INFO:lda:<480> log likelihood: -756531
INFO:lda:<500> log likelihood: 

model fitted

getting topics


  df_doc_top["Topic_Probability"] = df_doc_top.max(axis=1).tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
Alpha: 1 & Seed Confidence 0.5
fitting model


INFO:lda:<0> log likelihood: -957803
INFO:lda:<20> log likelihood: -778803
INFO:lda:<40> log likelihood: -770743
INFO:lda:<60> log likelihood: -767447
INFO:lda:<80> log likelihood: -764847
INFO:lda:<100> log likelihood: -762538
INFO:lda:<120> log likelihood: -761633
INFO:lda:<140> log likelihood: -761030
INFO:lda:<160> log likelihood: -760472
INFO:lda:<180> log likelihood: -760959
INFO:lda:<200> log likelihood: -759869
INFO:lda:<220> log likelihood: -759456
INFO:lda:<240> log likelihood: -758892
INFO:lda:<260> log likelihood: -758709
INFO:lda:<280> log likelihood: -758442
INFO:lda:<300> log likelihood: -757686
INFO:lda:<320> log likelihood: -757336
INFO:lda:<340> log likelihood: -757527
INFO:lda:<360> log likelihood: -757579
INFO:lda:<380> log likelihood: -756800
INFO:lda:<400> log likelihood: -756810
INFO:lda:<420> log likelihood: -756673
INFO:lda:<440> log likelihood: -757030
INFO:lda:<460> log likelihood: -756154
INFO:lda:<480> log likelihood: -756325
INFO:lda:<500> log likelihood: 

model fitted

getting topics


  df_doc_top["Topic_Probability"] = df_doc_top.max(axis=1).tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


got topics
CPU times: user 3min 19s, sys: 2.11 s, total: 3min 21s
Wall time: 3min 19s


In [17]:
#turn prds into True/False
preds = document_df_test[['university', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship']]
cols = ['university', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship']

for col_name in cols:
    preds[col_name] =  preds[col_name] > 20

preds.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preds[col_name] =  preds[col_name] > 20


Unnamed: 0,university,relationships,break ups,divorce,wedding,death,family,friendship
Document 0,False,False,False,False,True,False,False,True
Document 1,False,False,False,False,False,True,True,False
Document 2,False,True,False,False,False,False,False,False
Document 3,False,True,False,False,True,False,False,False
Document 4,False,False,False,False,True,False,False,True


## Seeing the model output topics and up to 10 words per topic

In [18]:
n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: years murder one found truth killer another past house dead
Topic 1: love life family lives woman young mother heart man story
Topic 2: one world power war may human powerful find earth fight
Topic 3: become american man high war one two living boys much
Topic 4: new author york times bestselling home set tale secret yet
Topic 5: must world find dark city magic save battle evil adventure
Topic 6: story book one first novel stories series readers time books
Topic 7: life shes like never knows hes even one get new


## Assigning the Topics

In [19]:
topic_num_name = {"Topic 0":"university",
                  "Topic 1":"relationships",
                  "Topic 2":"breakups",
                  "Topic 3":"divorce",
                  "Topic 4":"wedding",
                  "Topic 5": "death",
                  "Topic 6": "family",
                  "Topic 7": "friends"}    

In [20]:
def get_doc_topics(model_glda,X,num_topics,dataframe,col_name):
    """
    A function which creates dataframe with documents, their dominant topic, along with their probabilities
    
    Parameters
    -------------
    model_glda - Guided LDA trained model
    X - Document term frequency table
    num_topics - Number of topics the model was trained for
    dataframe - Dataframe consisting of cleaned text column
    col_name - Column name in dataframe holding cleaned text
    
    Returns
    -------------
    A dataframe with document number, topic, probability of topic
    """
    df_doc_top = pd.DataFrame()
    final_list = []
    for index in range(len(dataframe[col_name])):
        word_id_dict = dict((x,y) for x,y in zip([x for x in range(num_topics)],np.round(model.transform(X[index])*100,1).tolist()[0]))
        word_score_list = []
        for index in range(num_topics):
            try:
                value = word_id_dict[index]
            except:
                value = 0
            word_score_list.append(value)
        final_list.append(word_score_list)

    df_doc_top = pd.DataFrame(final_list)
    df_doc_top.columns = ['Topic ' + str(i) for i in range(num_topics)]
    df_doc_top.index = ['Document ' + str(i) for i in range(len(dataframe[col_name]))]

    df_doc_top["Dominant_Topic"] = df_doc_top.idxmax(axis=1).tolist()
    df_doc_top["Topic_Probability"] = df_doc_top.max(axis=1).tolist()
    #document_df = df_doc_top.reset_index().rename(columns={"index":"Document"})[["Document","Dominant_Topic","Topic_Probability"]]


    #return document_df
    return df_doc_top

In [21]:
X.shape

(1000, 18481)

In [22]:
%%time

#ignore warning
import warnings
warnings.filterwarnings("ignore")

document_df=get_doc_topics(model,X,8,model_data,"liststring")



IndexError: row index (1000) out of range

In [23]:
document_df['description']= model_data['description']

NameError: name 'document_df' is not defined

In [None]:
#output in dictionary format for each row in df
    #key: topic; value: prob
document_df

In [None]:
document_df.Dominant_Topic.value_counts()

## Run the Test Set

In [24]:
#import test set

#import data
model_data_test = pd.read_csv('LDA_test.csv')
model_data_test = model_data_test.replace(np.nan, False)

print('Tokenized Text DF Size:', len(model_data_test))

Tokenized Text DF Size: 1000


In [25]:
model_data_test.columns

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'index', 'Unnamed: 0', 'bookId',
       'title', 'series', 'author', 'rating', 'description', 'language',
       'isbn', 'genres', 'characters', 'bookFormat', 'edition', 'pages',
       'publisher', 'publishDate', 'firstPublishDate', 'awards', 'numRatings',
       'ratingsByStars', 'likedPercent', 'setting', 'coverImg', 'bbeScore',
       'bbeVotes', 'price', 'Fiction', 'Nonfiction', 'Young Adult',
       'Childrens', 'New Adult', 'Fantasy', 'Erotica', 'History', 'Dystopia',
       'Poetry', 'Biography', 'Manga', 'Thriller', 'Graphic Novels', 'Romance',
       'university', 'relationships', 'break ups', 'divorce', 'weddings',
       'death', 'family', 'friendship', 'labeled? ', 'Contains True?'],
      dtype='object')

In [26]:
%%time
model_data_test['tokens'] = model_data_test['description'].apply(lambda x: preprocess_text(x))

#create objects required for model training

model_data_test['liststring'] = [','.join(map(str, l)) for l in model_data_test['tokens']]
corpus=model_data_test['liststring'].tolist()
vocab=list(set(word_tokenize(" ".join(model_data_test['liststring']))))
vectorizer = CountVectorizer(ngram_range=(1,2),vocabulary=vocab)
X = vectorizer.fit_transform(corpus)
word2id=vectorizer.vocabulary_


CPU times: user 10.6 s, sys: 854 ms, total: 11.5 s
Wall time: 11.5 s


In [27]:
#synsets unigrams only

topic_list = ['universities', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship']

synsets = [['college', 'university', 'campus', 'professor', 'professors'], 
           ['date', 'relationship', 'romance', 'dating'],
           ['breakup', 'dumped', 'dump'], 
           ['divorce', 'divorced'], 
           ['wedding', 'wedding', 'wedding', 'marry', 'wed', 'marry', 'wed'], 
           ['death', 'deceased', 'dying'],
           ['family', 'mother', 'father', 'brother', 'sister', 'mom', 'dad'],
           ['friends', 'friend', 'friendship', 'friendships']]


seed_topics = {}
for t_id, st in enumerate(synsets):
    for word in st:
        seed_topics[word2id[word]] = t_id

In [28]:
document_df_test=get_doc_topics(model,X,8,model_data_test,"liststring")



In [29]:
cols = ['university', 'relationships', 'break ups', 'divorce', 'weddings', 'death', 'family', 'friendship']
for col_name in cols:
    model_data_test[col_name] =  model_data_test[col_name].fillna(False)

model_data_test.columns = ['Unnamed: 0.2', 'Unnamed: 0.1', 'index', 'Unnamed: 0', 'bookId',
       'title', 'series', 'author', 'rating', 'description', 'language',
       'isbn', 'genres', 'characters', 'bookFormat', 'edition', 'pages',
       'publisher', 'publishDate', 'firstPublishDate', 'awards', 'numRatings',
       'ratingsByStars', 'likedPercent', 'setting', 'coverImg', 'bbeScore',
       'bbeVotes', 'price', 'Fiction', 'Nonfiction', 'Young Adult',
       'Childrens', 'New Adult', 'Fantasy', 'Erotica', 'History', 'Dystopia',
       'Poetry', 'Biography', 'Manga', 'Thriller', 'Graphic Novels', 'Romance',
       'university', 'relationships', 'break ups', 'divorce', 'wedding',
       'death', 'family', 'friendship', 'labeled? ', 'Contains True?',
       'tokens', 'liststring']

In [30]:
preds.head()

Unnamed: 0,university,relationships,break ups,divorce,wedding,death,family,friendship
Document 0,False,False,False,False,True,False,False,True
Document 1,False,False,False,False,False,True,True,False
Document 2,False,True,False,False,False,False,False,False
Document 3,False,True,False,False,True,False,False,False
Document 4,False,False,False,False,True,False,False,True


In [31]:
preds.dtypes

university       bool
relationships    bool
break ups        bool
divorce          bool
wedding          bool
death            bool
family           bool
friendship       bool
dtype: object

In [32]:
model_data_test.dtypes

Unnamed: 0.2          int64
Unnamed: 0.1          int64
index                 int64
Unnamed: 0           object
bookId               object
title                object
series               object
author               object
rating               object
description          object
language             object
isbn                 object
genres               object
characters           object
bookFormat           object
edition              object
pages                object
publisher            object
publishDate          object
firstPublishDate     object
awards               object
numRatings           object
ratingsByStars       object
likedPercent        float64
setting              object
coverImg             object
bbeScore             object
bbeVotes             object
price                object
Fiction              object
Nonfiction             bool
Young Adult          object
Childrens            object
New Adult            object
Fantasy              object
Erotica             

In [33]:
model_data_test

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,index,Unnamed: 0,bookId,title,series,author,rating,description,...,break ups,divorce,wedding,death,family,friendship,labeled?,Contains True?,tokens,liststring
0,0,0,0,39822,34838660-not-part-of-the-plan,Not Part of the Plan,Blue Moon #4,Lucy Score (Goodreads Author),4.46,From the Wall Street Journal and #1 Amazon bes...,...,False,False,False,False,False,False,Yes,1.0,"[wall, street, journal, amazon, bestselling, a...","wall,street,journal,amazon,bestselling,author,..."
1,1,1,1,34235,20176552-dragon-age-volume-1,"Dragon Age, Volume 1",Dragon Age Graphic Novels #1-3,"David Gaider, Chad Hardin (Illustrator), Antho...",4.26,Helping set the stage for BioWare's hotly anti...,...,False,False,False,False,False,False,Yes,0.0,"[helping, set, stage, biowares, hotly, anticip...","helping,set,stage,biowares,hotly,anticipated,d..."
2,2,2,2,27904,124110.Dangerous_to_Know,Dangerous to Know,False,Barbara Taylor Bradford (Goodreads Author),3.73,"Sebastian Locke, the fifty-six-year-old patria...",...,False,True,False,True,True,False,Yes,1.0,"[sebastian, locke, fiftysixyearold, patriarch,...","sebastian,locke,fiftysixyearold,patriarch,powe..."
3,3,3,3,10515,1046450.The_Wheel_of_Fortune,The Wheel of Fortune,False,Susan Howatch,4.11,"""Take me back to Oxmoon, the lost paradise of ...",...,False,False,False,False,True,False,Yes,1.0,"[take, back, oxmoon, lost, paradise, childhood...","take,back,oxmoon,lost,paradise,childhood,take,..."
4,4,4,4,935,872333.Blue_Bloods,Blue Bloods,Blue Bloods #1,Melissa de la Cruz (Goodreads Author),3.69,"When the Mayflower set sail in 1620, it carrie...",...,False,False,False,False,False,False,Yes,0.0,"[mayflower, set, sail, carried, board, men, wo...","mayflower,set,sail,carried,board,men,women,wou..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,995,995,17361,588326.The_Blue_Helmet,The Blue Helmet,False,William Bell,3.42,Lee wants to be a Tarantula – a member of the ...,...,False,False,False,False,False,True,False,1.0,"[lee, wants, tarantula, member, biggest, power...","lee,wants,tarantula,member,biggest,powerful,ga..."
996,996,996,996,9029,93007.The_Merry_Adventures_of_Robin_Hood,The Merry Adventures of Robin Hood,False,Howard Pyle,4.07,The Merry Adventures of Robin Hood of Great Re...,...,False,False,False,False,False,False,False,0.0,"[merry, adventures, robin, hood, great, renown...","merry,adventures,robin,hood,great,renown,notti..."
997,997,997,997,32216,1085376.Before_You_Sleep,Before You Sleep,False,"Linn Ullmann, Tiina Nunnally (Translator)",3.34,Moving from present-day Oslo to Brooklyn in th...,...,False,False,False,False,True,False,False,1.0,"[moving, presentday, oslo, brooklyn, sleep, te...","moving,presentday,oslo,brooklyn,sleep,tells,st..."
998,998,998,998,1036,28195.Inkspell,Inkspell,Inkworld #2,"Cornelia Funke (Goodreads Author), Anthea Bell...",3.91,"The captivating sequel to INKHEART, the critic...",...,False,False,False,False,False,False,False,0.0,"[captivating, sequel, inkheart, critically, ac...","captivating,sequel,inkheart,critically,acclaim..."


## Next Step: Chart of seed_confidence to accuracy

In [34]:
%%time

alphas = [0.01, 0.1, 1]
seed_confidences = [0.05, 0.25, 0.5]

results = {}

for alpha in alphas:
    for seed_confidence in seed_confidences:
        
        print('Alpha:', str(alpha), '& Seed Confidence', str(seed_confidence))
        
        #define model
        model = guidedlda.GuidedLDA(n_topics=8, n_iter=2000, random_state=7, refresh=20,alpha=alpha,eta=0.01)
        
        #fit model
        print('fitting model')
        model.fit(X, seed_topics=seed_topics, seed_confidence=seed_confidence)
        print('model fitted')
        print('')
        
        #get topics
        print('getting topics')
        document_df_test=get_doc_topics(model,X,8,model_data_test,"liststring")
        topic_list = ['university', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship', 'Dominant_Topic', 'Topic_Probability']
        document_df_test.columns = topic_list
        print('got topics')
        
        #convert preds to 
        preds = document_df_test[['university', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship']]
        print(preds.head())
        model_data_test_reduced = model_data_test[['university', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship']]
        print(model_data_test_reduced.head())
        cols = ['university', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship']

        #turn preds into True/False
        for col_name in cols:
            preds[col_name] =  preds[col_name] > 20
            
        #create dict key
        #key_name = 'alpha_' + str(alpha) + '_seedconf_' + str(seed_confidence)
        
        y_true = np.array(preds.values.tolist())
        y_pred = np.array(model_data_test_reduced.values.tolist())
        
        print(y_true)
        print(y_pred)

        print(classification_report(
            model_data_test_reduced,
            preds,
            #output_dict=True,
            target_names=['university', 'relationships', 'break ups', 'divorce', 'wedding', 'death', 'family', 'friendship']))
        


INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


Alpha: 0.01 & Seed Confidence 0.05
fitting model


INFO:lda:<0> log likelihood: -983615
INFO:lda:<20> log likelihood: -793707
INFO:lda:<40> log likelihood: -785034
INFO:lda:<60> log likelihood: -780247
INFO:lda:<80> log likelihood: -776246
INFO:lda:<100> log likelihood: -773792
INFO:lda:<120> log likelihood: -771742
INFO:lda:<140> log likelihood: -769756
INFO:lda:<160> log likelihood: -767657
INFO:lda:<180> log likelihood: -766438
INFO:lda:<200> log likelihood: -764720
INFO:lda:<220> log likelihood: -763855
INFO:lda:<240> log likelihood: -763008
INFO:lda:<260> log likelihood: -762122
INFO:lda:<280> log likelihood: -760968
INFO:lda:<300> log likelihood: -760605
INFO:lda:<320> log likelihood: -759208
INFO:lda:<340> log likelihood: -758494
INFO:lda:<360> log likelihood: -757919
INFO:lda:<380> log likelihood: -756862
INFO:lda:<400> log likelihood: -756564
INFO:lda:<420> log likelihood: -756495
INFO:lda:<440> log likelihood: -755516
INFO:lda:<460> log likelihood: -754917
INFO:lda:<480> log likelihood: -755249
INFO:lda:<500> log likelihood: 

model fitted

getting topics


INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
            university  relationships  break ups  divorce  wedding  death  \
Document 0         0.0            0.0        0.0      0.0      0.0    0.0   
Document 1         0.0           64.8        0.0      0.0      0.0   26.9   
Document 2        45.4            0.0        0.0      0.0      0.0    7.4   
Document 3        45.3            9.3        0.0      0.0      0.0   10.5   
Document 4         0.0            0.0        0.0      0.0     22.6   20.0   

            family  friendship  
Document 0     9.2        90.8  
Document 1     8.2         0.0  
Document 2     0.0        47.1  
Document 3     0.0        34.9  
Document 4     1.9        55.5  
   university  relationships  break ups  divorce  wedding  death  family  \
0       False           True      False    False    False  False   False   
1       False          False      False    False    False  False   False   
2       False           True      False     True    False   True    True   
3       False           

INFO:lda:<0> log likelihood: -983375
INFO:lda:<20> log likelihood: -793657
INFO:lda:<40> log likelihood: -785060
INFO:lda:<60> log likelihood: -779657
INFO:lda:<80> log likelihood: -776256
INFO:lda:<100> log likelihood: -773296
INFO:lda:<120> log likelihood: -771352
INFO:lda:<140> log likelihood: -769609
INFO:lda:<160> log likelihood: -767987
INFO:lda:<180> log likelihood: -767230
INFO:lda:<200> log likelihood: -766066
INFO:lda:<220> log likelihood: -765234
INFO:lda:<240> log likelihood: -763459
INFO:lda:<260> log likelihood: -762580
INFO:lda:<280> log likelihood: -762078
INFO:lda:<300> log likelihood: -761516
INFO:lda:<320> log likelihood: -760473
INFO:lda:<340> log likelihood: -760326
INFO:lda:<360> log likelihood: -759431
INFO:lda:<380> log likelihood: -758664
INFO:lda:<400> log likelihood: -757978
INFO:lda:<420> log likelihood: -757774
INFO:lda:<440> log likelihood: -756986
INFO:lda:<460> log likelihood: -756844
INFO:lda:<480> log likelihood: -756487
INFO:lda:<500> log likelihood: 

model fitted

getting topics


INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
            university  relationships  break ups  divorce  wedding  death  \
Document 0         0.0            0.0        0.0      0.0     10.8    0.5   
Document 1         5.4           42.4        0.0      0.0     52.1    0.0   
Document 2         0.0            0.0        0.0      1.9      0.0    0.0   
Document 3         0.0           26.9        0.0      0.0      0.0    0.0   
Document 4         0.0            0.0       19.2     21.0      0.0    2.1   

            family  friendship  
Document 0     0.0        88.6  
Document 1     0.0         0.0  
Document 2    39.1        58.9  
Document 3    23.2        49.9  
Document 4     0.0        57.8  
   university  relationships  break ups  divorce  wedding  death  family  \
0       False           True      False    False    False  False   False   
1       False          False      False    False    False  False   False   
2       False           True      False     True    False   True    True   
3       False           

INFO:lda:<0> log likelihood: -982799
INFO:lda:<20> log likelihood: -794743
INFO:lda:<40> log likelihood: -785957
INFO:lda:<60> log likelihood: -781320
INFO:lda:<80> log likelihood: -777952
INFO:lda:<100> log likelihood: -774371
INFO:lda:<120> log likelihood: -772468
INFO:lda:<140> log likelihood: -770548
INFO:lda:<160> log likelihood: -769525
INFO:lda:<180> log likelihood: -768238
INFO:lda:<200> log likelihood: -767083
INFO:lda:<220> log likelihood: -765836
INFO:lda:<240> log likelihood: -765019
INFO:lda:<260> log likelihood: -764135
INFO:lda:<280> log likelihood: -763369
INFO:lda:<300> log likelihood: -763248
INFO:lda:<320> log likelihood: -762462
INFO:lda:<340> log likelihood: -761925
INFO:lda:<360> log likelihood: -761222
INFO:lda:<380> log likelihood: -760468
INFO:lda:<400> log likelihood: -758796
INFO:lda:<420> log likelihood: -758175
INFO:lda:<440> log likelihood: -757780
INFO:lda:<460> log likelihood: -756803
INFO:lda:<480> log likelihood: -757047
INFO:lda:<500> log likelihood: 

model fitted

getting topics


INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
            university  relationships  break ups  divorce  wedding  death  \
Document 0         0.0            0.0        0.0      0.0      8.9    0.0   
Document 1         0.0            0.0        0.0     56.4      0.0   31.0   
Document 2         0.0           27.1        0.0      1.2      9.8    4.2   
Document 3         0.0            0.0        0.0      0.0     22.3    0.0   
Document 4        26.3            0.0        0.0      0.0     12.4    4.4   

            family  friendship  
Document 0     3.8        87.3  
Document 1     0.1        12.5  
Document 2    11.3        46.3  
Document 3    30.5        47.2  
Document 4     0.5        56.3  
   university  relationships  break ups  divorce  wedding  death  family  \
0       False           True      False    False    False  False   False   
1       False          False      False    False    False  False   False   
2       False           True      False     True    False   True    True   
3       False           

INFO:lda:<0> log likelihood: -968773
INFO:lda:<20> log likelihood: -785538
INFO:lda:<40> log likelihood: -777495
INFO:lda:<60> log likelihood: -771814
INFO:lda:<80> log likelihood: -769208
INFO:lda:<100> log likelihood: -766567
INFO:lda:<120> log likelihood: -765141
INFO:lda:<140> log likelihood: -762763
INFO:lda:<160> log likelihood: -761691
INFO:lda:<180> log likelihood: -760453
INFO:lda:<200> log likelihood: -759800
INFO:lda:<220> log likelihood: -758554
INFO:lda:<240> log likelihood: -758225
INFO:lda:<260> log likelihood: -757766
INFO:lda:<280> log likelihood: -757012
INFO:lda:<300> log likelihood: -757065
INFO:lda:<320> log likelihood: -756165
INFO:lda:<340> log likelihood: -755218
INFO:lda:<360> log likelihood: -754739
INFO:lda:<380> log likelihood: -754975
INFO:lda:<400> log likelihood: -754616
INFO:lda:<420> log likelihood: -754748
INFO:lda:<440> log likelihood: -754089
INFO:lda:<460> log likelihood: -753541
INFO:lda:<480> log likelihood: -753812
INFO:lda:<500> log likelihood: 

model fitted

getting topics


INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
            university  relationships  break ups  divorce  wedding  death  \
Document 0         0.0            0.1        0.0      0.0     14.6    0.0   
Document 1         0.0           93.5        0.1      0.1      0.0    0.3   
Document 2        34.4            0.3       20.2     11.6      0.0    0.0   
Document 3        36.7            0.1       12.3      8.4      9.3    8.6   
Document 4        31.3            1.1        0.2      0.1      0.0   19.3   

            family  friendship  
Document 0     0.1        85.1  
Document 1     0.0         5.9  
Document 2     7.8        25.8  
Document 3     8.4        16.1  
Document 4     4.8        43.2  
   university  relationships  break ups  divorce  wedding  death  family  \
0       False           True      False    False    False  False   False   
1       False          False      False    False    False  False   False   
2       False           True      False     True    False   True    True   
3       False           

INFO:lda:<0> log likelihood: -968534
INFO:lda:<20> log likelihood: -786274
INFO:lda:<40> log likelihood: -776898
INFO:lda:<60> log likelihood: -772285
INFO:lda:<80> log likelihood: -769183
INFO:lda:<100> log likelihood: -766044
INFO:lda:<120> log likelihood: -764891
INFO:lda:<140> log likelihood: -763540
INFO:lda:<160> log likelihood: -762024
INFO:lda:<180> log likelihood: -761326
INFO:lda:<200> log likelihood: -760301
INFO:lda:<220> log likelihood: -759300
INFO:lda:<240> log likelihood: -757921
INFO:lda:<260> log likelihood: -757831
INFO:lda:<280> log likelihood: -756978
INFO:lda:<300> log likelihood: -757124
INFO:lda:<320> log likelihood: -756114
INFO:lda:<340> log likelihood: -755274
INFO:lda:<360> log likelihood: -755054
INFO:lda:<380> log likelihood: -754468
INFO:lda:<400> log likelihood: -754505
INFO:lda:<420> log likelihood: -754263
INFO:lda:<440> log likelihood: -753611
INFO:lda:<460> log likelihood: -753517
INFO:lda:<480> log likelihood: -753951
INFO:lda:<500> log likelihood: 

model fitted

getting topics


INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
            university  relationships  break ups  divorce  wedding  death  \
Document 0        17.9            1.1        0.0      4.2      0.0    1.4   
Document 1         4.1           68.1        0.0      2.2      0.0   25.2   
Document 2        11.2            4.3       17.1      3.0      0.3    0.7   
Document 3        48.1            3.7        0.0      0.1      0.0    4.5   
Document 4        27.7            0.4        0.1     17.6      7.1   12.4   

            family  friendship  
Document 0     0.0        75.3  
Document 1     0.2         0.1  
Document 2    39.4        24.1  
Document 3    20.7        22.8  
Document 4     0.0        34.7  
   university  relationships  break ups  divorce  wedding  death  family  \
0       False           True      False    False    False  False   False   
1       False          False      False    False    False  False   False   
2       False           True      False     True    False   True    True   
3       False           

INFO:lda:<0> log likelihood: -967960
INFO:lda:<20> log likelihood: -785184
INFO:lda:<40> log likelihood: -777044
INFO:lda:<60> log likelihood: -772031
INFO:lda:<80> log likelihood: -768956
INFO:lda:<100> log likelihood: -766882
INFO:lda:<120> log likelihood: -765772
INFO:lda:<140> log likelihood: -764276
INFO:lda:<160> log likelihood: -763815
INFO:lda:<180> log likelihood: -762387
INFO:lda:<200> log likelihood: -761154
INFO:lda:<220> log likelihood: -760860
INFO:lda:<240> log likelihood: -760575
INFO:lda:<260> log likelihood: -759764
INFO:lda:<280> log likelihood: -759282
INFO:lda:<300> log likelihood: -759243
INFO:lda:<320> log likelihood: -758048
INFO:lda:<340> log likelihood: -758145
INFO:lda:<360> log likelihood: -757511
INFO:lda:<380> log likelihood: -757734
INFO:lda:<400> log likelihood: -757150
INFO:lda:<420> log likelihood: -757272
INFO:lda:<440> log likelihood: -756643
INFO:lda:<460> log likelihood: -756514
INFO:lda:<480> log likelihood: -756633
INFO:lda:<500> log likelihood: 

model fitted

getting topics


INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
            university  relationships  break ups  divorce  wedding  death  \
Document 0         0.0            8.0        6.2      0.0      0.0    7.5   
Document 1         1.4           51.1        0.1     11.7     19.6    3.4   
Document 2        34.8            7.2       19.2      3.2      0.1    0.2   
Document 3         9.8            1.8        0.1      0.0      8.9   51.0   
Document 4        10.7            0.3        0.1     19.4     12.9   21.4   

            family  friendship  
Document 0     0.0        78.2  
Document 1    12.5         0.2  
Document 2     9.4        25.9  
Document 3     8.1        20.3  
Document 4     3.4        31.8  
   university  relationships  break ups  divorce  wedding  death  family  \
0       False           True      False    False    False  False   False   
1       False          False      False    False    False  False   False   
2       False           True      False     True    False   True    True   
3       False           

INFO:lda:<0> log likelihood: -957831
INFO:lda:<20> log likelihood: -779970
INFO:lda:<40> log likelihood: -771762
INFO:lda:<60> log likelihood: -768099
INFO:lda:<80> log likelihood: -766910
INFO:lda:<100> log likelihood: -764740
INFO:lda:<120> log likelihood: -763130
INFO:lda:<140> log likelihood: -762197
INFO:lda:<160> log likelihood: -761058
INFO:lda:<180> log likelihood: -759951
INFO:lda:<200> log likelihood: -759421
INFO:lda:<220> log likelihood: -759702
INFO:lda:<240> log likelihood: -758695
INFO:lda:<260> log likelihood: -758867
INFO:lda:<280> log likelihood: -758618
INFO:lda:<300> log likelihood: -759121
INFO:lda:<320> log likelihood: -758488
INFO:lda:<340> log likelihood: -758374
INFO:lda:<360> log likelihood: -758371
INFO:lda:<380> log likelihood: -757754
INFO:lda:<400> log likelihood: -757133
INFO:lda:<420> log likelihood: -757544
INFO:lda:<440> log likelihood: -757294
INFO:lda:<460> log likelihood: -757368
INFO:lda:<480> log likelihood: -757353
INFO:lda:<500> log likelihood: 

model fitted

getting topics


INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
            university  relationships  break ups  divorce  wedding  death  \
Document 0         1.5            9.7       15.0      4.1      1.9    7.2   
Document 1         1.4            1.9        6.7      0.2     20.6   14.1   
Document 2        17.4           41.5        5.6     13.5      1.4    5.8   
Document 3         1.8           34.7       10.7     20.0      5.2   12.5   
Document 4        22.0           10.5        9.3     14.0     12.6    4.6   

            family  friendship  
Document 0     0.1        60.5  
Document 1    54.9         0.1  
Document 2     6.9         7.8  
Document 3     4.3        10.8  
Document 4     2.6        24.4  
   university  relationships  break ups  divorce  wedding  death  family  \
0       False           True      False    False    False  False   False   
1       False          False      False    False    False  False   False   
2       False           True      False     True    False   True    True   
3       False           

INFO:lda:<0> log likelihood: -957595
INFO:lda:<20> log likelihood: -779330
INFO:lda:<40> log likelihood: -772187
INFO:lda:<60> log likelihood: -768329
INFO:lda:<80> log likelihood: -766097
INFO:lda:<100> log likelihood: -764756
INFO:lda:<120> log likelihood: -763592
INFO:lda:<140> log likelihood: -762207
INFO:lda:<160> log likelihood: -761707
INFO:lda:<180> log likelihood: -761575
INFO:lda:<200> log likelihood: -760737
INFO:lda:<220> log likelihood: -759994
INFO:lda:<240> log likelihood: -759173
INFO:lda:<260> log likelihood: -759138
INFO:lda:<280> log likelihood: -758753
INFO:lda:<300> log likelihood: -758834
INFO:lda:<320> log likelihood: -758374
INFO:lda:<340> log likelihood: -758565
INFO:lda:<360> log likelihood: -758021
INFO:lda:<380> log likelihood: -757849
INFO:lda:<400> log likelihood: -757549
INFO:lda:<420> log likelihood: -758032
INFO:lda:<440> log likelihood: -756986
INFO:lda:<460> log likelihood: -757209
INFO:lda:<480> log likelihood: -757502
INFO:lda:<500> log likelihood: 

model fitted

getting topics


INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 18481
INFO:lda:n_words: 77613
INFO:lda:n_topics: 8
INFO:lda:n_iter: 2000


got topics
            university  relationships  break ups  divorce  wedding  death  \
Document 0        13.7            0.1       16.7      5.3      1.4    1.1   
Document 1         1.0           59.0        8.7      2.1      0.2   22.3   
Document 2        27.9           15.2        6.6      0.4     12.7    7.2   
Document 3        11.7           15.8        5.2     14.7      1.5    5.4   
Document 4        22.2            5.2        5.0     30.4     10.9    9.8   

            family  friendship  
Document 0     3.9        57.7  
Document 1     6.5         0.3  
Document 2    15.7        14.3  
Document 3    34.6        11.1  
Document 4     0.3        16.2  
   university  relationships  break ups  divorce  wedding  death  family  \
0       False           True      False    False    False  False   False   
1       False          False      False    False    False  False   False   
2       False           True      False     True    False   True    True   
3       False           

INFO:lda:<0> log likelihood: -957028
INFO:lda:<20> log likelihood: -779344
INFO:lda:<40> log likelihood: -772082
INFO:lda:<60> log likelihood: -768465
INFO:lda:<80> log likelihood: -766674
INFO:lda:<100> log likelihood: -764705
INFO:lda:<120> log likelihood: -763671
INFO:lda:<140> log likelihood: -762781
INFO:lda:<160> log likelihood: -762185
INFO:lda:<180> log likelihood: -761617
INFO:lda:<200> log likelihood: -760397
INFO:lda:<220> log likelihood: -760324
INFO:lda:<240> log likelihood: -759195
INFO:lda:<260> log likelihood: -759686
INFO:lda:<280> log likelihood: -759167
INFO:lda:<300> log likelihood: -759578
INFO:lda:<320> log likelihood: -758905
INFO:lda:<340> log likelihood: -758938
INFO:lda:<360> log likelihood: -758692
INFO:lda:<380> log likelihood: -758500
INFO:lda:<400> log likelihood: -758092
INFO:lda:<420> log likelihood: -758239
INFO:lda:<440> log likelihood: -757081
INFO:lda:<460> log likelihood: -757584
INFO:lda:<480> log likelihood: -757092
INFO:lda:<500> log likelihood: 

model fitted

getting topics




got topics
            university  relationships  break ups  divorce  wedding  death  \
Document 0        10.3            0.3        5.5      7.7     14.0    0.1   
Document 1         2.1           59.6       23.1      0.3      9.6    1.4   
Document 2        36.6            4.8        2.7     13.5     13.9    5.8   
Document 3        36.3           13.7        5.6     11.0      8.6    1.6   
Document 4        30.8            3.5       15.0      1.0     25.4    5.8   

            family  friendship  
Document 0     5.9        56.1  
Document 1     3.7         0.2  
Document 2    16.8         6.0  
Document 3    20.6         2.6  
Document 4     1.7        16.8  
   university  relationships  break ups  divorce  wedding  death  family  \
0       False           True      False    False    False  False   False   
1       False          False      False    False    False  False   False   
2       False           True      False     True    False   True    True   
3       False           

## Plot accuracies

In [35]:
#make df
alpha_dict = {'alpha': [0.01, 0.1, 1], 'macro_avg_precision': [0.07, 0.09, 0.09]}
df_alpha = pd.DataFrame(data=alpha_dict)

#plot df
df_alpha.plot.line(x='alpha', y='macro_avg_precision', ylim=(0,1), ylabel='Micro Average Precision', xlabel='Alpha', title='Micro Average Precision by Alpha', legend=False)

KeyError: 'micro_avg_precision'

In [None]:
#make df
seed_confidence_dict = {'seed_confidence': [0.05, 0.25, 0.5], 'micro_avg_precision': [0.07, 0.08, 0.09]}
df_seed_confidence = pd.DataFrame(data=seed_confidence_dict)

#plot df
df_seed_confidence.plot.line(x='seed_confidence', y='micro_avg_precision', ylim=(0,1), ylabel='Micro Average Precision', xlabel='Seed Confidence', title='Micro Average Precision by Alpha', legend=False)