# <center>Implementing LDA in Python</center>

<center>Dr. W.J.B. Mattingly</center>

<center>Smithsonian Data Science Lab and United States Holocaust Memorial Museum</center>

<center>February 2021</center>

## Key Concepts in this Notebook

## Introduction

## Importing the Required Libraries

In [27]:
#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#1introduction
import numpy as np
import json
import glob
import os

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords


#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Preparing the Data

In [3]:
def load_data(file):
    """Return the entire contents of the UTF-8 text file at ``file`` as one string."""
    with open(file, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def write_data(file, data):
    """Serialize ``data`` as JSON (4-space indent) into ``file``, overwriting it as UTF-8."""
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)


In [4]:
# Build the English stop-word list. The result deliberately reuses the name
# `stopwords`, replacing the imported NLTK corpus reader with a plain list.
# Guard the call so the cell can be re-run: once `stopwords` is a list it has
# no `.words` method and calling it again would raise AttributeError.
if not isinstance(stopwords, list):
    stopwords = stopwords.words("english")

In [5]:
# Show the stop-word list.
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
# Load every interview transcript in the corpus directory into memory,
# skipping hidden files (names starting with ".").
# os.listdir() returns names in arbitrary order, so sort them: document
# positions such as data[0] and corpus[-1] are used later in the notebook and
# must be stable across runs and machines.
corpus_dir = "elohp-interview-text"
text_list = sorted(f for f in os.listdir(corpus_dir) if not f.startswith('.'))
data = [load_data(os.path.join(corpus_dir, f)) for f in text_list]

In [7]:
# Preview the first 900 characters of the first transcript.
print(data[0][:900])

Long:  This oral history interview is part of the Eugene Lesbian Oral 
History Project. The recordings will be made available through the 
University of Oregon Libraries’  Special Collections and University 
Archives. This is an oral history interview with Kate Barry  on July 
30, 2019, taking place in the University of Oregon Libraries’ 
recording studio in the Center for Media and Educational 
Technologies. The interviewers are Linda Long, Curator of 
Manuscripts in the UO  Library's Special Collections and University 
Archives, and Professor Judith Raiskin, of the UO  Department of 
Women's Gender and Sexuality Studies. Kate, please let us know if 
you agree to be recorded  for this project and that you give your 
permission for the university to preserve and make available your 
recorded and transcribed interview.  
Barry:  I do.   
Long:  All right, thank you very much. Let's just s


Install the necessary spaCy model, `en_core_web_sm` (uncomment and run the next cell once):

In [8]:
#!python3 -m spacy download en_core_web_sm

The next cell takes a few minutes to run.

In [9]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Reduce each text to a space-joined string of lemmas.

    Only tokens whose spaCy part-of-speech tag (``token.pos_``) is in
    ``allowed_postags`` are kept; each kept token contributes its
    ``token.lemma_``.

    Args:
        texts: iterable of raw text strings, one per document.
        allowed_postags: part-of-speech tags to keep.

    Returns:
        A list with one lemmatized string per input text, in input order.
    """
    # The "en_core_web_sm" pipeline is loaded with the parser and NER
    # components disabled.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    return [
        " ".join(
            token.lemma_ for token in nlp(text) if token.pos_ in allowed_postags
        )
        for text in texts
    ]

lemmatized_texts = lemmatization(data)

In [12]:
# Preview the first 200 characters of the first lemmatized transcript.
print(lemmatized_texts[0][:200])

long oral history interview part oral recording make available oral history interview take place recording studio educational interviewer let know agree record project give permission university prese


In [13]:
def gen_words(texts):
    """Tokenize each text into a list of word tokens.

    Each text is passed through ``gensim.utils.simple_preprocess`` with
    ``deacc=True``.

    Args:
        texts: iterable of text strings, one per document.

    Returns:
        A list of token lists, one per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]

data_words = gen_words(lemmatized_texts)

In [26]:
# Preview the first 200 tokens of the first document.
print(data_words[0][:200])

['long', 'oral', 'history', 'interview', 'part', 'oral', 'recording', 'make', 'available', 'oral', 'history', 'interview', 'take', 'place', 'recording', 'studio', 'educational', 'interviewer', 'let', 'know', 'agree', 'record', 'project', 'give', 'permission', 'university', 'preserve', 'make', 'available', 'record', 'transcribed', 'interview', 'long', 'all', 'right', 'thank', 'very', 'much', 'let', 'just', 'start', 'basic', 'question', 'tell', 'bear', 'grow', 'early', 'background', 'bear', 'little', 'mining', 'town', 'call', 'northeast', 'coal', 'country', 'grow', 'go', 'school', 'there', 'parent', 'dad', 'miner', 'grandfather', 'miner', 'invalid', 'mine', 'black', 'lung', 'so', 'grandmother', 'paternal', 'grandmother', 'determined', 'son', 'go', 'mine', 'so', 'train', 'electrician', 'course', 'only', 'work', 'mine', 'so', 'end', 'anyway', 'then', 'once', 'electricity', 'nationalize', 'then', 'job', 'above', 'ground', 'so', 'father', 'work', 'lineman', 'foreman', 'electrician', 'ground'

In [17]:
#BIGRAMS AND TRIGRAMS
# Train the phrase detectors. The trigram model is trained on the output of
# the bigram model, so it can join an already-merged pair with a neighbour.
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=150)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=150)

# Wrap the trained models in Phraser objects, which are what gets applied to documents.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the bigram model to each tokenized document in ``texts``.

    Args:
        texts: list of token lists (e.g. ``data_words``).

    Returns:
        A list of token lists after applying ``bigram``.
    """
    return [bigram[doc] for doc in texts]

def make_trigrams(texts):
    """Apply the bigram model and then the trigram model to each document.

    Args:
        texts: list of *unmerged* token lists (e.g. ``data_words``); the
            bigram step is performed here, so the input should not already
            be bigram-merged.

    Returns:
        A list of token lists after applying ``bigram`` then ``trigram``.
    """
    return [trigram[bigram[doc]] for doc in texts]

data_bigrams = make_bigrams(data_words)
# make_trigrams() applies the bigram model itself, so feed it the raw token
# lists. Passing data_bigrams (as before) ran the bigram model a second time
# over already-merged tokens.
data_bigrams_trigrams = make_trigrams(data_words)

In [18]:
# Preview the first 100 tokens of the first document after phrase merging.
print(data_bigrams_trigrams[0][:100])

['long', 'oral_history_interview', 'part', 'oral_recording', 'make', 'available_oral_history', 'interview', 'take', 'place', 'recording_studio_educational_interviewer', 'let', 'know', 'agree_record_project', 'give_permission_university_preserve', 'make', 'available_record_transcribed_interview', 'long', 'all', 'right', 'thank', 'very', 'much', 'let', 'just', 'start', 'basic_question', 'tell', 'bear', 'grow', 'early', 'background', 'bear', 'little', 'mining', 'town', 'call', 'northeast', 'coal', 'country', 'grow', 'go', 'school', 'there', 'parent', 'dad', 'miner', 'grandfather', 'miner', 'invalid', 'mine', 'black', 'lung', 'so', 'grandmother', 'paternal', 'grandmother', 'determined', 'son', 'go', 'mine', 'so', 'train', 'electrician', 'course', 'only', 'work', 'mine', 'so', 'end', 'anyway', 'then', 'once', 'electricity', 'nationalize', 'then', 'job', 'above', 'ground', 'so', 'father', 'work', 'lineman', 'foreman', 'electrician', 'ground', 'raiskin', 'year', 'bear', 'year', 'bear', 'bear'

In [19]:
#TF-IDF REMOVAL
from gensim.models import TfidfModel

# Build the vocabulary and the bag-of-words corpus from the phrase-merged documents.
id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.04  # threshold: terms whose TF-IDF score is below this are dropped

words = []  # surface form of every term dropped, accumulated across all documents
words_missing_in_tfidf = []
for i, bow in enumerate(corpus):
    doc_tfidf = tfidf[bow]  # evaluated once per document
    tfidf_ids = {token_id for token_id, _ in doc_tfidf}

    low_value_words = [token_id for token_id, score in doc_tfidf if score < low_value]
    # Terms present in the bag-of-words but absent from the TF-IDF output
    # (the words with a TF-IDF score of 0 will be missing).
    # This must be computed BEFORE building `drops`; previously it was
    # computed afterwards, so `drops` used the previous document's list.
    words_missing_in_tfidf = [token_id for token_id, _ in bow if token_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[token_id] for token_id in drops)

    # Replace this document's bag-of-words with the filtered version.
    drop_ids = set(drops)
    corpus[i] = [pair for pair in bow if pair[0] not in drop_ids]


In [20]:
# id2word = corpora.Dictionary(data_words)

# corpus = []
# for text in data_words:
#     new = id2word.doc2bow(text)
#     corpus.append(new)

# print (corpus[0][0:20])

# word = id2word[[0][:1][0]]
# print (word)

In [30]:
# Train a 25-topic LDA model on the filtered bag-of-words corpus.
# random_state is fixed so repeated runs give the same topics.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=25,
    passes=70,
    chunksize=100,
    update_every=1,
    alpha="auto",
    random_state=100,
)


https://neptune.ai/blog/pyldavis-topic-modelling-exploration-tool-that-every-nlp-data-scientist-should-know

## Visualizing the Data

In [31]:
# Build the pyLDAvis view of the trained model and display it as the cell output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=30,
)
vis

  default_term_info = default_term_info.sort_values(


For instructions on making a Binder notebook, see: https://www.youtube.com/watch?v=owSGVOov9pQ

In [139]:
# Spot-check the first document at each stage: filtered bag-of-words pairs,
# one dictionary entry, the raw text and the lemmatized text.
print(corpus[0][:100])
print(id2word[2])
print(data[0][1000:2000])
print(lemmatized_texts[0][1000:2000])

[(3, 2), (9, 6), (16, 2), (20, 4), (26, 1), (27, 4), (28, 3), (34, 2), (38, 1), (42, 3), (61, 1), (65, 3), (72, 2), (76, 1), (86, 2), (95, 3), (102, 2), (112, 3), (118, 8), (119, 3), (132, 1), (134, 2), (140, 1), (141, 3), (144, 2), (153, 1), (154, 12), (156, 1), (161, 1), (164, 3), (167, 2), (169, 3), (170, 2), (194, 10), (203, 2), (215, 24), (218, 2), (220, 3), (225, 4), (234, 2), (237, 1), (245, 2), (252, 2), (255, 1), (258, 6), (266, 1), (267, 1), (272, 2), (275, 2), (277, 1), (280, 8), (285, 1), (286, 4), (288, 3), (307, 5), (320, 4), (322, 6), (323, 11), (325, 1), (327, 7), (329, 1), (333, 2), (334, 1), (336, 7), (345, 4), (348, 1), (349, 4), (353, 2), (354, 5), (358, 2), (361, 5), (362, 2), (363, 1), (368, 4), (373, 5), (378, 3), (380, 2), (394, 14), (395, 4), (397, 2), (399, 4), (401, 2), (410, 4), (413, 2), (414, 4), (418, 2), (419, 2), (423, 6), (424, 2), (433, 12), (434, 2), (441, 1), (444, 4), (445, 2), (448, 4), (452, 6), (458, 8), (462, 2), (467, 2), (475, 22)]
able
ect. 

Saving Models

https://www.youtube.com/watch?v=xADAr8pPQMI&list=PL2VXyKi-KpYttggRATQVmgFcQst3z6OlX&index=12

In [136]:
# Retrain with 20 topics on every document except the last one, which is held
# out (corpus[:-1]) so it can be scored as unseen data.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus[:-1],
    id2word=id2word,
    num_topics=20,
    passes=30,
    chunksize=100,
    update_every=1,
    alpha="auto",
    random_state=100,
)


In [132]:
# Score the last document of the corpus with the current model; the result is
# a list of (topic_id, weight) pairs.
test_doc = corpus[-1]
vector = lda_model[test_doc]
print(vector)

def Sort(sub_li):
    """Return the items of ``sub_li`` ordered by their second element, largest first.

    The input is left untouched and a new list is returned. The previous
    implementation sorted ``sub_li`` in place, so the caller's original list
    and the returned list were the same (reordered) object.

    Args:
        sub_li: iterable of indexable items (e.g. ``(topic_id, weight)``
            pairs); ordering uses ``item[1]``.

    Returns:
        A new list sorted by ``item[1]`` in descending order.
    """
    ascending = sorted(sub_li, key=lambda pair: pair[1])
    # Reversing the stable ascending sort (rather than using reverse=True)
    # keeps the relative order of tied items identical to the former
    # sort-then-reverse implementation.
    return ascending[::-1]
# Rank the document's topics from highest to lowest weight.
new_vector = Sort(vector)
print(new_vector)

[(0, 0.23957382), (1, 0.114857174), (2, 0.068196125), (3, 0.012150145), (4, 0.22865918), (5, 0.036370933), (7, 0.024131227), (8, 0.035303768), (9, 0.046170343), (10, 0.037606623), (11, 0.01454134), (12, 0.049493156), (15, 0.017285148), (16, 0.019996101), (18, 0.055496957)]
[(0, 0.23957382), (4, 0.22865918), (1, 0.114857174), (2, 0.068196125), (18, 0.055496957), (12, 0.049493156), (9, 0.046170343), (10, 0.037606623), (5, 0.036370933), (8, 0.035303768), (7, 0.024131227), (16, 0.019996101), (15, 0.017285148), (11, 0.01454134), (3, 0.012150145)]


In [133]:
# Persist the trained model to disk. Create the target directory first so the
# save does not fail on a fresh checkout where "models/" does not exist yet.
os.makedirs("models", exist_ok=True)
lda_model.save("models/test_model.model")

In [134]:
# Load the model saved at "models/test_model.model" into a new object.
new_model = gensim.models.ldamodel.LdaModel.load(
    "models/test_model.model"
)

In [135]:
# Score the last document of the corpus with the reloaded model, so the output
# can be compared against the scores from the model before it was saved.
test_doc = corpus[-1]

vector = new_model[test_doc]
print(vector)

# Rank topics from highest to lowest weight into a new list, leaving `vector`
# unchanged. This replaces a second, identical definition of Sort() that
# shadowed the earlier one. Reversing the stable ascending sort keeps the
# order of tied weights the same as the former sort-then-reverse code.
new_vector = sorted(vector, key=lambda pair: pair[1])[::-1]
print(new_vector)

[(0, 0.2395575), (1, 0.11485863), (2, 0.068188906), (3, 0.0121496385), (4, 0.22869575), (5, 0.036369193), (7, 0.024140326), (8, 0.035302002), (9, 0.046137094), (10, 0.037605748), (11, 0.014545072), (12, 0.049502864), (15, 0.017285211), (16, 0.020000387), (18, 0.055493727)]
[(0, 0.2395575), (4, 0.22869575), (1, 0.11485863), (2, 0.068188906), (18, 0.055493727), (12, 0.049502864), (9, 0.046137094), (10, 0.037605748), (5, 0.036369193), (8, 0.035302002), (7, 0.024140326), (16, 0.020000387), (15, 0.017285211), (11, 0.014545072), (3, 0.0121496385)]
