In [13]:
# download necessary libraries and packages for our topic modeling algorithm
%pip install nltk -U
%pip install spacy -U
%pip install gensim
%pip install pyldavis
%pip install gutenbergpy

import os
import nltk
import re
import string
import gensim
import numpy as np

# for cleaning prefatory matter from Project Gutenberg texts
from gutenbergpy import textget

# for tokenization
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download('wordnet')

# for stopword removal
from nltk.corpus import stopwords
nltk.download('stopwords')

# for lemmatization and POS tagging
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')


# for LDA
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# for LDA evaluation
import pyLDAvis
import pyLDAvis.gensim_models as gensimvisualize

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Cont

[nltk_data] Downloading package punkt to /Users/amandalu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/amandalu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amandalu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amandalu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/amandalu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/amandalu/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [2]:
# change this to include whoever the interviewers are (to remove their lines in transcript)
interviewers = {"DAVIS", "MUIR"}

In [19]:
# load WordNet POS tags for lemmatization
def wordnet_pos_tags(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# preprocessing function
def txt_preprocess_pipeline(text):
    # read file contents as text string
    working_txt = text.read()

    # remove moderators' sections
    for interviewer in interviewers: 
        main_txt = re.sub(rf'^{interviewer}.*\n?', '', working_txt, flags=re.MULTILINE)

    # standardize text to lowercase
    standard_txt = main_txt.lower()

    # remove multiple white spaces and line breaks
    clean_txt = re.sub(r'\n', ' ', standard_txt)
    clean_txt = re.sub(r'\s+', ' ', clean_txt)
    clean_txt = clean_txt.strip()

    # tokenize text
    tokens = word_tokenize(clean_txt)

    # remove non-alphabetic tokens
    filtered_tokens_alpha = [word for word in tokens if word.isalpha() and not re.match(r'^[ivxlcdm]+$', word)]

    # load NLTK stopword list and add original stopwords
    stop_words = stopwords.words('english')
    # add any additional stopwords here
    stop_words.extend(['trump', 'davis', 'harris', 'president', 'vice', 'mr', 'mrs', 'ms', 'dr'])
    # remove stopwords
    filtered_tokens_final = [w for w in filtered_tokens_alpha if not w in stop_words]

    # define lemmatizer
    lemmatizer = WordNetLemmatizer()

    # conduct POS tagging
    pos_tags = nltk.pos_tag(filtered_tokens_final)

    # lemmatize word-tokens via assigned POS tags
    lemma_tokens = [lemmatizer.lemmatize(token, wordnet_pos_tags(pos_tag)) for token, pos_tag in pos_tags]
    return lemma_tokens

# file iteration function
def iterate_txt_files(txt_dir):
    texts = []
    for filename in os.listdir(txt_dir):
        if filename.endswith('.txt'):
            with open(os.path.join(txt_dir, filename), 'r', encoding='utf-8') as file:
                txt_tokens = txt_preprocess_pipeline(file)
                texts.append(txt_tokens)
    return texts

In [35]:
# define working directory
work_dir = os.getcwd()
# specify path to text corpus inside working directory
file_dir = f'{work_dir}/data/debate'

# iterate through each text
texts = iterate_txt_files(file_dir)
# print first processed text
for text in texts: 
    print(text)

['last', 'run', 'say', 'wanted', 'ban', 'fracking', 'want', 'mandatory', 'government', 'buyback', 'program', 'assault', 'weapon', 'campaign', 'say', 'support', 'decriminalize', 'border', 'crossing', 'take', 'hard', 'line', 'know', 'say', 'value', 'change', 'many', 'policy', 'position', 'change', 'value', 'change', 'go', 'discus', 'every', 'one', 'least', 'every', 'point', 'make', 'particular', 'let', 'talk', 'fracking', 'pennsylvania', 'make', 'clear', 'ban', 'fracking', 'ban', 'fracking', 'united', 'state', 'fact', 'vote', 'inflation', 'reduction', 'act', 'open', 'new', 'lease', 'fracking', 'position', 'get', 'invest', 'diverse', 'source', 'energy', 'reduce', 'reliance', 'foreign', 'oil', 'large', 'increase', 'domestic', 'oil', 'production', 'history', 'approach', 'recognize', 'rely', 'foreign', 'oil', 'relate', 'value', 'let', 'tell', 'grow', 'kid', 'raise', 'mother', 'work', 'save', 'able', 'buy', 'first', 'home', 'teenager', 'value', 'bring', 'importance', 'home', 'ownership', 'kno

In [68]:
# load dictionary
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below = 5, no_above= .9)
corpuses = []
models = []
# generate corpus as BoW
for text in texts:
    corpus = [dictionary.doc2bow(text)]
    corpuses.append(corpus)

    # train LDA model
    # TODO: tweak parameters for better results
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, chunksize=20, num_topics=3, passes=200, iterations=400)
    models.append(lda_model)
    
    # print LDA topics
    for topic in lda_model.print_topics(num_topics=7, num_words=10):
        print(topic)

    print ("\n")

(0, '0.007*"kill" + 0.007*"invest" + 0.007*"need" + 0.007*"money" + 0.007*"leave" + 0.007*"last" + 0.007*"new" + 0.007*"issue" + 0.007*"happen" + 0.007*"history"')
(1, '0.007*"kill" + 0.007*"invest" + 0.007*"need" + 0.007*"money" + 0.007*"leave" + 0.007*"last" + 0.007*"new" + 0.007*"issue" + 0.007*"happen" + 0.007*"history"')
(2, '0.034*"work" + 0.028*"many" + 0.028*"home" + 0.023*"change" + 0.023*"give" + 0.023*"talk" + 0.023*"want" + 0.023*"million" + 0.023*"back" + 0.018*"first"')


(0, '0.033*"talk" + 0.028*"would" + 0.028*"crime" + 0.028*"come" + 0.023*"former" + 0.023*"criminal" + 0.023*"allow" + 0.020*"happen" + 0.020*"like" + 0.017*"think"')
(1, '0.007*"also" + 0.007*"high" + 0.007*"course" + 0.007*"fact" + 0.007*"finish" + 0.007*"ca" + 0.007*"biden" + 0.007*"give" + 0.007*"believe" + 0.007*"ask"')
(2, '0.007*"also" + 0.007*"high" + 0.007*"course" + 0.007*"fact" + 0.007*"finish" + 0.007*"ca" + 0.007*"biden" + 0.007*"give" + 0.007*"believe" + 0.007*"ask"')


(0, '0.007*"security

In [67]:
dickens_visual = gensimvisualize.prepare(models[4], corpuses[4], dictionary, mds='mmds')
pyLDAvis.display(dickens_visual)

In [64]:
coherence_model = CoherenceModel(model=models[2], texts=text[2], dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(coherence_score)



nan
