# POLI 179 Final Project
## LDA of the entire corpus 
### By: Alyson Otañez 

## Latent Dirichlet Allocation (LDA) 

The following code applies an LDA model to the entire `ie_cities.csv` file found in the `Data` folder.

The resulting topic plot is saved in the repository under `Plots` -> `LDA_Topic_Visual` -> `Corpus`.

### 1. Setup

In [None]:
# Install packages if necessary
# ! pip install nltk
# ! pip install spacy 
# ! pip install --user gensim
# ! pip install --user pyLDAvis
# ! pip install --user gutenbergpy

In [None]:
# Import necessary packages
import pandas as pd
import os
import nltk
import re
import string
import sys
sys.path.append('/home/aotanez/.local/lib/python3.9/site-packages') # Comment out
import gensim
import numpy as np
from gutenbergpy import textget
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvisualize

In [None]:
# Load the scraped city documents; the 'Text' column holds the raw document text
ie_cities = pd.read_csv('../Data/ie_cities.csv')

# Drop rows whose 'Text' is missing (only 1). The explicit .copy() makes the
# filtered frame an independent object, so adding a column to it later does not
# trigger pandas' SettingWithCopyWarning.
ie_cities = ie_cities[ie_cities['Text'].notna()].copy()

# Last expression of the cell: renders the frame in the notebook
ie_cities

### 2. Preprocess Data

In [None]:
# Translate a POS tag into the WordNet category the lemmatizer expects
def wordnet_pos_tags(x):
    """Return the WordNet POS constant that corresponds to the tag ``x``.

    ``x`` is a POS tag string (the pipeline passes the tags produced by
    ``nltk.pos_tag``). Tags starting with ``J``, ``V`` or ``R`` map to
    adjective, verb and adverb respectively; every other tag, including
    those starting with ``N``, is treated as a noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for tag_prefix, wordnet_pos in prefix_table:
        if x.startswith(tag_prefix):
            return wordnet_pos
    # Nouns and all unrecognised tags share the same fallback
    return wordnet.NOUN

In [None]:
# Function for preprocessing

# Strict grammar for lower-case Roman numerals, applied with fullmatch so that
# only well-formed numerals ("iii", "xiv", "xxvii", ...) are discarded.
# A bare character class such as [ivxlcdm]+ also matches ordinary words spelled
# with those letters ("civic", "civil", "mild", "vivid") and would drop them.
# A few real words that happen to be valid numerals (e.g. "mix") still match.
_ROMAN_NUMERAL_RE = re.compile(
    r'm{0,3}(?:cm|cd|d?c{0,3})(?:xc|xl|l?x{0,3})(?:ix|iv|v?i{0,3})'
)

# Corpus-specific stop words added on top of NLTK's English list
_CUSTOM_STOP_WORDS = frozenset([
    'chino', 'fontana', 'march', 'joint', 'powers', 'authority',
    'http', 'rialto', 'ontario', 'city', 'council', 'agenda',
    'meeting', 'minutes', 'back', 'site', 'main', 'welcome', 'browse', 'video',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
    'saturday', 'sunday', 'notice', 'commission', 'archive', 'pmcity',
    'chamber', 'palm', 'ave', 'january', 'february', 'april', 'may',
    'june', 'july', 'august', 'september', 'october', 'november', 'december',
    'closed', 'session',
])


def txt_preprocess_pipeline(text):
    """Clean one raw document and return its lemmatized tokens.

    Steps, in order: lower-case the text; strip URLs, e-mail addresses,
    literal ``\\r\\n`` escape sequences, HTML-like tags, punctuation and
    words of one or two characters; tokenize; keep alphabetic tokens that
    are not Roman numerals; remove English and corpus-specific stop words;
    POS-tag the remaining tokens and lemmatize each with its WordNet POS.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmatized tokens in document order.
    """
    standard_txt = text.lower()

    # URLs first, while the punctuation that delimits them is still present
    clean_txt = re.sub(r'http\S+|www\S+|https\S+', '', standard_txt, flags = re.MULTILINE)
    clean_txt = re.sub(r'\n', ' ', clean_txt)
    clean_txt = re.sub(r'\s+', ' ', clean_txt)
    # E-mail addresses
    clean_txt = re.sub(r'\S+@\S+', '', clean_txt)
    # Literal backslash-r backslash-n sequences left in the text
    clean_txt = re.sub(r'\\r\\n', ' ', clean_txt)
    clean_txt = re.sub(r'\s+', ' ', clean_txt)
    # HTML-like tags, then any remaining punctuation
    clean_txt = re.sub(r'<.*?>', '', clean_txt)
    clean_txt = re.sub(r'[^\w\s]', '', clean_txt)
    # Words of one or two characters
    clean_txt = re.sub(r'\b\w{1,2}\b', '', clean_txt)

    tokens = word_tokenize(clean_txt)
    # Keep purely alphabetic tokens, dropping genuine Roman numerals only
    filtered_tokens_alpha = [
        word for word in tokens
        if word.isalpha() and not _ROMAN_NUMERAL_RE.fullmatch(word)
    ]

    stop_words = set(stopwords.words('english')) | _CUSTOM_STOP_WORDS
    filtered_tokens_final = [w for w in filtered_tokens_alpha if w not in stop_words]

    # Lemmatize each token using the WordNet POS derived from its tag
    lemmatizer = WordNetLemmatizer()
    pos_tags = nltk.pos_tag(filtered_tokens_final)
    lemma_tokens = [lemmatizer.lemmatize(token, wordnet_pos_tags(pos_tag)) for token, pos_tag in pos_tags]

    return lemma_tokens

In [None]:
# Run the preprocessing pipeline on every document and keep the resulting
# token lists in a new column
ie_cities['Processed_Text'] = ie_cities['Text'].map(txt_preprocess_pipeline)

### 3. Train LDA Model

In [None]:
# Build the token <-> id dictionary from the processed documents, then prune it:
# tokens must occur in at least 2 documents (the other filter_extremes
# thresholds keep their gensim defaults)
dictionary = corpora.Dictionary(documents=ie_cities['Processed_Text'])
dictionary.filter_extremes(no_below=2)

# Convert each document's token list to its bag-of-words representation
corpus = list(map(dictionary.doc2bow, ie_cities['Processed_Text']))

In [None]:
# Fit a 7-topic LDA model on the bag-of-words corpus; random_state is fixed
# so repeated runs use the same seed
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=7,
    chunksize=20,
    passes=200,
    iterations=400,
    random_state=4583,
)

# Print the 10 top words of every topic, numbering topics from 1
for topic_id, topic_terms in lda_model.print_topics(num_topics=7, num_words=10):
    print(f"Topic {topic_id + 1}: {topic_terms}")

### 4. Plot Topics

In [None]:
# Prepare the interactive pyLDAvis view of the fitted model ('mmds' selects the
# scaling method used for the topic map) and write it to a standalone HTML file
# in the working directory
dickens_visual = gensimvisualize.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
    mds='mmds',
)
pyLDAvis.save_html(data=dickens_visual, fileobj='lda_corpus_visualization.html')

In [None]:
# Render the prepared visualization inline in the notebook
pyLDAvis.display(data=dickens_visual)