# POLI 179 Final Project
## LDA of the entire corpus 
### By: Alyson Otañez 

## Linear Discriminant Analysis (LDA) 

### 1. Setup

In [1]:
# Install packages if necessary
# ! pip install nltk
# ! pip install spacy 
# ! pip install --user gensim
# ! pip install --user pyLDAvis
# ! pip install --user gutenbergpy

In [2]:
# Import necessary packages
import pandas as pd
import os
import nltk
import re
import string
import sys
sys.path.append('/home/aotanez/.local/lib/python3.9/site-packages')
import gensim
import numpy as np
from gutenbergpy import textget
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvisualize

[nltk_data] Downloading package punkt to /home/aotanez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aotanez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aotanez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/aotanez/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### 2. Import data

In [3]:
# Load data 
ie_cities = pd.read_csv('../Data/ie_cities.csv')

# Drop NA values (only 1)
ie_cities = ie_cities[ie_cities['Text'].notna()]

ie_cities

Unnamed: 0,Text,Date,Year,City
0,«Back to Main Site\r\n Welc...,"December 20, 2005",2005,Chino
1,«Back to Main Site\r\n Welc...,"January 17, 2006",2006,Chino
2,«Back to Main Site\r\n Welc...,"March 9, 2006",2006,Chino
3,«Back to Main Site\r\n Welc...,"February 21, 2006",2006,Chino
4,«Back to Main Site\r\n Welc...,"March 7, 2006",2006,Chino
...,...,...,...,...
5521,"Tuesday, September 11, 2012\n5:00 PMCity of Ri...","Tuesday, September 11, 2012",2012,Rialto
5522,"Tuesday, August 28, 2012\n5:00 PMCity of Rialt...","Tuesday, August 28, 2012",2012,Rialto
5523,"Tuesday, August 14, 2012\n5:00 PMCity of Rialt...","Tuesday, August 14, 2012",2012,Rialto
5524,"Tuesday, July 24, 2012\n5:00 PMCity of Rialto\...","Tuesday, July 24, 2012",2012,Rialto


### 3. Preprocess data

In [4]:
# WordNet for lemmatization 
def wordnet_pos_tags(x):
    if x.startswith('J'):
        return wordnet.ADJ
    elif x.startswith('V'):
        return wordnet.VERB
    elif x.startswith('N'):
        return wordnet.NOUN
    elif x.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
# Function for preprocessing 
def txt_preprocess_pipeline(text):
    standard_txt = text.lower()
    
    clean_txt = re.sub(r'http\S+|www\S+|https\S+', '', standard_txt, flags = re.MULTILINE)
    clean_txt = re.sub(r'\n', ' ', clean_txt)
    clean_txt = re.sub(r'\s+', ' ', clean_txt)
    clean_txt = re.sub(r'\S+@\S+', '', clean_txt)
    clean_txt = re.sub(r'\\r\\n', ' ', clean_txt)
    clean_txt = re.sub(r'\s+', ' ', clean_txt)
    clean_txt = re.sub(r'<.*?>', '', clean_txt)
    clean_txt = re.sub(r'[^\w\s]', '', clean_txt)    
    clean_txt = re.sub(r'\b\w{1,2}\b', '', clean_txt)
    
    tokens = word_tokenize(clean_txt)
    filtered_tokens_alpha = [word for word in tokens if word.isalpha() and not re.match(r'^[ivxlcdm]+$', word)]
    
    stop_words = set(stopwords.words('english'))
    stop_words.update(['chino', 'fontana', 'march', 'joint', 'powers', 'authority', 
                       'http', 'rialto', 'ontario', 'city', 'council', 'agenda',
                      'meeting', 'minutes', 'back', 'site', 'main', 'welcome', 'browse', 'video',
                      'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 
                      'saturday', 'sunday', 'notice', 'commission', 'archive', 'pmcity',
                      'chamber', 'palm', 'ave', 'january', 'february', 'march', 'april', 'may',
                      'june', 'july', 'august', 'september', 'october', 'november', 'december',
                      'closed', 'session'])
    filtered_tokens_final = [w for w in filtered_tokens_alpha if not w in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    pos_tags = nltk.pos_tag(filtered_tokens_final)
    lemma_tokens = [lemmatizer.lemmatize(token, wordnet_pos_tags(pos_tag)) for token, pos_tag in pos_tags]
    
    return lemma_tokens

In [10]:
# Apply functions to data
ie_cities['Processed_Text'] = ie_cities['Text'].apply(txt_preprocess_pipeline)
ie_cities

Unnamed: 0,Text,Date,Year,City,Processed_Text
0,«Back to Main Site\r\n Welc...,"December 20, 2005",2005,Chino,"[meeting, notice, commission, log, help, regis..."
1,«Back to Main Site\r\n Welc...,"January 17, 2006",2006,Chino,"[meeting, notice, commission, log, help, regis..."
2,«Back to Main Site\r\n Welc...,"March 9, 2006",2006,Chino,"[meeting, notice, commission, log, help, regis..."
3,«Back to Main Site\r\n Welc...,"February 21, 2006",2006,Chino,"[meeting, notice, commission, log, help, regis..."
4,«Back to Main Site\r\n Welc...,"March 7, 2006",2006,Chino,"[meeting, notice, commission, log, help, regis..."
...,...,...,...,...,...
5521,"Tuesday, September 11, 2012\n5:00 PMCity of Ri...","Tuesday, September 11, 2012",2012,Rialto,"[chamber, chamber, mayor, grace, vargas, mayor..."
5522,"Tuesday, August 28, 2012\n5:00 PMCity of Rialt...","Tuesday, August 28, 2012",2012,Rialto,"[chamber, mayor, grace, vargas, mayor, pro, te..."
5523,"Tuesday, August 14, 2012\n5:00 PMCity of Rialt...","Tuesday, August 14, 2012",2012,Rialto,"[chamber, mayor, grace, vargas, mayor, pro, te..."
5524,"Tuesday, July 24, 2012\n5:00 PMCity of Rialto\...","Tuesday, July 24, 2012",2012,Rialto,"[chamber, mayor, grace, vargas, mayor, pro, te..."


### 4. LDA Topics

In [11]:
# Load dictionary
dictionary = corpora.Dictionary(ie_cities['Processed_Text'])
dictionary.filter_extremes(no_below = 2)

# Generate corpus as BoW
corpus = [dictionary.doc2bow(i) for i in  ie_cities['Processed_Text']]

In [13]:
# Train LDA model
lda_model = LdaModel(corpus = corpus, id2word = dictionary, random_state = 4583, 
                     chunksize = 20, num_topics = 7, passes = 200, iterations= 400)

# Print LDA topics
for idx, topic in lda_model.print_topics(num_topics = 7, num_words =10):
    print(f"Topic {idx+1}: {topic}")

Topic 1: 0.012*"zone" + 0.011*"environmental" + 0.011*"condition" + 0.011*"building" + 0.010*"airport" + 0.009*"shall" + 0.009*"foot" + 0.008*"standard" + 0.007*"permit" + 0.007*"fire"
Topic 2: 0.086*"shall" + 0.019*"owner" + 0.012*"consistent" + 0.012*"set" + 0.011*"provision" + 0.011*"forth" + 0.009*"party" + 0.008*"applicable" + 0.008*"herein" + 0.007*"parcel"
Topic 3: 0.015*"lane" + 0.012*"peak" + 0.011*"yes" + 0.010*"group" + 0.009*"vision" + 0.008*"ghg" + 0.007*"length" + 0.007*"los" + 0.007*"vehh" + 0.007*"flow"
Topic 4: 0.021*"print" + 0.016*"consider" + 0.016*"ordinance" + 0.012*"contract" + 0.011*"authorize" + 0.011*"budget" + 0.009*"tab" + 0.009*"purchase" + 0.009*"add" + 0.008*"fiscal"
Topic 5: 0.012*"water" + 0.011*"exhibit" + 0.008*"would" + 0.007*"resource" + 0.007*"result" + 0.007*"whereas" + 0.007*"hour" + 0.007*"increase" + 0.006*"emission" + 0.005*"unit"
Topic 6: 0.000*"hirtz" + 0.000*"jrregular" + 0.000*"pharris" + 0.000*"cbdg" + 0.000*"hirtzspecial" + 0.000*"neufel

### 5. Evaluate Model

In [None]:
# Evaluate Model
coherence_model = CoherenceModel(model = lda_model, texts = ie_cities['Processed_Text'], 
                                 dictionary = dictionary, coherence = 'c_v')
coherence_score = coherence_model.get_coherence()
print(coherence_score)

### 6. Plot Topics

In [14]:
# Visualization
dickens_visual = gensimvisualize.prepare(lda_model, corpus, dictionary, mds='mmds')
pyLDAvis.save_html(dickens_visual, 'lda_corpus_visualization.html')

In [17]:
# Plot
pyLDAvis.display(dickens_visual)