# POLI 179 Final Project
## 2 Decades
### By: Alyson Otañez 

The following code applies an LDA model to subsets of the `ie_cities.csv` file found in the `Data` folder to determine how the topics differ between 2 decades of data.

Given the lack of older data, I analyze 2 decades: 2003 - 2013 & 2014 - 2024 (both ranges inclusive).

The topic plots can be found in the folders `Plots` -> `LDA_Topic_Visual` -> `Decades`.

The matrix of topic differences can be found in the folders `Plots` -> `LDA_Topic_Difference` -> `Decades`.

## Latent Dirichlet Allocation (LDA) 

### 1. Setup

In [None]:
# Install packages if necessary
# ! pip install nltk
# ! pip install spacy 
# ! pip install --user gensim
# ! pip install --user pyLDAvis
# ! pip install --user gutenbergpy

In [None]:
# Import necessary packages
import pandas as pd
import os
import nltk
import re
import string
import sys
sys.path.append('/home/aotanez/.local/lib/python3.9/site-packages')
import gensim
import numpy as np
from gutenbergpy import textget
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvisualize
import plotly.graph_objs as go
import plotly.offline as py
import matplotlib.pyplot as plt

In [None]:
# Load data (path is relative to the notebook's working directory)
ie_cities = pd.read_csv('../Data/ie_cities.csv')

# Drop NA values (only 1).
# Take an explicit copy: a new column is assigned to this frame further down,
# and writing into the result of a boolean-mask selection can trigger pandas'
# SettingWithCopyWarning.
ie_cities = ie_cities[ie_cities['Text'].notna()].copy()

ie_cities

### 2. Preprocess Data

In [None]:
# WordNet for lemmatization 
def wordnet_pos_tags(x):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    Only the first character of the tag is inspected: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag (including an
    empty string) falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # x[:1] is '' for an empty tag, which simply misses the table.
    return first_letter_to_pos.get(x[:1], wordnet.NOUN)

In [None]:
# Function for preprocessing 
# Matches only well-formed lowercase Roman numerals (1-3999), e.g. "iii", "xiv".
# A plain character class such as [ivxlcdm]+ would also discard ordinary words
# built from those letters ("civil", "civic", "mild", "vivid").
# The look-ahead rejects the empty string, which every group could match.
_ROMAN_NUMERAL_RE = re.compile(
    r'^(?=[ivxlcdm])m{0,3}(?:cm|cd|d?c{0,3})(?:xc|xl|l?x{0,3})(?:ix|iv|v?i{0,3})$'
)

# Corpus-specific terms removed in addition to NLTK's English stop words.
_EXTRA_STOP_WORDS = (
    'chino', 'fontana', 'march', 'joint', 'powers', 'authority',
    'http', 'rialto', 'ontario', 'city', 'council', 'agenda',
    'meeting', 'minutes',
)


def txt_preprocess_pipeline(text):
    """Clean, tokenize and lemmatize one document.

    Steps, in order:
      1. lowercase;
      2. turn literal ``\\r\\n`` escape sequences into spaces;
      3. strip URLs (anything starting with ``http`` or ``www``) and e-mails;
      4. collapse whitespace, replace HTML-like tags with a space,
         delete punctuation, delete 1-2 character words;
      5. tokenize, keep alphabetic tokens that are not Roman numerals;
      6. remove English and corpus-specific stop words;
      7. POS-tag the remaining tokens and lemmatize each with its tag.

    Args:
        text: The raw document as a string.

    Returns:
        A list of lemmatized token strings.
    """
    standard_txt = text.lower()

    # Literal "\r\n" escapes are handled first: if they are still present when
    # URLs / e-mails are removed, the \S+ in those patterns also swallows the
    # word that follows the escape.
    clean_txt = re.sub(r'\\r\\n', ' ', standard_txt)
    clean_txt = re.sub(r'http\S+|www\S+', '', clean_txt)
    clean_txt = re.sub(r'\S+@\S+', '', clean_txt)
    # \s+ also covers real newlines, so no separate '\n' pass is needed.
    clean_txt = re.sub(r'\s+', ' ', clean_txt)
    # Replace tags with a space so "word<br>word" does not fuse into one token.
    clean_txt = re.sub(r'<.*?>', ' ', clean_txt)
    clean_txt = re.sub(r'[^\w\s]', '', clean_txt)
    clean_txt = re.sub(r'\b\w{1,2}\b', '', clean_txt)

    tokens = word_tokenize(clean_txt)
    filtered_tokens_alpha = [
        word for word in tokens
        if word.isalpha() and not _ROMAN_NUMERAL_RE.match(word)
    ]

    stop_words = set(stopwords.words('english'))
    stop_words.update(_EXTRA_STOP_WORDS)
    filtered_tokens_final = [w for w in filtered_tokens_alpha if w not in stop_words]

    lemmatizer = WordNetLemmatizer()
    pos_tags = nltk.pos_tag(filtered_tokens_final)
    lemma_tokens = [
        lemmatizer.lemmatize(token, wordnet_pos_tags(pos_tag))
        for token, pos_tag in pos_tags
    ]

    return lemma_tokens

In [None]:
# Apply functions to data
# Run every document's 'Text' through the preprocessing pipeline; each cell of
# the new 'Processed_Text' column holds that document's list of tokens.
ie_cities['Processed_Text'] = ie_cities['Text'].map(txt_preprocess_pipeline)
ie_cities

### 3. Filter Data

In [None]:
# Decade 1: rows whose 'Year' is 2003 through 2013, both endpoints included
in_decade_1 = ie_cities['Year'].between(2003, 2013)
dec_1 = ie_cities[in_decade_1]
dec_1

# N = 1,122

In [None]:
# Decade 2: rows whose 'Year' is 2014 through 2024, both endpoints included
in_decade_2 = ie_cities['Year'].between(2014, 2024)
dec_2 = ie_cities[in_decade_2]
dec_2

# N = 4,150

### 4. Train LDA Models

### 4.1 2003 - 2013

In [None]:
# Build the token dictionary from the 2003-2013 token lists
dictionary_dec1 = corpora.Dictionary(dec_1['Processed_Text'])
# Prune with no_below=2; the other filter_extremes thresholds are left at
# their library defaults.
dictionary_dec1.filter_extremes(no_below=2)

# Convert every document's token list to its bag-of-words representation
corpus_dec1 = list(map(dictionary_dec1.doc2bow, dec_1['Processed_Text']))

In [None]:
# Settings for the 2003-2013 LDA model (fixed seed so reruns are comparable)
lda_settings_dec1 = dict(
    corpus=corpus_dec1,
    id2word=dictionary_dec1,
    random_state=4583,
    chunksize=20,
    num_topics=5,
    passes=200,
    iterations=400,
)
lda_model_dec1 = LdaModel(**lda_settings_dec1)

# List the 10 top words of each of the 5 topics, numbering topics from 1
topics_dec1 = lda_model_dec1.print_topics(num_topics=5, num_words=10)
for idx, topic in topics_dec1:
    print(f"Topic {idx+1}: {topic}")

In [None]:
# Build the pyLDAvis data for the 2003-2013 model and write it to a standalone
# HTML file in the current working directory.
# NOTE(review): "dickens" in the variable name looks like a leftover from
# another project; it is kept because a later cell reads this variable.
decade1_html = 'lda_decade1_visualization.html'
dickens_visual_dec1 = gensimvisualize.prepare(
    lda_model_dec1,
    corpus_dec1,
    dictionary_dec1,
    mds='mmds',
)
pyLDAvis.save_html(dickens_visual_dec1, decade1_html)

In [None]:
# Plot
# Render the prepared 2003-2013 visualization inline. This call is kept as the
# cell's last expression so the notebook shows the object it returns.
pyLDAvis.display(dickens_visual_dec1)

### 4.2 2014 - 2024

In [None]:
# Build the token dictionary from the 2014-2024 token lists
dictionary_dec2 = corpora.Dictionary(dec_2['Processed_Text'])
# Prune with no_below=2; the other filter_extremes thresholds are left at
# their library defaults.
dictionary_dec2.filter_extremes(no_below=2)

# Convert every document's token list to its bag-of-words representation
corpus_dec2 = list(map(dictionary_dec2.doc2bow, dec_2['Processed_Text']))

In [None]:
# Settings for the 2014-2024 LDA model (fixed seed so reruns are comparable)
lda_settings_dec2 = dict(
    corpus=corpus_dec2,
    id2word=dictionary_dec2,
    random_state=4583,
    chunksize=20,
    num_topics=5,
    passes=200,
    iterations=400,
)
lda_model_dec2 = LdaModel(**lda_settings_dec2)

# List the 10 top words of each of the 5 topics, numbering topics from 1
topics_dec2 = lda_model_dec2.print_topics(num_topics=5, num_words=10)
for idx, topic in topics_dec2:
    print(f"Topic {idx+1}: {topic}")

In [None]:
# Build the pyLDAvis data for the 2014-2024 model and write it to a standalone
# HTML file in the current working directory.
# NOTE(review): "dickens" in the variable name looks like a leftover from
# another project; it is kept because a later cell reads this variable.
decade2_html = 'lda_decade2_visualization.html'
dickens_visual_dec2 = gensimvisualize.prepare(
    lda_model_dec2,
    corpus_dec2,
    dictionary_dec2,
    mds='mmds',
)
pyLDAvis.save_html(dickens_visual_dec2, decade2_html)

In [None]:
# Plot
# Render the prepared 2014-2024 visualization inline. This call is kept as the
# cell's last expression so the notebook shows the object it returns.
pyLDAvis.display(dickens_visual_dec2)

### 5. Topic Comparison 

In [None]:
# Plot difference function
## Source: https://radimrehurek.com/gensim/auto_examples/howtos/run_compare_lda.html#:~:text=You%20can%20do%20this%20by%20constructing%20a%20matrix%20with%20the%20difference.&text=Looking%20at%20this%20matrix%2C%20you,the%20topics'%20intersection%20and%20difference.

def plot_difference_plotly(mdiff, title="", annotation=None):
    """Build an interactive plotly heatmap of a topic-difference matrix.

    Args:
        mdiff: Matrix of difference values, passed to the heatmap as ``z``.
        title: Figure title.
        annotation: Optional nested sequence (rows of cells) where each cell
            is a pair ``(int_tokens, diff_tokens)`` of token sequences. Each
            pair becomes the cell's text: the first list after "+++", the
            second after "---".

    Returns:
        The plotly ``Figure``; nothing is displayed or written here.
    """
    cell_text = None
    if annotation is not None:
        cell_text = []
        for row in annotation:
            row_text = []
            for int_tokens, diff_tokens in row:
                joined_int = ", ".join(int_tokens)
                joined_diff = ", ".join(diff_tokens)
                row_text.append("+++ {}<br>--- {}".format(joined_int, joined_diff))
            cell_text.append(row_text)

    heatmap = go.Heatmap(z=mdiff, colorscale='RdBu', text=cell_text)
    figure_layout = go.Layout(
        width=950,
        height=950,
        title=title,
        xaxis=dict(title="topic"),
        yaxis=dict(title="topic"),
    )
    return go.Figure(data=[heatmap], layout=figure_layout)

def plot_difference_matplotlib(mdiff, title="", annotation=None):
    """Draw a topic-difference matrix as a static matplotlib heatmap.

    Args:
        mdiff: 2-D matrix of difference values to render with ``imshow``.
        title: Title placed above the heatmap.
        annotation: Accepted so the signature matches
            ``plot_difference_plotly``; it is not drawn.

    Returns:
        The matplotlib ``Figure``, so callers receive a figure object just as
        they do from ``plot_difference_plotly``. The figure is also shown via
        ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(18, 14))
    image = ax.imshow(mdiff, cmap='RdBu_r', origin='lower')
    # Use the explicit axes / figure rather than pyplot's implicit "current"
    # state, so the title and colorbar always attach to this figure.
    ax.set_title(title)
    fig.colorbar(image, ax=ax)
    plt.show()
    return fig

# Choose the plotting backend: interactive plotly inside IPython/Jupyter,
# static matplotlib otherwise.
try:
    # get_ipython is only defined inside an IPython session; elsewhere the
    # bare name raises NameError.
    get_ipython()
    import plotly.offline as py
except (NameError, ImportError):
    # Not running under IPython, or plotly cannot be imported. Only these two
    # expected failures select the fallback; anything else is a real error and
    # is allowed to surface.
    plot_difference = plot_difference_matplotlib
else:
    py.init_notebook_mode()
    plot_difference = plot_difference_plotly

In [None]:
# Difference matrix
# Jaccard distance between the topics of the two models, using num_words=50.
# NOTE(review): rows presumably index the 2003-2013 topics and columns the
# 2014-2024 topics -- confirm against the gensim LdaModel.diff documentation.
mdiff, annotation = lda_model_dec1.diff(lda_model_dec2, distance='jaccard', num_words=50)
diff = plot_difference(mdiff, title="Topic difference 2003-2013 vs. 2014-2024", annotation=annotation)

# Only a plotly figure can be written with plotly's offline exporter. When the
# matplotlib fallback was selected, `diff` is not a plotly figure and passing
# it to py.plot would raise, so the export is skipped in that case.
if isinstance(diff, go.Figure):
    py.plot(diff, filename='decade_topic_diff.html')
diff