# POLI 179 Final Project
## Comparing across keywords
### By: Alyson Otañez 

The following code applies an LDA model to subsets of the `ie_cities.csv` file (found in the `Data` folder) to determine how topics differ based on keyword classification.


Keywords were determined by skimming through the agenda/minutes, and noting the words commonly used to reference industrial, recreation, and transportation issues.

Plots of keywords over time can be found in the folders `Plots` -> `Keywords_Over_Time`.

Topic plots can be found in the folders `Plots` -> `LDA_Topic_Visual` -> `Keywords`.

Matrices of topic differences can be found in the folders `Plots` -> `LDA_Topic_Difference` -> `Keywords`.

## Exploratory Analysis 

### 1. Setup

In [None]:
# Install packages if necessary
# (`re` and `warnings` ship with Python's standard library and need no pip install)
# ! pip install pandas
# ! pip install matplotlib
# ! pip install seaborn

In [None]:
# Import necessary packages
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the city-council corpus and discard rows whose 'Text' is missing
# (only one such row, per the original note)
ie_cities = (
    pd.read_csv('../Data/ie_cities.csv')
    .dropna(subset=['Text'])
)

### 2. Keywords Over Time

In [None]:
# Keyword lists used to count topic mentions in each document.
# NOTE: the names `industrial`, `recreation` and `transportation` are reused
# for DataFrame subsets in the "Filter data" section further down, so the
# keyword-counting cells must be run before that section.
industrial = [
    'warehouse',
    'logistics',
    'distribution',
    'industrial',
    'warehousing',
]
recreation = [
    'recreation',
    'park',
    'green space',
    'pool',
    'outdoor',
]
transportation = [
    'transportation',
    'bus',
    'public transport',
    'transit',
    'train',
]

In [None]:
# Plot of industrial terms over time
def count_industrial(text, keywords=None):
    """Count industrial keyword mentions in one document.

    The text is lower-cased and split on whitespace. A keyword matches when
    its own whitespace-separated words equal a run of consecutive tokens, so
    single-word keywords are plain token matches and multi-word keywords are
    matched as phrases. Tokens keep any attached punctuation (e.g.
    "warehouse," does not match "warehouse").

    Parameters
    ----------
    text : str
        Document text.
    keywords : iterable of str, optional
        Keywords to count. Defaults to the module-level ``industrial`` list.

    Returns
    -------
    int
        Total number of keyword occurrences in ``text``.
    """
    if keywords is None:
        keywords = industrial

    tokens = text.lower().split()
    phrases = {tuple(keyword.lower().split()) for keyword in keywords}
    phrases.discard(())  # ignore empty / whitespace-only keywords

    # Single-word keywords: one set lookup per token
    single_words = {phrase[0] for phrase in phrases if len(phrase) == 1}
    total = sum(token in single_words for token in tokens)

    # Multi-word keywords: compare against every window of the same width
    for phrase in phrases:
        width = len(phrase)
        if width > 1:
            total += sum(
                tuple(tokens[start:start + width]) == phrase
                for start in range(len(tokens) - width + 1)
            )
    return total

# Per-document count of industrial keywords, stored as a new column
ie_cities['Sum_Industrial'] = ie_cities['Text'].apply(count_industrial)

# Yearly totals of those counts
yearly_counts_in = ie_cities.groupby('Year', as_index=False)['Sum_Industrial'].sum()

# Bar chart of the yearly totals; saved to the working directory, then shown
fig_in, ax_in = plt.subplots(figsize=(15, 6))
sns.barplot(data=yearly_counts_in, x='Year', y='Sum_Industrial', color='tomato', ax=ax_in)
ax_in.set_xlabel('Year')
ax_in.set_ylabel('Count of Industrial Terms')
ax_in.set_title('Sum of Mention of Industrial Terms in Inland Empire, CA')
fig_in.savefig('industrial_terms_plot.png')
plt.show()

In [None]:
# Plot of recreation terms over time
def count_recreation(text, keywords=None):
    """Count recreation keyword mentions in one document.

    The text is lower-cased and split on whitespace. A keyword matches when
    its own whitespace-separated words equal a run of consecutive tokens, so
    single-word keywords are plain token matches and multi-word keywords such
    as 'green space' are matched as phrases (a token-by-token membership test
    can never match a keyword that contains a space). Tokens keep any
    attached punctuation (e.g. "park," does not match "park").

    Parameters
    ----------
    text : str
        Document text.
    keywords : iterable of str, optional
        Keywords to count. Defaults to the module-level ``recreation`` list.

    Returns
    -------
    int
        Total number of keyword occurrences in ``text``.
    """
    if keywords is None:
        keywords = recreation

    tokens = text.lower().split()
    phrases = {tuple(keyword.lower().split()) for keyword in keywords}
    phrases.discard(())  # ignore empty / whitespace-only keywords

    # Single-word keywords: one set lookup per token
    single_words = {phrase[0] for phrase in phrases if len(phrase) == 1}
    total = sum(token in single_words for token in tokens)

    # Multi-word keywords: compare against every window of the same width
    for phrase in phrases:
        width = len(phrase)
        if width > 1:
            total += sum(
                tuple(tokens[start:start + width]) == phrase
                for start in range(len(tokens) - width + 1)
            )
    return total

# Per-document count of recreation keywords, stored as a new column
ie_cities['Sum_Recreation'] = ie_cities['Text'].apply(count_recreation)

# Yearly totals of those counts
yearly_counts_re = ie_cities.groupby('Year', as_index=False)['Sum_Recreation'].sum()

# Bar chart of the yearly totals; saved to the working directory, then shown
fig_re, ax_re = plt.subplots(figsize=(15, 6))
sns.barplot(data=yearly_counts_re, x='Year', y='Sum_Recreation', color='seagreen', ax=ax_re)
ax_re.set_xlabel('Year')
ax_re.set_ylabel('Count of Recreation Terms')
ax_re.set_title('Sum of Mention of Recreation Terms in Inland Empire, CA')
fig_re.savefig('recreation_terms_plot.png')
plt.show()

In [None]:
# Plot of transportation terms over time
def count_trans(text, keywords=None):
    """Count transportation keyword mentions in one document.

    The text is lower-cased and split on whitespace. A keyword matches when
    its own whitespace-separated words equal a run of consecutive tokens, so
    single-word keywords are plain token matches and multi-word keywords such
    as 'public transport' are matched as phrases (a token-by-token membership
    test can never match a keyword that contains a space). Tokens keep any
    attached punctuation (e.g. "bus," does not match "bus").

    Parameters
    ----------
    text : str
        Document text.
    keywords : iterable of str, optional
        Keywords to count. Defaults to the module-level ``transportation``
        list.

    Returns
    -------
    int
        Total number of keyword occurrences in ``text``.
    """
    if keywords is None:
        keywords = transportation

    tokens = text.lower().split()
    phrases = {tuple(keyword.lower().split()) for keyword in keywords}
    phrases.discard(())  # ignore empty / whitespace-only keywords

    # Single-word keywords: one set lookup per token
    single_words = {phrase[0] for phrase in phrases if len(phrase) == 1}
    total = sum(token in single_words for token in tokens)

    # Multi-word keywords: compare against every window of the same width
    for phrase in phrases:
        width = len(phrase)
        if width > 1:
            total += sum(
                tuple(tokens[start:start + width]) == phrase
                for start in range(len(tokens) - width + 1)
            )
    return total

# Per-document count of transportation keywords, stored as a new column
ie_cities['Sum_Transportation'] = ie_cities['Text'].apply(count_trans)

# Yearly totals of those counts
yearly_counts_tr = ie_cities.groupby('Year', as_index=False)['Sum_Transportation'].sum()

# Bar chart of the yearly totals; saved to the working directory, then shown
fig_tr, ax_tr = plt.subplots(figsize=(15, 6))
sns.barplot(data=yearly_counts_tr, x='Year', y='Sum_Transportation', color='steelblue', ax=ax_tr)
ax_tr.set_xlabel('Year')
ax_tr.set_ylabel('Count of Transportation Terms')
ax_tr.set_title('Sum of Mention of Transportation Terms in Inland Empire, CA')
fig_tr.savefig('trans_terms_plot.png')
plt.show()

## Latent Dirichlet Allocation (LDA) 

### 1. Setup

In [None]:
# Install packages if necessary
# ! pip install nltk
# ! pip install spacy
# ! pip install --user gensim
# ! pip install --user pyLDAvis
# ! pip install --user gutenbergpy
# ! pip install --user plotly  # used by the topic-difference heatmaps in section 5

In [None]:
# Import necessary packages
import pandas as pd
import os
import nltk
import re
import string
import sys
sys.path.append('/home/aotanez/.local/lib/python3.9/site-packages') # Comment out
import gensim
import numpy as np
from gutenbergpy import textget
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvisualize

### 2. Preprocess Data

In [None]:
# WordNet for lemmatization
def wordnet_pos_tags(x):
    """Map a POS tag string to the WordNet POS constant used for lemmatization.

    Only the first letter of the tag is inspected: 'J' -> adjective,
    'V' -> verb, 'R' -> adverb. Every other tag (including those starting
    with 'N', and the empty string) maps to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(x[:1], wordnet.NOUN)

In [None]:
# Function for preprocessing

# Cleaning substitutions, applied in this order to the lower-cased text
_CLEANING_STEPS = [
    (re.compile(r'http\S+|www\S+|https\S+'), ''),  # URLs
    (re.compile(r'\n'), ' '),                      # newlines
    (re.compile(r'\s+'), ' '),                     # collapse whitespace
    (re.compile(r'\S+@\S+'), ''),                  # e-mail addresses
    (re.compile(r'\\r\\n'), ' '),                  # literal "\r\n" escape text
    (re.compile(r'\s+'), ' '),                     # collapse whitespace again
    (re.compile(r'<.*?>'), ''),                    # HTML-like tags
    (re.compile(r'[^\w\s]'), ''),                  # punctuation
    (re.compile(r'\b\w{1,2}\b'), ''),              # words of 1-2 characters
]

# Well-formed lower-case roman numerals only. A plain character class such as
# [ivxlcdm]+ would also discard ordinary words like "civil", "civic" or "did".
# A few real words that are also valid numerals (e.g. "mix") are still dropped.
_ROMAN_NUMERAL = re.compile(
    r'(?=[ivxlcdm])m{0,3}(?:cm|cd|d?c{0,3})(?:xc|xl|l?x{0,3})(?:ix|iv|v?i{0,3})'
)

# Corpus-specific noise words removed in addition to NLTK's English stop words
_CUSTOM_STOP_WORDS = frozenset([
    'chino', 'fontana', 'march', 'joint', 'powers', 'authority',
    'http', 'rialto', 'ontario', 'city', 'council', 'agenda',
    'meeting', 'minutes', 'back', 'site', 'main', 'welcome', 'browse', 'video',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
    'saturday', 'sunday', 'notice', 'commission', 'archive', 'pmcity',
    'chamber', 'palm', 'ave', 'january', 'february', 'april', 'may',
    'june', 'july', 'august', 'september', 'october', 'november', 'december',
    'closed', 'session',
])

_STOP_WORDS = None  # filled lazily on first use by _get_stop_words()


def _get_stop_words():
    """Return the combined stop-word set, building it only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english')) | _CUSTOM_STOP_WORDS
    return _STOP_WORDS


def txt_preprocess_pipeline(text):
    """Clean, tokenize and lemmatize one document.

    Steps:
      1. Lower-case the text and apply the substitutions in
         ``_CLEANING_STEPS`` (URLs, e-mail addresses, tags, punctuation and
         1-2 character words are removed; whitespace is collapsed).
      2. Tokenize with ``word_tokenize`` and keep alphabetic tokens that are
         neither roman numerals nor stop words.
      3. POS-tag the remaining tokens and lemmatize each one with the WordNet
         POS returned by ``wordnet_pos_tags``.
      4. Filter stop words again, because lemmatization can turn an inflected
         form into a stop word (e.g. "meetings" -> "meeting").

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmatized tokens in document order.
    """
    clean_txt = text.lower()
    for pattern, replacement in _CLEANING_STEPS:
        clean_txt = pattern.sub(replacement, clean_txt)

    stop_words = _get_stop_words()
    filtered_tokens = [
        word for word in word_tokenize(clean_txt)
        if word.isalpha()
        and not _ROMAN_NUMERAL.fullmatch(word)
        and word not in stop_words
    ]

    lemmatizer = WordNetLemmatizer()
    lemma_tokens = [
        lemmatizer.lemmatize(token, wordnet_pos_tags(pos_tag))
        for token, pos_tag in nltk.pos_tag(filtered_tokens)
    ]

    return [lemma for lemma in lemma_tokens if lemma not in stop_words]

In [None]:
# Run the preprocessing pipeline on every document and keep the resulting
# token lists in a new 'Processed_Text' column
processed_docs = ie_cities['Text'].apply(txt_preprocess_pipeline)
ie_cities['Processed_Text'] = processed_docs
ie_cities

### 3. Filter data

In [None]:
# Column to classify text based on max keywords mentioned
count_columns = ['Sum_Industrial', 'Sum_Recreation', 'Sum_Transportation']
keyword_counts = ie_cities[count_columns]

rename = {'Sum_Industrial': 'Industrial',
          'Sum_Recreation': 'Recreation',
          'Sum_Transportation': 'Transportation'}

# idxmax returns the first column on ties, so a document with no keyword hits
# at all (every count 0) would be labelled 'Industrial'. Leave those rows
# unclassified (NaN) so they do not end up in any keyword subset.
# Ties between non-zero counts still resolve to the first column in
# `count_columns`.
has_keyword = keyword_counts.max(axis=1) > 0
ie_cities['Type'] = keyword_counts.idxmax(axis=1).map(rename).where(has_keyword)

ie_cities

In [None]:
# Industrial: documents whose 'Type' is 'Industrial'
is_industrial = ie_cities['Type'].eq('Industrial')
industrial = ie_cities.loc[is_industrial]
industrial

# Row count noted when the notebook was originally run: N = 3,157

In [None]:
# Recreation: documents whose 'Type' is 'Recreation'
is_recreation = ie_cities['Type'].eq('Recreation')
recreation = ie_cities.loc[is_recreation]
recreation

# Row count noted when the notebook was originally run: N = 1,795

In [None]:
# Transportation: documents whose 'Type' is 'Transportation'
is_transportation = ie_cities['Type'].eq('Transportation')
transportation = ie_cities.loc[is_transportation]
transportation

# Row count noted when the notebook was originally run: N = 573

### 4. Train LDA Models

### 4.1 Industrial

In [None]:
# Build the vocabulary from the industrial documents' token lists
industrial_docs = industrial['Processed_Text']
dictionary_in = corpora.Dictionary(industrial_docs)
# Prune the vocabulary with no_below=2 (all other filter_extremes settings
# stay at their defaults)
dictionary_in.filter_extremes(no_below=2)

# Bag-of-words representation of every industrial document
corpus_in = list(map(dictionary_in.doc2bow, industrial_docs))

In [None]:
# Fit a 5-topic LDA model on the industrial corpus (fixed seed)
lda_settings_in = dict(
    num_topics=5,
    chunksize=20,
    passes=200,
    iterations=400,
    random_state=4583,
)
lda_model_in = LdaModel(corpus=corpus_in, id2word=dictionary_in, **lda_settings_in)

# Show the 10 highest-weighted words of each topic, numbered from 1
for topic_no, topic_terms in lda_model_in.print_topics(num_topics=5, num_words=10):
    print(f"Topic {topic_no+1}: {topic_terms}")

In [None]:
# Visualization
# Prepare the pyLDAvis data for the industrial LDA model (mds='mmds') and
# write it to an HTML file in the current working directory.
# NOTE(review): the `dickens_` prefix looks like a leftover from another
# project; this object belongs to the industrial-keyword model.
dickens_visual_in = gensimvisualize.prepare(lda_model_in, corpus_in, dictionary_in, mds='mmds')
pyLDAvis.save_html(dickens_visual_in, 'lda_industrial_visualization.html')

In [None]:
# Plot
# Display the prepared industrial-model visualization as this cell's output
pyLDAvis.display(dickens_visual_in)

### 4.2 Recreation

In [None]:
# Build the vocabulary from the recreation documents' token lists
recreation_docs = recreation['Processed_Text']
dictionary_re = corpora.Dictionary(recreation_docs)
# Prune the vocabulary with no_below=2 (all other filter_extremes settings
# stay at their defaults)
dictionary_re.filter_extremes(no_below=2)

# Bag-of-words representation of every recreation document
corpus_re = list(map(dictionary_re.doc2bow, recreation_docs))

In [None]:
# Fit a 5-topic LDA model on the recreation corpus (fixed seed)
lda_settings_re = dict(
    num_topics=5,
    chunksize=20,
    passes=200,
    iterations=400,
    random_state=4583,
)
lda_model_re = LdaModel(corpus=corpus_re, id2word=dictionary_re, **lda_settings_re)

# Show the 10 highest-weighted words of each topic, numbered from 1
for topic_no, topic_terms in lda_model_re.print_topics(num_topics=5, num_words=10):
    print(f"Topic {topic_no+1}: {topic_terms}")

In [None]:
# Visualization
# Prepare the pyLDAvis data for the recreation LDA model (mds='mmds') and
# write it to an HTML file in the current working directory.
# NOTE(review): the `dickens_` prefix looks like a leftover from another
# project; this object belongs to the recreation-keyword model.
dickens_visual_re = gensimvisualize.prepare(lda_model_re, corpus_re, dictionary_re, mds='mmds')
pyLDAvis.save_html(dickens_visual_re, 'lda_recreation_visualization.html')

In [None]:
# Plot
# Display the prepared recreation-model visualization as this cell's output
pyLDAvis.display(dickens_visual_re)

### 4.3 Transportation

In [None]:
# Build the vocabulary from the transportation documents' token lists
transportation_docs = transportation['Processed_Text']
dictionary_tr = corpora.Dictionary(transportation_docs)
# Prune the vocabulary with no_below=2 (all other filter_extremes settings
# stay at their defaults)
dictionary_tr.filter_extremes(no_below=2)

# Bag-of-words representation of every transportation document
corpus_tr = list(map(dictionary_tr.doc2bow, transportation_docs))

In [None]:
# Fit a 5-topic LDA model on the transportation corpus (fixed seed)
lda_settings_tr = dict(
    num_topics=5,
    chunksize=20,
    passes=200,
    iterations=400,
    random_state=4583,
)
lda_model_tr = LdaModel(corpus=corpus_tr, id2word=dictionary_tr, **lda_settings_tr)

# Show the 10 highest-weighted words of each topic, numbered from 1
for topic_no, topic_terms in lda_model_tr.print_topics(num_topics=5, num_words=10):
    print(f"Topic {topic_no+1}: {topic_terms}")

In [None]:
# Visualization
# Prepare the pyLDAvis data for the transportation LDA model (mds='mmds') and
# write it to an HTML file in the current working directory.
# NOTE(review): the `dickens_` prefix looks like a leftover from another
# project; this object belongs to the transportation-keyword model.
dickens_visual_tr = gensimvisualize.prepare(lda_model_tr, corpus_tr, dictionary_tr, mds='mmds')
pyLDAvis.save_html(dickens_visual_tr, 'lda_transportation_visualization.html')

In [None]:
# Plot
# Display the prepared transportation-model visualization as this cell's output
pyLDAvis.display(dickens_visual_tr)

### 5. Topic Comparison 

In [None]:
# Plot difference function
## Source: https://radimrehurek.com/gensim/auto_examples/howtos/run_compare_lda.html#:~:text=You%20can%20do%20this%20by%20constructing%20a%20matrix%20with%20the%20difference.&text=Looking%20at%20this%20matrix%2C%20you,the%20topics'%20intersection%20and%20difference.

def plot_difference_plotly(mdiff, title="", annotation=None):
    """Build a plotly heatmap figure of a topic-difference matrix.

    Parameters
    ----------
    mdiff : 2-D array-like
        Matrix of pairwise topic distances, used as the heatmap's ``z`` values.
    title : str, optional
        Figure title.
    annotation : nested sequence, optional
        For every cell, a pair ``(int_tokens, diff_tokens)`` of token lists.
        Each pair is rendered as the cell's text: the first list after
        "+++", the second after "---".

    Returns
    -------
    plotly figure
        The heatmap figure; it is returned, not displayed.
    """
    # Local import: `go` is not imported anywhere at notebook level, so the
    # function would otherwise fail with a NameError.
    import plotly.graph_objs as go

    annotation_html = None
    if annotation is not None:
        annotation_html = [
            [
                "+++ {}<br>--- {}".format(", ".join(int_tokens), ", ".join(diff_tokens))
                for (int_tokens, diff_tokens) in row
            ]
            for row in annotation
        ]

    data = go.Heatmap(z=mdiff, colorscale='RdBu', text=annotation_html)
    layout = go.Layout(width=950, height=950, title=title, xaxis=dict(title="topic"), yaxis=dict(title="topic"))
    fig = go.Figure(data=[data], layout=layout)
    return fig

def plot_difference_matplotlib(mdiff, title="", annotation=None):
    """Draw a topic-difference matrix as a matplotlib heatmap and show it.

    ``annotation`` is accepted so the signature matches the plotly variant,
    but it is not used. The function shows the figure and returns None.
    """
    figure, axes = plt.subplots(figsize=(18, 14))
    heatmap = axes.imshow(mdiff, cmap='RdBu_r', origin='lower')
    axes.set_title(title)
    figure.colorbar(heatmap, ax=axes)
    plt.show()

# Choose the plotting backend for the difference matrices.
# If get_ipython() is callable and plotly.offline imports, use the plotly
# heatmap; any exception raised by either step selects the matplotlib
# fallback instead.
try:
    get_ipython()
    import plotly.offline as py
except Exception:
    # `py` is not bound on this path
    plot_difference = plot_difference_matplotlib
else:
    # Runs only when the try body raised nothing
    py.init_notebook_mode()
    plot_difference = plot_difference_plotly

In [None]:
# Difference matrix
# Industrial vs. Recreation
# Jaccard distance between the two models' topics, using 50 words per topic
mdiff, annotation = lda_model_in.diff(lda_model_re, distance='jaccard', num_words=50)
diff1 = plot_difference(mdiff, title="Topic difference Industrial vs. Recreation", annotation=annotation)

# The matplotlib fallback shows the figure itself and returns None, and `py`
# is not bound in that case, so only export to HTML when a figure came back.
if diff1 is not None:
    py.plot(diff1, filename='indrec_topic_diff.html')
diff1

In [None]:
# Difference matrix
# Industrial vs. Transportation
# Jaccard distance between the two models' topics, using 50 words per topic
mdiff, annotation = lda_model_in.diff(lda_model_tr, distance='jaccard', num_words=50)
diff2 = plot_difference(mdiff, title="Topic difference Industrial vs. Transportation", annotation=annotation)

# The matplotlib fallback shows the figure itself and returns None, and `py`
# is not bound in that case, so only export to HTML when a figure came back.
if diff2 is not None:
    py.plot(diff2, filename='indtra_topic_diff.html')
diff2