# Topic modelling

In [15]:
import os
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation
import nltk

In [2]:
data = pd.read_csv('abcnews-date-text.csv')
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

## Preprocess

In [3]:
# Load Dataset
df = pd.read_csv('abcnews-date-text.csv')
docs= df['headline_text']
df.head()


Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [None]:
# Download the stopwords list if you haven't already
nltk.download('stopwords')

0          aba decides against community broadcasting lic...
1             act fire witnesses must be aware of defamation
2             a g calls for infrastructure protection summit
3                   air nz staff in aust strike for pay rise
4              air nz strike to affect australian travellers
                                 ...                        
1244179    two aged care residents die as state records 2...
1244180    victoria records 5;919 new cases and seven deaths
1244181      wa delays adopting new close contact definition
1244182    western ringtail possums found badly dehydrate...
1244183    what makes you a close covid contact here are ...
Name: headline_text, Length: 1244184, dtype: object

In [1]:
# tokenization
first_text_list = nltk.word_tokenize(docs)
#Stop word removal
#stopwords = nltk.corpus.stopwords.words('english')
# Lemmatization


## Latent Dirichlet Allocation 

In [4]:
# Initialize regex tokenizer
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize document using TF-IDF
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range = (1,1),
                        tokenizer = tokenizer.tokenize)

# Fit and Transform the documents
train_data = tfidf.fit_transform(docs) 



In [6]:
# Define the number of topics or components
num_components=5

# Create LDA object
model=LatentDirichletAllocation(n_components=num_components)

# Fit and Transform SVD model on data
lda_matrix = model.fit_transform(train_data)

# Get Components 
lda_components=model.components_

In [13]:
# Print the topics with their terms
terms = tfidf.get_feature_names_out()

for index, component in enumerate(lda_components):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['says', 'election', 'new', 'interview', 'labor', 'minister', 'pm']
Topic 1:  ['govt', 'water', 'new', 'council', 'market', 'nsw', 'plan']
Topic 2:  ['win', 'drum', 'tigers', 'final', 'league', 'sydney', 'strike']
Topic 3:  ['interview', 'world', 'australia', 'cup', 'day', 'new', 'test']
Topic 4:  ['police', 'man', 'crash', 'charged', 'court', 'murder', 'car']


## Topic Visualization

In [19]:
from sklearn.manifold import TSNE
tsne_lsa_model = TSNE(n_components=2, perplexity=50, learning_rate=100, 
                        n_iter=2000, verbose=1, random_state=0, angle=0.75)
tsne_lsa_vectors = tsne_lsa_model.fit_transform(lda_matrix)


[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1244184 samples in 9.380s...
[t-SNE] Computed neighbors for 1244184 samples in 289.939s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1244184
[t-SNE] Computed conditional probabilities for sample 2000 / 1244184
[t-SNE] Computed conditional probabilities for sample 3000 / 1244184
[t-SNE] Computed conditional probabilities for sample 4000 / 1244184
[t-SNE] Computed conditional probabilities for sample 5000 / 1244184
[t-SNE] Computed conditional probabilities for sample 6000 / 1244184
[t-SNE] Computed conditional probabilities for sample 7000 / 1244184
[t-SNE] Computed conditional probabilities for sample 8000 / 1244184
[t-SNE] Computed conditional probabilities for sample 9000 / 1244184
[t-SNE] Computed conditional probabilities for sample 10000 / 1244184
[t-SNE] Computed conditional probabilities for sample 11000 / 1244184
[t-SNE] Computed conditional probabilities for sample 12000 / 1244184
[t-SNE] Computed co

KeyboardInterrupt: 

In [None]:
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5" ])
colormap = colormap[:n_topics]

top_3_words_lsa = get_top_n_words(3, lsa_keys, small_document_term_matrix, small_count_vectorizer)
lsa_mean_topic_vectors = get_mean_topic_vectors(lsa_keys, tsne_lsa_vectors)

plot = figure(title="t-SNE Clustering of {} LSA Topics".format(n_topics), plot_width=700, plot_height=700)
plot.scatter(x=tsne_lsa_vectors[:,0], y=tsne_lsa_vectors[:,1], color=colormap[lsa_keys])

for t in range(n_topics):
    label = Label(x=lsa_mean_topic_vectors[t][0], y=lsa_mean_topic_vectors[t][1], 
                  text=top_3_words_lsa[t], text_color=colormap[t])
    plot.add_layout(label)
    
show(plot)

## Optimal number of topics selection
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4597325/
https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html
https://stackoverflow.com/questions/17421887/how-to-determine-the-number-of-topics-for-lda?rq=4
https://investigate.ai/text-analysis/choosing-the-right-number-of-topics-for-a-scikit-learn-topic-model/
https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py


## How do these topics evolve through time in the ABC news headlines?