<a href="https://colab.research.google.com/github/andac-demir/Topic-Visualization/blob/master/topicVis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Necessary Modules

In [0]:
# Scikit Learn Dataset Loader
from sklearn.datasets import fetch_20newsgroups

# Natural Language Toolkit
import nltk

# Stop words list from NLTK
from nltk.corpus import stopwords

# Tokenizer from NLTK
from nltk.tokenize import RegexpTokenizer

# Lemmatizer and Stemmer from NLTK
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# List of punctuation from string
from string import punctuation

# Numpy/plotting
import matplotlib.pyplot as plt
import numpy as np
import scipy

# Pandas
import pandas as pd

In [0]:
# Download the NLTK "popular" resources (only needed if not downloaded yet)
nltk.download("popular")

# Download Dataset

In [0]:
# Two copies of the full corpus: the raw posts, and the posts with
# headers, footers and quoted text removed.
dataset = fetch_20newsgroups(shuffle=True, subset='all')
dataset_body_only = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    remove=('headers', 'footers', 'quotes'),
)

In [0]:
# Peek at the first raw (uncleaned) document
dataset.data[0]

# Part 1
Preprocess the 20 Newsgroups dataset as a corpus and visualize its statistical information

In [0]:
# Utility functions for cleaning up text
# Note: Each function operates on one level of text (either a string of words or a list of tokenized words)
import itertools
def flatmap(f, items):
  """Apply `f` to every item and lazily chain the resulting iterables together."""
  mapped = map(f, items)
  return itertools.chain.from_iterable(mapped)

import re

def remove_emails_and_hostnames(text):
  """Return `text` with e-mail-like and hostname-like tokens deleted.

  A token is removed when it is a whitespace-free run that either contains
  an `@` with characters on both sides, or contains at least three dots
  with characters around each of them (e.g. `www.cs.mit.edu`). Only the
  token is removed; the whitespace around it is kept.
  """
  # Raw string: `\S` and `\.` are regex escapes, not Python string escapes.
  return re.sub(r'(\S+@\S+|\S+\.\S+\.\S+\.\S+)', '', text)

def split_by_sentence(text):
  """Break `text` into pieces at every period; the periods are dropped."""
  sentences = text.split('.')
  return sentences

def split_by_line(text):
  """Break `text` into its lines, without the line terminators."""
  lines = text.splitlines()
  return lines

def remove_punctuation(text):
  """Return `text` with every character found in `string.punctuation` removed."""
  kept = filter(lambda ch: ch not in punctuation, text)
  return "".join(kept)

# Token pattern alternatives, tried in order: a run of word characters, a
# dollar sign followed by digits/dots, or any run of non-whitespace characters.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
def tokenize(text):
  """Split `text` into tokens using the module-level `tokenizer`."""
  tokens = tokenizer.tokenize(text)
  return tokens

def lower(text):
  """Return a lower-cased copy of `text`."""
  lowered = text.lower()
  return lowered

_ENGLISH_STOPWORDS = None  # frozenset of NLTK English stop words, built on first use

def remove_stopwords(text):
  """Lazily filter English stop words out of an iterable of tokens.

  The stop-word list is loaded once and cached as a frozenset, so each
  token costs one hash lookup instead of a fresh `stopwords.words(...)`
  call followed by a linear scan of the resulting list.

  Args:
    text: iterable of tokens (strings).

  Returns:
    A lazy iterator over the tokens that are not English stop words.
  """
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  stop_set = _ENGLISH_STOPWORDS
  return filter(lambda w : w not in stop_set, text)

lemmatizer = WordNetLemmatizer()
def lemmatize(text):
  """Lazily pass every token of `text` through the WordNet lemmatizer."""
  lemmatize_token = lemmatizer.lemmatize
  return map(lemmatize_token, text)

stemmer = PorterStemmer()
def stem(text):
  """Lazily pass every token of `text` through the Porter stemmer."""
  stem_token = stemmer.stem
  return map(stem_token, text)

def rejoin_words(text):
  """Glue an iterable of tokens back into one space-separated string."""
  separator = " "
  return separator.join(text)

In [0]:
# Try the lemmatizing pipeline on the first raw document
rejoin_words(
    lemmatize(
        remove_stopwords(
            tokenize(
                lower(
                    remove_punctuation(dataset.data[0]))))))

In [0]:
# Apply text cleanup to bundle, in order
def cleanup_text(text):
  """Clean one text: strip punctuation, lower-case, tokenize, drop stop words, lemmatize, re-join."""
  stripped = remove_punctuation(text)
  tokens = tokenize(lower(stripped))
  lemmas = lemmatize(remove_stopwords(tokens))
  return rejoin_words(lemmas)

def cleanup_text_stemmer(text):
  """Clean one text like `cleanup_text`, but stem the tokens instead of lemmatizing them."""
  stripped = remove_punctuation(text)
  tokens = tokenize(lower(stripped))
  stems = stem(remove_stopwords(tokens))
  return rejoin_words(stems)

# One cleaned string per document (lazy, single-pass)
cleaned_dataset_by_document_iter = map(
    cleanup_text,
    map(remove_emails_and_hostnames, dataset.data),
)
# One cleaned string per sentence (lazy, single-pass); sentences that are
# empty or whitespace-only after cleaning are skipped
cleaned_dataset_by_sentence_iter = filter(
    lambda s : s and not s.isspace(),
    map(
        cleanup_text,
        flatmap(split_by_sentence, map(remove_emails_and_hostnames, dataset.data)),
    ),
)

In [0]:
# Compare cleaned and uncleaned datasets!

#print("ORIGINAL:")
#print(repr(dataset.data[0]))
#print("CLEANED:")
#print(next(cleaned_dataset_by_document_iter))
#print("CLEANED BY SENTENCE:")
#print(next(cleaned_dataset_by_sentence_iter))

In [0]:
# Load sentence length data into memory
# (length = number of space-separated tokens in each cleaned sentence)
sentence_lengths_iter = (len(s.split(" ")) for s in cleaned_dataset_by_sentence_iter)
sentence_lengths_array = np.fromiter(sentence_lengths_iter, dtype=int)

In [0]:
# Filter out outliers: keep lengths strictly within two standard deviations of the mean
avg = sentence_lengths_array.mean()
std_dev = sentence_lengths_array.std()
keep_condition = np.abs(sentence_lengths_array - avg) < 2 * std_dev
sentence_lengths_array_filtered = sentence_lengths_array[keep_condition]

In [0]:
# Describe sentence length distribution
# `import scipy` alone does not guarantee that the `stats` submodule is
# loaded, so import it explicitly instead of relying on another package
# having imported it as a side effect.
import scipy.stats
scipy.stats.describe(sentence_lengths_array_filtered)

In [0]:
# Sentence length histogram; the bin count equals the largest kept sentence length
plt.figure(figsize=(16, 12))
n, bins, patches = plt.hist(
    sentence_lengths_array_filtered,
    bins=sentence_lengths_array_filtered.max(),
)

In [0]:
# Drop the unfiltered array; only the filtered copy is kept around
del sentence_lengths_array

# Part 2
Build two different vocabularies using two different preprocessing approaches; learn a Bag-of-Words (BoW) model and a TF-IDF model for each vocabulary.

In [0]:
# Define two different cleaned corpora (lazy, single-pass iterators):
#   corpus1 - full posts, corpus2 - post bodies only
corpus1_iter = map(
    cleanup_text,
    map(remove_emails_and_hostnames, dataset.data),
)
corpus2_iter = map(
    cleanup_text,
    map(remove_emails_and_hostnames, dataset_body_only.data),
)

In [0]:
# Bag-Of-Words
from sklearn.feature_extraction.text import CountVectorizer

# Fit one CountVectorizer per corpus and build its document-term count matrix.
# Each corpus is a single-pass `map` iterator, so vocabulary learning and
# counting have to happen in one fit_transform call; the iterators are
# exhausted afterwards.
vectorizer1 = CountVectorizer()
bow1 = vectorizer1.fit_transform(corpus1_iter)

vectorizer2 = CountVectorizer()
bow2 = vectorizer2.fit_transform(corpus2_iter)

# Bag-Of-Words 1 Information

In [0]:
# Number of distinct terms learned from corpus 1
vocab_size_bow1 = len(vectorizer1.vocabulary_)
print("Vocab length: {}".format(vocab_size_bow1))

In [0]:
# Plot the corpus-wide counts of the N_MOST_COMMON most frequent terms of corpus 1.
N_MOST_COMMON = 200
# Sum every vocabulary column over all documents, sort ascending, reverse to
# descending order, keep the first N_MOST_COMMON entries and transpose.
# NOTE(review): the chained [0, ...] indexing presumably relies on np.sum
# returning a 2-D (1 x vocab) np.matrix for the sparse input - confirm.
counts = np.sort(np.sum(bow1, axis=0))[0,::-1][0,0:N_MOST_COMMON].transpose()
# x positions 0..len(counts)-1, i.e. the frequency rank of each term
bar_positions = np.arange(len(counts))
plt.figure(figsize=(16, 12))
plt.plot(bar_positions, counts)
plt.ylabel("# of occurences")

In [0]:
# Most common words:
# Column indices of the 10 largest corpus-wide counts, largest first
indices = np.sum(bow1, axis=0).argsort().transpose()[-10:][::-1]
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use the new accessor when present, else the old one.
feature_names1 = (getattr(vectorizer1, "get_feature_names_out", None)
                  or vectorizer1.get_feature_names)()
print(np.array(feature_names1)[indices])

# Bag-Of-Words 2 Information

In [0]:
# Number of distinct terms learned from corpus 2
vocab_size_bow2 = len(vectorizer2.vocabulary_)
print("Vocab length: {}".format(vocab_size_bow2))

In [0]:
# Plot the corpus-wide counts of the N_MOST_COMMON most frequent terms of corpus 2.
N_MOST_COMMON = 200
# Sum every vocabulary column over all documents, sort ascending, reverse to
# descending order, keep the first N_MOST_COMMON entries and transpose.
# NOTE(review): the chained [0, ...] indexing presumably relies on np.sum
# returning a 2-D (1 x vocab) np.matrix for the sparse input - confirm.
counts = np.sort(np.sum(bow2, axis=0))[0,::-1][0,0:N_MOST_COMMON].transpose()
# x positions 0..len(counts)-1, i.e. the frequency rank of each term
bar_positions = np.arange(len(counts))
plt.figure(figsize=(16, 12))
plt.plot(bar_positions, counts)
plt.ylabel("# of occurences")

In [0]:
# Most common words:
# Column indices of the 20 largest corpus-wide counts, largest first
indices = np.sum(bow2, axis=0).argsort().transpose()[-20:][::-1]
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use the new accessor when present, else the old one.
feature_names2 = (getattr(vectorizer2, "get_feature_names_out", None)
                  or vectorizer2.get_feature_names)()
print(np.array(feature_names2)[indices])

# TF-IDF

In [0]:
# Imports
from sklearn.feature_extraction.text import TfidfTransformer

## TF-IDF 1 Information

## TF-IDF 2 Information

In [0]:
# Fit TF-IDF on the body-only counts and tabulate the per-term IDF weights
transformer2 = TfidfTransformer()
tf_idf2 = transformer2.fit_transform(bow2)
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use the new accessor when present, else the old one.
feature_names2 = (getattr(vectorizer2, "get_feature_names_out", None)
                  or vectorizer2.get_feature_names)()
idf2_df = pd.DataFrame(transformer2.idf_, index=feature_names2, columns=["idf_weights"])
# Ascending sort: the ten terms with the smallest IDF weight come first
idf2_df.sort_values(by=['idf_weights']).head(10)

In [0]:
# TF_IDF info for each document is in tf_idf2
# Looking at first document:
print(dataset_body_only.data[0])

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use the new accessor when present, else the old one.
feature_names2 = (getattr(vectorizer2, "get_feature_names_out", None)
                  or vectorizer2.get_feature_names)()
# Row 0 of the TF-IDF matrix as a dense column, indexed by term
first_document_tf_idf_df2 = pd.DataFrame(tf_idf2[0].T.todense(), index=feature_names2, columns=["tf-idf value"])
# The five highest-weighted terms of this document
first_document_tf_idf_df2.sort_values(by=["tf-idf value"], ascending=False).head(5)

In [0]:
# Looking at the second document
print(dataset_body_only.data[1])

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use the new accessor when present, else the old one.
feature_names2 = (getattr(vectorizer2, "get_feature_names_out", None)
                  or vectorizer2.get_feature_names)()
# Row 1 of the TF-IDF matrix as a dense column, indexed by term
second_document_tf_idf_df2 = pd.DataFrame(tf_idf2[1].T.todense(), index=feature_names2, columns=["tf-idf value"])
# The five highest-weighted terms of this document
second_document_tf_idf_df2.sort_values(by=["tf-idf value"], ascending=False).head(5)

In [0]:
# Top 10 terms per newsgroup, ranked by the mean TF-IDF weight over that
# newsgroup's documents (highest first).

# Build the term array once instead of once per newsgroup.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use the new accessor when present, else the old one.
feature_names2 = np.array((getattr(vectorizer2, "get_feature_names_out", None)
                           or vectorizer2.get_feature_names)())

top_words = []
for i, _ in enumerate(dataset_body_only.target_names):
  # Mean TF-IDF weight of every term over the documents labelled i
  class_mean = tf_idf2[dataset_body_only.target == i].mean(axis=0)
  # Indices of the 10 largest means, largest first
  top_indices = class_mean.argsort().T[-10:][::-1]
  top_words.append(feature_names2[top_indices])

# One column per newsgroup, one row per rank
top_words_df = pd.DataFrame(np.array(top_words).squeeze().T, columns=dataset_body_only.target_names)

In [0]:
# Display the per-newsgroup table of top TF-IDF terms (one column per newsgroup)
top_words_df

# Part 3 LDA Models

Train two LDA models upon the vocabularies in Step 2; Visualize topics with four different
methods; and eventually get the topic distribution (as feature) for each document.

## 3.1. LDA Model with Bag of Words

In [0]:
# Define two different cleaned corpora again: `map` iterators can be
# consumed only once, and the ones created for Part 2 were used up there.
corpus1_iter = map(
    cleanup_text,
    map(remove_emails_and_hostnames, dataset.data),
)
corpus2_iter = map(
    cleanup_text,
    map(remove_emails_and_hostnames, dataset_body_only.data),
)

In [0]:
# Split every cleaned body-only document into its whitespace-separated tokens
# (this consumes `corpus2_iter`)
data_lemmatized = [cleaned_doc.split() for cleaned_doc in corpus2_iter]

In [0]:
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
import gensim.corpora as corpora
!pip install pyLDAvis
import pyLDAvis
from pyLDAvis.gensim import prepare
import warnings

# Build the token <-> id dictionary from the tokenised documents
id2word = corpora.Dictionary(data_lemmatized)
# Prune the vocabulary with the thresholds below (per the original note,
# no_below=15 drops tokens that appear in fewer than 15 documents)
id2word.filter_extremes(keep_n=100000, no_above=0.5, no_below=15)
# Bag-of-words representation of every document under the pruned dictionary
bow_corpus = list(map(id2word.doc2bow, data_lemmatized))

In [0]:
from gensim.models import LdaMulticore

# Train a 10-topic LDA model on the bag-of-words corpus, with
# DeprecationWarnings silenced for the duration of the training call
with warnings.catch_warnings():
  warnings.filterwarnings("ignore", category=DeprecationWarning)
  lda_model_bow = LdaMulticore(
      bow_corpus,
      id2word=id2word,
      num_topics=10,
      passes=2,
      workers=2,
  )

## PyLDAVis

In [0]:
# visualize the information contained in the topic model

import pyLDAvis.gensim
# Render the interactive pyLDAvis view of the bag-of-words LDA model inline
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    lda_model_bow,
    bow_corpus,
    dictionary=id2word,
)
vis

## t-SNE Clustering


In [0]:
# Get topic weights and dominant topics ------------
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook

# Get topic weights
# matplotlib.colors is only imported by the later "Word Clouds" cell, so
# bring it into scope here to keep this cell runnable in notebook order.
import matplotlib.colors as mcolors

n_topics = lda_model_bow.num_topics

# One dense row per document. The model reports each document as a list of
# (topic_id, probability) pairs and may leave out low-probability topics, so
# every weight is written into the column of its own topic id. Packing the
# weights left-to-right would misalign columns and give wrong dominant topics.
topic_weights = []
for doc_topics in lda_model_bow.get_document_topics(bow_corpus, minimum_probability=0.0):
    row = [0.0] * n_topics
    for topic_id, weight in doc_topics:
        row[topic_id] = weight
    topic_weights.append(row)

# Array of topic weights, shape (n_documents, n_topics)
arr = np.array(topic_weights, dtype=float).reshape(-1, n_topics)

# Keep the well separated points (optional)
arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

# Plot the Topic Clusters using Bokeh, one colour per dominant topic
output_notebook()
mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics),
              plot_width=900, plot_height=700)
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
show(plot)

## Word Clouds

In [0]:
# 1. Wordcloud of Top N words in each topic
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
from nltk.corpus import stopwords

# Draw one word cloud per topic of the bag-of-words LDA model on a 2 x 5 grid.

# NLTK's English stop words plus extra words to leave out of the clouds
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 
                   'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 
                   'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 
                   'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 
                   'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# NOTE: color_func reads the global loop variable `i` when it is called, so
# every cloud is coloured with cols[i] of the topic currently being drawn.
# Any loop that draws with this `cloud` object must name its index `i`.
cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

# Unformatted topics: element [1] of each entry is turned into a dict below
# (presumably word -> weight pairs - confirm against gensim's show_topics)
topics = lda_model_bow.show_topics(formatted=False)

fig, axes = plt.subplots(2, 5, figsize=(20,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    # Make `ax` the current axes so the plt.gca() calls below target it
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

## 3.2. LDA Model with TF-IDF

In [0]:
from gensim.models import TfidfModel

# Re-weight the bag-of-words corpus with TF-IDF, then train a second
# 10-topic LDA model on the re-weighted corpus
tfidf = TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

with warnings.catch_warnings():
  warnings.filterwarnings("ignore", category=DeprecationWarning)
  lda_model_tfidf = LdaMulticore(
      corpus_tfidf,
      id2word=id2word,
      num_topics=10,
      passes=2,
      workers=4,
  )

## PyLDAVis

In [0]:
# visualize the information contained in the topic model

# Render the interactive pyLDAvis view of the TF-IDF LDA model inline
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    lda_model_tfidf,
    corpus_tfidf,
    dictionary=id2word,
)
vis

## t-SNE Clustering

In [0]:
# Get topic weights from the TF-IDF LDA model. This section previously
# re-used the bag-of-words model and corpus, which only reproduced the
# plot of section 3.1.
n_topics = lda_model_tfidf.num_topics

# One dense row per document. The model reports each document as a list of
# (topic_id, probability) pairs and may leave out low-probability topics, so
# every weight is written into the column of its own topic id. Packing the
# weights left-to-right would misalign columns and give wrong dominant topics.
topic_weights = []
for doc_topics in lda_model_tfidf.get_document_topics(corpus_tfidf, minimum_probability=0.0):
    row = [0.0] * n_topics
    for topic_id, weight in doc_topics:
        row[topic_id] = weight
    topic_weights.append(row)

# Array of topic weights, shape (n_documents, n_topics)
arr = np.array(topic_weights, dtype=float).reshape(-1, n_topics)

# Keep the well separated points (optional)
arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

# Plot the Topic Clusters using Bokeh, one colour per dominant topic
output_notebook()
mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics),
              plot_width=900, plot_height=700)
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
show(plot)

## Word Clouds

In [0]:
# Draw one word cloud per topic of the TF-IDF LDA model on a 2 x 5 grid,
# reusing the `cloud` object configured in the earlier Word Clouds cell.
topics = lda_model_tfidf.show_topics(formatted=False)

fig, axes = plt.subplots(2, 5, figsize=(20,10), sharex=True, sharey=True)

# NOTE: the loop index must stay named `i`; the colour function of `cloud`
# reads the global `i` to pick the colour of the topic being drawn.
for i, ax in enumerate(axes.flatten()):
    # Make `ax` the current axes so the plt.gca() calls below target it
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

# Part 4 Doc2Vec
Train two Doc2Vec models upon the vocabularies in Step 2; Visualize your learned word and document embedding space; Collect Doc2Vec representation of each document.

In [0]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [0]:
# Load documents into memory
# `corpus2_iter` is a single-pass `map` iterator that was already consumed
# when `data_lemmatized` was built in Part 3; iterating it again would yield
# no documents and leave Doc2Vec with an empty corpus. Rebuild the cleaned
# body-only corpus before tagging each document with its index.
corpus2_iter = map(cleanup_text, map(remove_emails_and_hostnames, dataset_body_only.data))
gensim_mapped_documents = [TaggedDocument(d.split(" "), [i]) for i, d in enumerate(corpus2_iter)]

In [0]:
WINDOW_SIZE = 4

# Configure Doc2Vec, build its vocabulary from the tagged documents, then
# train for the number of epochs stored on the model
model = Doc2Vec(seed=42, epochs=25, window=WINDOW_SIZE)
model.build_vocab(gensim_mapped_documents)
model.train(
    gensim_mapped_documents,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [0]:
# Collect the learned vector of every document, in tag order (tags are 0..n-1)
doc_tags = range(len(gensim_mapped_documents))
doc2vec_array = np.array([model.docvecs[tag] for tag in doc_tags])

In [0]:
# Doing PCA of Doc2Vec data
from sklearn.decomposition import PCA

In [0]:
# Try standardizing the features?
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standardized_doc2vec_array = scaler.fit_transform(doc2vec_array)

In [0]:
# Project the standardized Doc2Vec vectors onto their first two principal components
pca = PCA(n_components=2)
components = pca.fit_transform(standardized_doc2vec_array)

In [0]:
# The two PCA components as named columns
principle_df = pd.DataFrame(data=components, columns=["c1", "c2"])

In [0]:
# Put each document's newsgroup label next to its PCA coordinates
target_df = pd.DataFrame(dataset_body_only.target, columns=["target"])
principle_df_with_targets = pd.concat([principle_df, target_df], axis=1)

In [0]:
# Scatter plot of the two PCA components, one colour per newsgroup
fig = plt.figure(figsize=(16, 16))
ax = fig.add_subplot(1, 1, 1)
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
targets = dataset_body_only.target_names
colors = plt.cm.rainbow(np.linspace(0, 1, len(targets)))
# `colors` has one row per newsgroup, so its row index is the target id
for target, color in enumerate(colors):
    indicesToKeep = principle_df_with_targets['target'] == target
    group = principle_df_with_targets.loc[indicesToKeep]
    ax.scatter(group['c1'], group['c2'], c=[color], s=50)
ax.legend(targets)
ax.grid()

In [0]:
# Hm, that is weird. What about LDA? (Here LDA means Linear Discriminant Analysis, not the topic model from Part 3.)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [0]:
# Supervised projection: fit Linear Discriminant Analysis on the raw Doc2Vec
# vectors using the newsgroup labels, keeping two components
lda = LinearDiscriminantAnalysis(n_components=2)
components = lda.fit_transform(doc2vec_array, dataset_body_only.target)

In [0]:
# The two LDA components as named columns
lda_principle_df = pd.DataFrame(data=components, columns=["c1", "c2"])

In [0]:
# Put each document's newsgroup label next to its LDA coordinates. This must
# use `lda_principle_df`; using the PCA frame `principle_df` here would make
# the LDA plot redraw the PCA projection.
lda_principle_df_with_targets = pd.concat([lda_principle_df, pd.DataFrame(dataset_body_only.target, columns=["target"])], axis=1)

In [0]:
# Scatter plot of the two LDA components, one colour per newsgroup
fig = plt.figure(figsize = (16,16))
ax = fig.add_subplot(1,1,1)
# The axes are linear discriminants here, not principal components
ax.set_xlabel('Linear Discriminant 1', fontsize = 15)
ax.set_ylabel('Linear Discriminant 2', fontsize = 15)
ax.set_title('2 component LDA', fontsize = 20)
targets = dataset_body_only.target_names
colors = plt.cm.rainbow(np.linspace(0,1,len(targets)))
for target, color in zip(range(len(targets)),colors):
    indicesToKeep = lda_principle_df_with_targets['target'] == target
    ax.scatter(lda_principle_df_with_targets.loc[indicesToKeep, 'c1'], lda_principle_df_with_targets.loc[indicesToKeep, 'c2'], c = [color], s = 50)
ax.legend(targets)
ax.grid()

In [0]:
# Hm, maybe just comparing a couple of the targets?
# Motorcycles vs Baseball vs Hockey (target ids 8, 9 and 10; the legend
# below prints their actual names from `target_names`)

# Build the row mask once and reuse it for both the vectors and the labels
sports_mask = np.isin(dataset_body_only.target, (8, 9, 10))
doc2vec_array_sports = doc2vec_array[sports_mask]
targets_sports = dataset_body_only.target[sports_mask]
lda = LinearDiscriminantAnalysis(n_components=2)
components = lda.fit_transform(doc2vec_array_sports, targets_sports)
principle_df = pd.DataFrame(components, columns=["c1", "c2"])
principle_df_with_targets = pd.concat([principle_df, pd.DataFrame(targets_sports, columns=["target"])], axis=1)


fig = plt.figure(figsize = (16,16))
ax = fig.add_subplot(1,1,1)
# This projection comes from LDA, so label it as such (not as PCA)
ax.set_xlabel('Linear Discriminant 1', fontsize = 15)
ax.set_ylabel('Linear Discriminant 2', fontsize = 15)
ax.set_title('2 component LDA', fontsize = 20)
targets = [dataset_body_only.target_names[8], dataset_body_only.target_names[9], dataset_body_only.target_names[10]]
colors = ['r', 'b', 'y']
for target, color in zip(range(len(targets)),colors):
    # `target` counts 0..2 while the stored labels are 8..10
    indicesToKeep = principle_df_with_targets['target'] == target + 8
    ax.scatter(principle_df_with_targets.loc[indicesToKeep, 'c1'], principle_df_with_targets.loc[indicesToKeep, 'c2'], c = [color], s = 50)
ax.legend(targets)
ax.grid()

In [0]:
# Inverse LDA transform:
def inverse_lda(lda, x):
  """Map point(s) from the LDA-projected space back to the input feature space.

  Args:
    lda: object exposing `scalings_` (one row per input feature, one column
      per discriminant direction) and `xbar_` (the offset added back), e.g.
      a fitted LinearDiscriminantAnalysis.
    x: one projected point, or an array of points along the last axis.

  Returns:
    `x @ pinv(scalings) + xbar_`, where `scalings` holds only the first
    `x.shape[-1]` discriminant directions.
  """
  x = np.asarray(x)
  # A fitted model can hold more discriminant directions than the projection
  # kept (n_components); use only as many columns as `x` has coordinates.
  n_directions = x.shape[-1] if x.ndim else lda.scalings_.shape[1]
  inv = np.linalg.pinv(lda.scalings_[:, :n_directions])
  return np.dot(x, inv) + lda.xbar_

# Direction of new axes:
# Map one 2-D point per class from the LDA plane back into Doc2Vec space.
# NOTE(review): the coordinates look hand-picked from the scatter plot of the
# three-class LDA - confirm they still point at the intended clusters.
hockey = inverse_lda(lda, np.array([3, 7]))
baseball = inverse_lda(lda, np.array([1, -5]))
motorcycles = inverse_lda(lda, np.array([-5, 0]))

In [0]:
# Query the word vectors for the words most similar to the reconstructed `hockey` vector
model.wv.similar_by_vector(hockey)

In [0]:
# Query the word vectors for the words most similar to the reconstructed `baseball` vector
model.wv.similar_by_vector(baseball)

In [0]:
# Query the word vectors for the words most similar to the reconstructed `motorcycles` vector
model.wv.similar_by_vector(motorcycles)