#`5430 NLP | SPRING 2021 | ASSIGNMENT 8 | UNI: CHB2132 `#
---

**A. Write a Python program based on the Week 8 class exercises, which:**

* **Implement:** LDA training and topic modeling on a dataset of deduplicated Webhose feeds
  * *Modify min_df, max_df, max_features and max_iter (sklearn) to achieve the best results*

* **Submit:** Code + Output a set of n topic clusters with up to 10 keywords per cluster
   * *Clusters should not overlap, and keywords should be allowed to approximate the meaning*

#### *Library imports and utility text cleanup function*
---

In [None]:
!pip install sklearn
!pip install pyLDAvis
!pip install ipython==7.10.0

In [None]:
import warnings
# Install an "ignore" filter so DeprecationWarning messages are not shown
# in the output of the cells that follow.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
import matplotlib
import pandas as pd
import json, re, requests

In [None]:
import pyLDAvis
import pyLDAvis.sklearn
# Put pyLDAvis in notebook mode so prepared visualizations render inline.
pyLDAvis.enable_notebook()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
import nltk
import string

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

In [None]:
# Re-import the corpus reader under a private alias. Assigning the set to the
# name `stopwords` replaces the reader imported earlier, so without the alias
# re-running this cell fails with "'set' object has no attribute 'words'".
from nltk.corpus import stopwords as _stopwords_corpus

# Sets give constant-time membership checks when filtering tokens.
stopwords = set(_stopwords_corpus.words('english'))
punctuation = set(string.punctuation)

#### Read dataset of feeds into an array and grab all titles
---

In [None]:
apple_feeds = []

# The file is parsed as JSON Lines: one JSON document per line.
# The encoding is set explicitly so decoding does not depend on the platform
# default; the file is streamed line by line instead of loaded in one go.
with open('/content/dedupes.json', 'r', encoding='utf-8') as f:
  for line in f:
    # A blank line (e.g. a trailing newline at EOF) is not valid JSON; skip it.
    if line.strip():
      apple_feeds.append(json.loads(line))

feed_titles = [feed['title'] for feed in apple_feeds]
print(f"Total number of titles: {len(feed_titles)}")

In [None]:
# tokenize titles
def tokenize_titles(title):
    """Tokenize a feed title into lowercase, stopword-free verb lemmas.

    The title is split with ``nltk.word_tokenize``. Each token is lowercased,
    has its possessive/contraction suffix expanded, and is stripped of every
    character other than ASCII letters, digits and spaces. Words found in the
    module-level ``stopwords`` set are dropped; the remaining words are
    lemmatized as verbs with the WordNet lemmatizer.

    Args:
        title: Raw title text.

    Returns:
        A list of lemma strings in title order. It never contains empty
        strings or strings with surrounding whitespace.
    """
    lmtzr = WordNetLemmatizer()
    filtered_tokens = []

    for token in nltk.word_tokenize(title):
        # Lowercase BEFORE the stopword test: the stopword list is lowercase,
        # so a capitalized "The" would otherwise slip through. Curly
        # apostrophes are folded to straight ones so both spellings of a
        # contraction are handled the same way.
        token = token.lower().replace("’", "'")
        token = token.replace("n't", " not").replace("'ve", " have").replace("'s", " ")
        token = re.sub(r'[^a-z0-9 ]', '', token)
        # Expansion can produce several words and stripping can produce none
        # (pure punctuation); split() covers both and trims the padding.
        for word in token.split():
            if word not in stopwords:
                filtered_tokens.append(word)

    return [lmtzr.lemmatize(word, 'v') for word in filtered_tokens]

In [None]:
# term-document matrix
# Raw term counts over the titles. A term is a run of 3+ ASCII letters (or an
# n-gram of up to 8 such terms); English stop words are removed. Terms must
# occur in at least 2 and at most 888 titles, and only the 500 most frequent
# survivors are kept as features.
tf_vectorizer = CountVectorizer(
    ngram_range=(1, 8),
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    # tokenizer=tokenize_titles,
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    min_df=2,
    max_df=888,
    max_features=500,
)

# Learn the vocabulary and build the sparse document-term matrix in one pass.
dtm_tf = tf_vectorizer.fit_transform(feed_titles)

print(dtm_tf.shape)

#### *Cluster with LDA*
---

In [None]:
# clustering w/ LDA with online/real-time learning method
# 10 topics, up to 200 passes; the fixed seed makes the fit reproducible.
lda_tf = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    max_iter=200,
    random_state=0,
)

lda_tf.fit(dtm_tf)

In [None]:
topics = dict()
n_top_words = 10

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when present, and fall back for older installs.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

# Each row of components_ holds one topic's weight for every vocabulary term.
for topic_idx, topic in enumerate(lda_tf.components_):
    # argsort is ascending, so the reversed tail slice yields the indices of
    # the n_top_words heaviest terms, heaviest first. Computed once and reused
    # for both the stored list and the printed line.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    topics[topic_idx] = [tf_feature_names[i] for i in top_indices]
    print("Topic #%d:" % topic_idx)
    print(" | ".join(topics[topic_idx]))

In [None]:
# Build the pyLDAvis visualization from the fitted LDA model, the document-term
# matrix and the vectorizer. As the cell's final expression, the returned
# object is what the notebook displays.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

#### *Find Dominant Topic for All (Or 10 random) Titles*
---

In [None]:
# Create a document to topic matrix
# (one row per row of dtm_tf, one column per LDA topic)
lda_output = lda_tf.transform(dtm_tf)

In [None]:
# column names: "Topic_0" ... "Topic_<n_components - 1>"
topicnames = [f'Topic_{i}' for i in range(lda_tf.n_components)]

In [None]:
# index names: one "Doc_<position>" label per title
docnames = [f'Doc_{i}' for i in range(len(feed_titles))]

In [None]:
import numpy as np

# create dataframe with topicval and dominant topic for each title
# (topic weights are rounded to 2 decimals for display only)
feed_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Take the dominant topic from the UNROUNDED weights: rounding can turn two
# close weights into an artificial tie, which argmax would then resolve to the
# lowest topic index rather than the truly heaviest topic.
feed_topic['dominant_topic'] = np.argmax(lda_output, axis=1)

#feed_topic.head()
feed_topic['dominant_topic']