## **Module 8 Assignment: LDA Topic Modeling with Scikit Learn**

**Import libraries**

In [86]:
import os
import numpy as np
import pandas as pd
import json, re, nltk

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.wordnet import WordNetLemmatizer

stopwords = set(nltk.corpus.stopwords.words('english'))

**Import deduplicated dataset**

In [9]:
os.getcwd()

'/Users/aimeetran/Desktop/COLUMBIA UNIVERSITY /FALL 2020/APAN 5430 APPLIED TEXT & NL ANALYTICS/8. Topic Modeling'

In [20]:
apple_data= []
with open('/Users/aimeetran/Desktop/COLUMBIA UNIVERSITY /FALL 2020/APAN 5430 APPLIED TEXT & NL ANALYTICS/8. Topic Modeling/deduplicated.json', 'r') as f:
    for line in f.readlines():
        apple_data.append(json.loads(line))

In [87]:
feed_titles = [feed['title'] for feed in apple_data[0]]
feed_titles[0:10]

['New iPad Air may come with USB-C not Lightning Port',
 'iOS 14 Will Reportedly Support All iPhone Models Running iOS 13',
 'iPhone Looters Being Tracked – Apple Warns Phone Thiefs',
 'Apple bug exposed user accounts to hackers',
 "French govt's StopCovid tracing app debuts on Google Play store",
 'American Companies Take An Anti-Racism Stand En Masse Amid Countrywide Riots And Protests In America',
 'iOS 13.5.1 vs iOS 13.5.5 beta 1 speed test (Video)',
 'Mr. Ranjeet Sundher reports 60% OF GLOBAL COBALT SUPPLY AT R',
 'Apple TV Users Can Now Enjoy YouTube Kids',
 'Tech giants condemn racial discrimination, George Floyd death']

## **1. Train LDA model to identify topic distribution and keywords**

**1a. Titles Tokenization**

In [23]:
def tokenize_titles(title):
    tokens = nltk.word_tokenize(title)
    lmtzr = WordNetLemmatizer()
    filtered_tokens = []
    
    for token in tokens:
        token = token.replace("'s", " ").replace("n’t", " not").replace("’ve", " have")
        token = re.sub(r'[^a-zA-Z0-9 ]', '', token)
        if token not in stopwords:
            filtered_tokens.append(token.lower())
    
    lemmas = [lmtzr.lemmatize(t,'v') for t in filtered_tokens]

    return lemmas

**1b. Parameters Selection**

In [88]:
import pandas as pd
d = {'max_features': [500, 800,1000], 'max_df': [100, 100,100],'min_df': [2, 2,3], 'max_iter':[200,200,300]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,max_features,max_df,min_df,max_iter
0,500,100,2,200
1,800,100,2,200
2,1000,100,3,300


**1c. Term_document matrix**

In [89]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                #tokenizer=tokenize_titles,
                                max_features=df['max_features'][0],
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = df['max_df'][0],  #modify
                                min_df = df['min_df'][0], #modify
                                ngram_range=(2,4))
dtm_tf = tf_vectorizer.fit_transform(feed_titles)
print(dtm_tf.shape)

(8020, 500)


**1d. LDA Clustering**

In [59]:
lda_tf = LatentDirichletAllocation(n_components=5, 
                                   max_iter=df['max_iter'][0], #modify
                                   learning_method='online', 
                                   random_state = 0)
lda_tf.fit(dtm_tf)

LatentDirichletAllocation(learning_method='online', max_iter=200,
                          n_components=5, random_state=0)

In [60]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names()

topics = dict()
for topic_idx, topic in enumerate(lda_tf.components_):
    topics[topic_idx] = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    print("Topic #%d:" % topic_idx)
    print(" | ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

Topic #0:
apple releases | ipad pro | inch macbook pro | inch macbook | apple ios | ios update | stimulus check | macos catalina | releases ios | apple releases ios
Topic #1:
memorial day | shares apple | shares apple nasdaq | shares apple nasdaq aapl | iphone pro | sells shares | sells shares apple | iphone ipad | sign apple | sells shares apple nasdaq
Topic #2:
apple watch series | watch series | aapl shares | nasdaq aapl shares | apple nasdaq aapl shares | apple airpods | exposure notification | airpods pro | google release | apple google release
Topic #3:
contact tracing app | google apple | covid contact | covid contact tracing | samsung galaxy | position apple | martin scorsese | position apple nasdaq aapl | position apple nasdaq | killers flower
Topic #4:
george floyd | apple podcasts | release date | tech giants | tim cook | new apple | apple glass | tracing tech | popular smartphone | coronavirus contact


**Visualization**

In [61]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

**Cluster 2-3 and 4-5 overlapped**

## **Declustering**

In [90]:
tf_vectorizer1 = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                #tokenizer=tokenize_titles,
                                max_features=df1['max_features'][1],
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = df['max_df'][2],  #change parameter
                                min_df = df['min_df'][2], #change parameter
                                ngram_range=(2,4))
dtm_tf1 = tf_vectorizer1.fit_transform(feed_titles)
print(dtm_tf1.shape)

(8020, 1000)


In [91]:
lda_tf1 = LatentDirichletAllocation(n_components=5, 
                                   max_iter=df['max_iter'][2], #change parameter
                                   learning_method='online', 
                                   random_state = 0)
lda_tf1.fit(dtm_tf1)

LatentDirichletAllocation(learning_method='online', max_iter=300,
                          n_components=5, random_state=0)

In [92]:
n_top_words = 10
tf_feature_names = tf_vectorizer1.get_feature_names()

topics = dict()
for topic_idx, topic in enumerate(lda_tf.components_):
    topics[topic_idx] = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    print("Topic #%d:" % topic_idx)
    print(" | ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

Topic #0:
apple closes | close stores | capital partners llc | capital partners | apple acquires machine learning | cider vinegar | hackers release | defending jacob | galaxy tab | apple confirms
Topic #1:
doubles price | google covid | google covid contact | google covid contact tracing | converter multilingual | google contact tracing | google contact tracing app | conservative bias | google exposure notification | google contact tracing tech
Topic #2:
apple google api | ipad ipad | aapl shares | escobar brother sues | apple car | adc awards | apple smart keyboard ipad | aapl stake | buds air | app gets
Topic #3:
apple memorial day | best iphone deals | apple plans | apple plus | goes live amid debate | firmware ios | dolby atmos music | flower moon | fixes bug | covid apple
Topic #4:
austin campus | apple channels | gal gadot | improve siri | inch review | executive order social media | amid protests | ios apple | fiona apple | apple new


In [93]:
pyLDAvis.sklearn.prepare(lda_tf1, dtm_tf1, tf_vectorizer1)

**Increase max fearure to 1000 and min df to 3, max iter to 300 removed clusters**

### **Trained 2nd model**

In [94]:
max_features = 1000
max_df = 100
min_df = 3
max_iter = 300

In [77]:
def test_lda_model(tf, tf_vectorizer1, num_topics, max_iter, n_top_words):
    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=max_iter, learning_method='batch', learning_offset=10, random_state=1)
    lda.fit(tf)
    tf_feature_names = tf_vectorizer1.get_feature_names()

    topics = dict()
    for topic_idx, topic in enumerate(lda.components_):
        topics[topic_idx] = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]

    return topics

In [78]:
tf_vectorizer = CountVectorizer(max_df=0.15, min_df=0.01, max_features=1000, tokenizer=tokenize_titles, ngram_range=(2, 4))
tf = tf_vectorizer1.fit_transform(feed_titles)

In [79]:
lda = LatentDirichletAllocation(n_components=8, max_iter=300, learning_method='batch', learning_offset=10, random_state=1)
lda_model = lda.fit(tf)

## **2. Applying LDA Model to random 10 articles**

In [80]:
random_train = lda_tf.transform(dtm_tf)

In [81]:
len(random_train)

8020

In [82]:
import numpy as np
index = []
for i in range(len(random_train)):
    if np.max(random_train[i]) > 0.1:
        index.append(i)

In [83]:
import random
#testing out random samples
random10 = random.sample(index,10)
len(random10)
random10

[1962, 2629, 3806, 1154, 3308, 3358, 725, 3245, 7242, 2731]

In [95]:
random10 = [3824,7312,2741,2304,4018,111,3845,5231,7326,1726]

In [96]:
for i in random10:
    print('Title Index: ', i)
    print('Cluster ID:', np.argmax(random_train[i]))
    print('Keywords:', np.array(tf_vectorizer1.get_feature_names())[np.nonzero(dtm_tf1[i,:].toarray())[-1]])
    print('\n')
    

Title Index:  3824
Cluster ID: 2
Keywords: ['apple aapl' 'holdings apple' 'llc million' 'llc million holdings'
 'llc million holdings apple' 'management llc' 'management llc million'
 'million holdings' 'million holdings apple' 'wealth management'
 'wealth management llc']


Title Index:  7312
Cluster ID: 0
Keywords: ['apple ios' 'covid specific' 'covid specific features'
 'specific features']


Title Index:  2741
Cluster ID: 3
Keywords: ['gadot hedy' 'gadot hedy lamarr' 'gal gadot' 'gal gadot hedy'
 'gal gadot hedy lamarr' 'hedy lamarr']


Title Index:  2304
Cluster ID: 3
Keywords: ['gal gadot' 'hedy lamarr' 'hedy lamarr series' 'lamarr series']


Title Index:  4018
Cluster ID: 0
Keywords: ['apple releases' 'apple releases macos' 'apple releases macos catalina'
 'battery health' 'battery health management' 'health management'
 'macos catalina' 'releases macos' 'releases macos catalina']


Title Index:  111
Cluster ID: 4
Keywords: ['airpods studio' 'apple airpods' 'apple airpods studio