<a href="https://colab.research.google.com/github/ah20776/CE807---Assignment/blob/main/Assignment2/CE807_Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyLDAvis
!pip install gensim
!pip install spacy



In [2]:
# Load news data set
# remove meta data headers footers and quotes from news dataset

from pprint import pprint
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True,
                            random_state=32,
                            remove=('headers', 'footers', 'qutes'))

#
dataset_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=32, remove=('headers', 'footers', 'qutes'))
dataset_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=32, remove=('headers', 'footers', 'qutes'))
# Check the names of the categories
pprint(dataset.target_names)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [3]:

'''
Loading Gensim and nltk libraries
'''
# pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
import nltk
nltk.download('wordnet')
import pandas as pd
stemmer = SnowballStemmer("english")
from nltk.corpus import stopwords
nltk.download('stopwords')
import spacy
from spacy.lang.en.examples import sentences

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result

In [5]:
processed_docs = []

for doc in dataset_train.data:
    processed_docs.append(preprocess(doc))

'''
Preview 'processed_docs'
'''
pprint(processed_docs[:2])

[['real',
  'question',
  'opinion',
  'motorola',
  'processor',
  'run',
  'compar',
  'intel',
  'processor',
  'run',
  'window',
  'recal',
  'convers',
  'run',
  'window',
  'benchmark',
  'speed',
  'know',
  'true',
  'love',
  'hear',
  'technic',
  'data',
  'david'],
 ['current',
  'street',
  'price',
  'follow',
  'relev',
  'tax',
  'simm',
  'simm',
  'refund',
  'possibl',
  'export',
  'recommend',
  'reliabl',
  'supplier']]


In [6]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
bigram = gensim.models.Phrases(processed_docs, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[processed_docs], threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
def remove_stopwords(texts):
   return [[word for word in simple_preprocess(str(doc)) 
   if word not in stop_words] for doc in texts]
def make_bigrams(texts):
   return [bigram_mod[doc] for doc in texts]
def make_trigrams(texts):
   [trigram_mod[bigram_mod[doc]] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
   texts_out = []
   for sent in texts:
      doc = nlp(" ".join(sent))
      texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
   return texts_out
data_words_nostops = remove_stopwords(processed_docs)
data_words_bigrams = make_bigrams(data_words_nostops)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=[
   'NOUN', 'ADJ', 'VERB', 'ADV'
])
print(data_lemmatized[:4]) #it will print the lemmatized data.



[['real', 'question', 'opinion', 'window', 'recal', 'conver', 'run', 'window', 'benchmark', 'speed', 'know', 'true', 'love', 'hear', 'technic'], ['current', 'street', 'price', 'follow', 'refund', 'reliabl', 'supplier'], ['help', 'inform', 'card', 'reader', 'recent', 'buy', 'local', 'surplus', 'dealer', 'rear', 'follow', 'inform', 'card', 'reader', 'connector', 'power', 'connector'], ['write', 'sick', 'call', 'legisl', 'unseal', 'involv', 'atroc', 'includ', 'presid', 'attorney_general', 'governor', 'suspend', 'pend', 'serious', 'doubt']]


In [7]:
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[:4]) #it will print the corpus we created above.
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:4]] 
#it will print the words with their frequencies.
lda_model = gensim.models.ldamodel.LdaModel(
   corpus=corpus, id2word=id2word, num_topics=10, random_state=100, 
   update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True
)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2)], [(14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)], [(15, 1), (21, 1), (22, 2), (23, 2), (24, 1), (25, 1), (26, 2), (27, 1), (28, 1), (29, 2), (30, 1), (31, 1), (32, 1)], [(33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1)]]


In [8]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

coherence_model_lda = CoherenceModel(
   model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v'
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

[(0,
  '0.053*"window" + 0.045*"card" + 0.031*"driver" + 0.031*"problem" + '
  '0.030*"color" + 0.023*"version" + 0.020*"email" + 0.020*"monitor" + '
  '0.019*"support" + 0.019*"run"'),
 (1,
  '0.030*"peopl" + 0.021*"right" + 0.020*"say" + 0.016*"person" + 0.016*"true" '
  '+ 0.014*"fact" + 0.013*"reason" + 0.012*"argument" + 0.011*"live" + '
  '0.011*"claim"'),
 (2,
  '0.029*"sell" + 0.021*"model" + 0.020*"pain" + 0.018*"car" + 0.014*"mile" + '
  '0.014*"price" + 0.013*"metal" + 0.013*"finger" + 0.013*"food" + '
  '0.011*"clean"'),
 (3,
  '0.036*"write" + 0.032*"know" + 0.023*"think" + 0.020*"time" + 0.016*"go" + '
  '0.015*"good" + 0.014*"look" + 0.014*"come" + 0.014*"want" + 0.012*"work"'),
 (4,
  '0.034*"govern" + 0.025*"kill" + 0.021*"exist" + 0.019*"object" + '
  '0.016*"deal" + 0.015*"attack" + 0.014*"moral" + 0.013*"armenian" + '
  '0.013*"peopl" + 0.011*"state"'),
 (5,
  '0.053*"game" + 0.030*"team" + 0.030*"play" + 0.026*"year" + 0.025*"player" '
  '+ 0.016*"season" + 0.016*"

In [None]:
#to find the best hyperparameters

import numpy as np
import tqdm
from google.colab import files

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 32
step_size = 2
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    files.download('lda_tuning_results.csv')
    pbar.close()

In [13]:
#finding the best number of topics with best hyperparameters

import numpy as np
import tqdm
from google.colab import files

#With best parameters

# Topics range
min_topics = 5
max_topics = 105
step_size = 5
topics_range = range(min_topics, max_topics, step_size)


model_results = {'Topics': [],'Coherence': []}


if 1 == 1:
    pbar = tqdm.tqdm(total=len(topics_range))
    for k in topics_range:
        lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus, id2word=id2word, num_topics=k, random_state=100, 
        update_every=1, chunksize=100, passes=10, alpha=0.01, eta=0.61, per_word_topics=True
        )

    #print('\nPerplexity: ', lda_model.log_perplexity(corpus))

        coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        model_results['Topics'].append(k)
        model_results['Coherence'].append(coherence_lda)
        #print('\nCoherence Score: ', coherence_lda)
        pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results_new.csv', index=False)
    files.download('lda_tuning_results_new.csv')
    pbar.close()


100%|██████████| 20/20 [1:29:05<00:00, 413.58s/it]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

100%|██████████| 20/20 [1:29:05<00:00, 267.29s/it]


In [11]:
lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus, id2word=id2word, num_topics=20, random_state=100, 
        update_every=1, chunksize=100, passes=10, alpha=0.01, eta=0.61, per_word_topics=True
        )
coherence_model_lda = CoherenceModel(
   model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v'
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Coherence Score:  0.6787468993133209


In [10]:
print('Number of unique tokens: %d' % len(id2word)) #dictionary
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 19391
Number of documents: 11314
