In [21]:
import sklearn
from sklearn.datasets import load_files

moviedir = r'/Users/akhilakumaripuppala/nltk_data/corpora/movie_reviews'

# loading all files as training data. 
movie_train = load_files(moviedir, shuffle=True)

In [22]:
len(movie_train.data)
movie_train.data[0]

b"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . \nit's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? \nonce again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . \nin this so called dark thriller , the devil ( gabriel byrne ) has come upon earth , to impregnate a woman ( robin tunney ) which happens every 1000 years , and basically destroy the world , but apparently god has chosen one man , and that one man is jericho cane ( arnold himself ) . \nwith the help of a trusty sidekick ( kevin pollack ) , they will stop at nothing to let the devil take over the world ! \nparts of this are actually so absurd , that they would fit right in with dogma . \nyes , the film is 

In [23]:
movie_train.target

array([0, 1, 1, ..., 1, 0, 0])

In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk

# initialize movie_vector object, and then turn movie train data into a vector 
movie_vec_CV_unigram = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize,ngram_range=(1, 1))
movie_counts_CV_unigram = movie_vec_CV_unigram.fit_transform(movie_train.data)

movie_vec_CV_bigram = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize,ngram_range=(2, 2))
movie_counts_CV_bigram = movie_vec_CV_bigram.fit_transform(movie_train.data)

movie_vec_TF_unigram = TfidfVectorizer(min_df=2, tokenizer=nltk.word_tokenize,ngram_range=(1, 1))
movie_counts_TF_unigram = movie_vec_TF_unigram.fit_transform(movie_train.data)

movie_vec_TF_bigram = TfidfVectorizer(min_df=2, tokenizer=nltk.word_tokenize,ngram_range=(2, 2))
movie_counts_TF_bigram = movie_vec_TF_bigram.fit_transform(movie_train.data)

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
names = {'movie_counts_CV_unigram': movie_counts_CV_unigram,
         'movie_counts_CV_bigram' : movie_counts_CV_bigram,
         'movie_counts_TF_unigram' : movie_counts_TF_unigram,
         'movie_counts_TF_bigram': movie_counts_TF_bigram,
        }
#names = movie_counts_CV_unigram,movie_counts_CV_bigram,movie_counts_TF_unigram,movie_counts_TF_unigram]
for i in names:
    docs_train, docs_test, y_train, y_test = train_test_split(
        names[i], movie_train.target, test_size = 0.20, random_state = 12)
    clf = MultinomialNB().fit(docs_train, y_train)
    y_pred = clf.predict(docs_test)
    print("using " + str(i) + " the accuracy is: ")
    print(sklearn.metrics.accuracy_score(y_test, y_pred))

using movie_counts_CV_unigram the accuracy is: 
0.795
using movie_counts_CV_bigram the accuracy is: 
0.83
using movie_counts_TF_unigram the accuracy is: 
0.82
using movie_counts_TF_bigram the accuracy is: 
0.8175


In [None]:
#one of the  difference is that the TfidfVectorizer() returns floats while the CountVectorizer() returns ints

# TfidfVectorizer and CountVectorizer both are methods for converting text data into vectors as model can process 
# only numerical data.

# In CountVectorizer we only count the number of times a word appears in the document which results in biasing in 
# favour of most frequent words. this ends up in ignoring rare words which could have helped is in processing our 
# data more efficiently.

# To overcome this , we use TfidfVectorizer .

# In TfidfVectorizer we consider overall document weightage of a word. It helps us in dealing with most frequent words.
# Using it we can penalize them. TfidfVectorizer weights the word counts by a measure of how often they appear in the 
# documents.

In [26]:
from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import warnings
warnings.filterwarnings(action = 'ignore')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [93]:
newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))

In [94]:
newsgroups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [95]:
cats = ['sci.med', 'sci.space', 'talk.politics.guns']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
docs_raw = newsgroups_train.data
print(len(docs_raw))

1733


In [96]:
len(newsgroups_train.target)

1733

In [97]:
len(newsgroups_train.data)

1733

In [102]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
print(dtm_tf.shape)


tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
print(dtm_tfidf.shape)

lda_tf = LatentDirichletAllocation(n_components=15, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=15, random_state=0)
lda_tfidf.fit(dtm_tfidf)

pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

(1733, 3956)
(1733, 3956)


In [103]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='mmds')

In [104]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')

In [105]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
print(dtm_tf.shape)


tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
print(dtm_tfidf.shape)

lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=10, random_state=0)
lda_tfidf.fit(dtm_tfidf)

pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

(1733, 3956)
(1733, 3956)


In [106]:
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer, mds='mmds')

In [107]:
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer, mds='tsne')

In [None]:
pyLDAvis is a interactive LDA visualization python package. 
The area of circle represents the importance of each topic over the entire corpus, 
the distance between the center of circles indicate the similarity between topics. 
For each topic, the histogram on the right side listed the top 30 most relevant terms. 
LDA helped me extracted 20 main topics

Latent Dirichlet allocation is trained on non-labeled documents.
LDA is typically evaluated by either measuring performance on some secondary task, such as document classification or information retrieval,
or by estimating the probability of unseen held-out documents given some training documents. 
A better model will give rise to a higher probability of held-out documents, on average. 

In [None]:
#gensimvis.prepare LDA model

In [78]:
import re
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = gensim.utils.simple_preprocess(str(sent), deacc=True) 
        yield(sent)  

# Convert to list
data = newsgroups_train.data
data_words = list(sent_to_words(data))
print(data_words[:1])

[['from', 'frank', 'crary', 'subject', 're', 'carrying', 'arms', 'nntp', 'posting', 'host', 'ucsu', 'colorado', 'edu', 'organization', 'university', 'of', 'colorado', 'boulder', 'distribution', 'usa', 'lines', 'in', 'article', 'dan', 'white', 'writes', 'have', 'question', 'about', 'the', 'second', 'amendment', 'that', 'has', 'bothered', 'me', 'for', 'awhile', 'the', 'amendment', 'guarentees', 'our', 'right', 'to', 'keep', 'and', 'bear', 'arms', 'currently', 'the', 'gun', 'prohibitionists', 'are', 'trying', 'to', 'restrict', 'or', 'eliminate', 'our', 'right', 'to', 'keep', 'arms', 'with', 'the', 'brady', 'bill', 'and', 'the', 'assault', 'weapon', 'ban', 'however', 'havent', 'we', 'already', 'lost', 'our', 'right', 'to', 'bear', 'arms', 'it', 'seems', 'that', 'in', 'most', 'states', 'like', 'texas', 'citizen', 'may', 'own', 'gun', 'and', 'carry', 'while', 'at', 'his', 'home', 'or', 'business', 'but', 'citizen', 'is', 'severely', 'restricted', 'from', 'bearing', 'outside', 'these', 'areas

In [79]:
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

id2word = Dictionary(data_words)
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_words]
print(corpus[:1])

[[(0, 2), (1, 1), (2, 1), (3, 6), (4, 3), (5, 2), (6, 2), (7, 1), (8, 3), (9, 1), (10, 4), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2), (19, 1), (20, 2), (21, 3), (22, 1), (23, 2), (24, 1), (25, 1), (26, 3), (27, 1), (28, 1), (29, 2), (30, 2), (31, 2), (32, 2), (33, 1), (34, 1), (35, 1), (36, 4), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 2), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 2), (54, 2), (55, 1), (56, 3), (57, 2), (58, 2), (59, 1), (60, 1), (61, 1), (62, 2), (63, 4), (64, 2), (65, 1), (66, 1), (67, 1), (68, 1), (69, 2), (70, 1), (71, 1), (72, 2), (73, 1), (74, 6), (75, 1), (76, 3), (77, 1), (78, 3), (79, 2), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 2), (96, 7), (97, 2), (98, 2), (99, 2), (100, 1), (101, 3), (102, 2), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1)

In [80]:
from gensim.models.ldamodel import LdaModel
from pprint import pprint

#What do these tuples mean? Let’s convert them into human readable format to understand:
[[(id2word[i], freq) for i, freq in doc] for doc in corpus[:1]]

lda_model = LdaModel(corpus=corpus,
                   id2word=id2word,
                   num_topics=10, 
                   random_state=0,
                   chunksize=10,
                   alpha='auto',
                   per_word_topics=True)

pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.000*"advocating" + 0.000*"deviant" + 0.000*"oooops" + 0.000*"safties" + '
  '0.000*"wrath" + 0.000*"fiddling" + 0.000*"anniversery" + 0.000*"poland" + '
  '0.000*"marking" + 0.000*"encurred"'),
 (1,
  '0.000*"advocating" + 0.000*"deviant" + 0.000*"oooops" + 0.000*"safties" + '
  '0.000*"wrath" + 0.000*"fiddling" + 0.000*"anniversery" + 0.000*"poland" + '
  '0.000*"marking" + 0.000*"encurred"'),
 (2,
  '0.000*"advocating" + 0.000*"deviant" + 0.000*"oooops" + 0.000*"safties" + '
  '0.000*"wrath" + 0.000*"fiddling" + 0.000*"anniversery" + 0.000*"poland" + '
  '0.000*"marking" + 0.000*"encurred"'),
 (3,
  '0.050*"the" + 0.033*"to" + 0.022*"that" + 0.019*"is" + 0.018*"it" + '
  '0.018*"in" + 0.017*"of" + 0.014*"you" + 0.014*"and" + 0.011*"have"'),
 (4,
  '0.018*"item" + 0.011*"denver" + 0.009*"substance" + 0.000*"sec" + '
  '0.000*"bruce" + 0.000*"extract" + 0.000*"alpha" + 0.000*"spiros" + '
  '0.000*"wray" + 0.000*"bailey"'),
 (5,
  '0.000*"advocating" + 0.000*"deviant" + 0.000*

In [81]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
vis_data = gensimvis.prepare(lda_model,corpus,id2word)
pyLDAvis.display(vis_data)