# 1. Loading libs

In [None]:
# importing required libraries
import re
import nltk
from nltk.corpus import movie_reviews
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# 2. preprocessing(punctuations, stopwords, lemmatization)

In [None]:
print(len(movie_reviews.fileids()))
movie_reviews.categories()

2000


['neg', 'pos']

In [None]:
print(len(movie_reviews.fileids('pos')))
print(len(movie_reviews.fileids('neg')))

1000
1000


In [None]:
# Document is a list of (words of review, category of review)
document = [(movie_reviews.words(file_id),category) for file_id in movie_reviews.fileids() for category in movie_reviews.categories(file_id)]
document[:10]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg'),
 (['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...], 'neg'),
 (['so', 'ask', 'yourself', 'what', '"', '8mm', '"', '(', ...], 'neg'),
 (['that', "'", 's', 'exactly', 'how', 'long', 'the', ...], 'neg'),
 (['call', 'it', 'a', 'road', 'trip', 'for', 'the', ...], 'neg'),
 (['plot', ':', 'a', 'young', 'french', 'boy', 'sees', ...], 'neg')]

In [None]:
print(document[0][0])
print(document[0][1])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
neg


In [None]:
len(document)

2000

In [None]:
from nltk import WordNetLemmatizer
punctuations = string.punctuation
eng_stopwords = stopwords.words('english')

def doLemmatizeWord(text):
    lemmatizer = WordNetLemmatizer()
    lemma = [lemmatizer.lemmatize(word) for word in text]
    return lemma

document_punc_stopwords_lem = []

for doc in document :
    #print(doc[0])
    words_wo_puncts = [x for x in doc[0] if x not in punctuations]
    #print(len(words_wo_puncts))
    words_wo_puncts_stopwords = [x for x in words_wo_puncts if x not in eng_stopwords]
    #print(words_wo_puncts_stopwords)
    didWordNetLem = doLemmatizeWord(words_wo_puncts_stopwords)
    sentiment = 0
    if (doc[1]=='neg') :
        sentiment = 1
    document_punc_stopwords_lem.append((didWordNetLem, sentiment))


In [None]:
import pandas as pd

df = pd.DataFrame(document_punc_stopwords_lem, columns =['Review', 'Sentiment'])
df['Final']=df['Review'].apply(lambda x: ''.join(i+' ' for i in x))
df.head()

Unnamed: 0,Review,Sentiment,Final
0,"[plot, two, teen, couple, go, church, party, d...",1,plot two teen couple go church party drink dri...
1,"[happy, bastard, quick, movie, review, damn, y...",1,happy bastard quick movie review damn y2k bug ...
2,"[movie, like, make, jaded, movie, viewer, than...",1,movie like make jaded movie viewer thankful in...
3,"[quest, camelot, warner, bros, first, feature,...",1,quest camelot warner bros first feature length...
4,"[synopsis, mentally, unstable, man, undergoing...",1,synopsis mentally unstable man undergoing psyc...


# 3. CountVectorizer -> unigrams bigrams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
'''
 For example an ngram_range of (1, 1) means only unigrams,
 (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams.
 Only applies if analyzer is not callable.
'''

uni_vectorizer = CountVectorizer(ngram_range=(1,1))
cv_df_uni = uni_vectorizer.fit_transform(df['Final'])
print(cv_df_uni.shape)


bi_vectorizer = CountVectorizer(ngram_range=(2,2))
cv_df_bi = bi_vectorizer.fit_transform(df['Final'])
print(cv_df_bi.shape)

(2000, 35099)
(2000, 509148)


# 4. TF-IDF  -> unigrams bigrams

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

uni_tfvectorizer = TfidfVectorizer(ngram_range = (1,1))
tfidf_df_uni = uni_tfvectorizer.fit_transform(df['Final'])
print(tfidf_df_uni.shape)

bi_tfvectorizer = TfidfVectorizer(ngram_range = (2,2))
tfidf_df_bi = bi_tfvectorizer.fit_transform(df['Final'])
print(tfidf_df_bi.shape)


(2000, 35099)
(2000, 509148)


# 5. MultinomialNB


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [None]:

cv_uni_train_X, cv_uni_test_X, cv_uni_train_Y, cv_uni_test_y = train_test_split(cv_df_uni, df['Sentiment'], test_size=0.30, random_state=17)
cv_bi_train_X, cv_bi_test_X, cv_bi_train_Y, cv_bi_test_y = train_test_split(cv_df_bi, df['Sentiment'], test_size=0.30, random_state=17)
tfidf_uni_train_X, tfidf_uni_test_X, tfidf_uni_train_Y, tfidf_uni_test_y = train_test_split(tfidf_df_uni, df['Sentiment'], test_size=0.30, random_state=17)
tfidf_bi_train_X, tfidf_bi_test_X, tfidf_bi_train_Y, tfidf_bi_test_y = train_test_split(tfidf_df_bi, df['Sentiment'], test_size=0.30, random_state=17)


## 5.1 CountVectorizer Unigrams

In [None]:
clf_uni_cv = MultinomialNB()
clf_uni_cv.fit(cv_uni_train_X, cv_uni_train_Y)
#MultinomialNB()
cv_uni_test_y_pre= clf_uni_cv.predict(cv_uni_test_X)

print('accuracy_score: %.3f' % accuracy_score(cv_uni_test_y, cv_uni_test_y_pre))

accuracy_score: 0.833


## 5.2 CountVectorizer Bigrams

In [None]:
clf_bi_cv = MultinomialNB()
clf_bi_cv.fit(cv_bi_train_X, cv_bi_train_Y)
#MultinomialNB()
cv_bi_test_y_pre= clf_bi_cv.predict(cv_bi_test_X)

print('accuracy_score: %.3f' % accuracy_score(cv_bi_test_y, cv_bi_test_y_pre))

accuracy_score: 0.747


## 5.3 Tf-Idf Unigrams

In [None]:
clf_uni_tfidf = MultinomialNB()
clf_uni_tfidf.fit(tfidf_uni_train_X, tfidf_uni_train_Y)
#MultinomialNB()
tfidf_uni_test_y_pre= clf_uni_tfidf.predict(tfidf_uni_test_X)

print('accuracy_score: %.3f' % accuracy_score(tfidf_uni_test_y, tfidf_uni_test_y_pre))

accuracy_score: 0.785


## 5.4 TF-IDF Bigrams

In [None]:
clf_bi_tfidf = MultinomialNB()
clf_bi_tfidf.fit(tfidf_bi_train_X, tfidf_bi_train_Y)
#MultinomialNB()
tfidf_bi_test_y_pre= clf_bi_tfidf.predict(tfidf_bi_test_X)

print('accuracy_score: %.3f' % accuracy_score(tfidf_bi_test_y, tfidf_bi_test_y_pre))

accuracy_score: 0.752


## 6. Use nltk.collocations functions to calculate the top 10 bigram collocations based on PPMI

In [None]:
from nltk.collocations import *
import nltk
from nltk.util import ngrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

In [None]:
len(df['Final'].values.tolist())

2000

In [None]:
'''
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(df['Final'].values.tolist())
finder.nbest(bigram_measures.pmi, 10)
'''

In [None]:
words_wo_puncts = [x for x in movie_reviews.words() if x not in punctuations]
words_wo_puncts_stopwords = [x for x in words_wo_puncts if x not in eng_stopwords]
didWordNetLem = doLemmatizeWord(words_wo_puncts_stopwords)

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(didWordNetLem)
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)

[('_october', 'sky_'),
 ('_patlabor', 'movie_'),
 ('alanis', 'morissette'),
 ('arija', 'bareikis'),
 ('bokeem', 'woodbine'),
 ('edie', 'mcclurg'),
 ('gabriella', 'pescucci'),
 ('gedde', 'watanabe'),
 ('heath', 'ledger'),
 ('igor', 'stravinsky')]

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(movie_reviews.words())
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)

[('_october', 'sky_'),
 ('alanis', 'morissette'),
 ('arija', 'bareikis'),
 ('battleship', 'potemkin'),
 ('bokeem', 'woodbine'),
 ('edie', 'mcclurg'),
 ('gabriella', 'pescucci'),
 ('gedde', 'watanabe'),
 ('heath', 'ledger'),
 ('igor', 'stravinsky')]

## 7. TopicModeling

In [None]:
!pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 32.9 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Building wheels for collected packages: pyLDAvis, sklearn
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136898 sha256=3f7895f399171d2e86ea5278ef35f42b62bc2d5fd6fc80cd614db1fd15a12aca
  Stored in directory: /root/.cache/pip/wheels/c9/21/f6/17bcf2667e8a68532ba2fbf6d5c72fdf4c7f7d9abfa4852d2f
  Building wheel for sklearn (setup

In [None]:
from __future__ import print_function
import pyLDAvis

import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Load 20 newsgroups dataset
# First, the 20 newsgroups dataset available in sklearn is loaded. As always, the headers, footers and quotes are removed.

  from collections import Iterable
  from collections import Mapping


In [None]:
newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
docs_raw = newsgroups.data
print(len(docs_raw))

11314


In [None]:
categories = ['sci.med', 'sci.space', 'talk.politics.guns']
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=categories)

In [None]:
print(newsgroups_train.keys())
print(newsgroups_train['target_names'])

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
['sci.med', 'sci.space', 'talk.politics.guns']


In [None]:
from pprint import pprint
pprint(list(newsgroups_train.target_names))
pprint(list(newsgroups_test.target_names))

['sci.med', 'sci.space', 'talk.politics.guns']
['sci.med', 'sci.space', 'talk.politics.guns']


In [None]:
print(len(newsgroups_train['data']))
print(len(newsgroups_train['target']))

print(len(newsgroups_test['data']))
print(len(newsgroups_test['target']))

1733
1733
1154
1154


Row Data

In [None]:
docs_raw_train = newsgroups_train['data']
docs_raw_train[:2]

['\n\nThere are, but not any that would help Texans: In many states,\nsuch laws have been found to violate the state constitution. \nBut the federal Second Amendment does not apply directly to the\nstates. It was written to limit the federal government only. \nThe Fourteenth Amendment was written to extend the restrictions\nof the Bill of Rights to the state level. However, the exact\nwording of the Fourteenth Amendment is very vague. The Supreme\nCourt has been dancing around the issue without facing it\ndirectly for over 100 years. In practice, the Bill of Right\n(indirectly applies through the Fourteenth) applies to the\nstate governments only if the Supreme Court has ruled that \nparticular provision. The Court has made no such rulings on\nthe Second Amendment.',
 " A cash award is OK. A time limit would be nice. You can't give away\nmining rights (assuming there's anything to mine) because you don't own\nthem.\n -----------------------------------------------------------------\n .

Lowercase

In [None]:
raw_data = newsgroups_train['data']
lower_raw_data = []

for row in raw_data:
  lower_raw_data.append(row.lower())

lower_raw_data[:2]

['\n\nthere are, but not any that would help texans: in many states,\nsuch laws have been found to violate the state constitution. \nbut the federal second amendment does not apply directly to the\nstates. it was written to limit the federal government only. \nthe fourteenth amendment was written to extend the restrictions\nof the bill of rights to the state level. however, the exact\nwording of the fourteenth amendment is very vague. the supreme\ncourt has been dancing around the issue without facing it\ndirectly for over 100 years. in practice, the bill of right\n(indirectly applies through the fourteenth) applies to the\nstate governments only if the supreme court has ruled that \nparticular provision. the court has made no such rulings on\nthe second amendment.',
 " a cash award is ok. a time limit would be nice. you can't give away\nmining rights (assuming there's anything to mine) because you don't own\nthem.\n -----------------------------------------------------------------\n .

remove html/uml

In [None]:
#regex = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")

def findRegex(pattern) :
  regex = re.compile(pattern)
  i = 0
  for row in lower_raw_data:
    result = regex.findall(row)
    if (len(result)!=0) :
      print(i)
      display(result)
      break;
    i = i+1

pattern = r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});"
findRegex(pattern)

2


['']

In [None]:
lower_raw_data[2]

'\nno one else seems to know, so i\'ll post this.\n\nthis topic came up on sci.physics.fusion shortly after the cold-fusion\nflap started.  as i recall, its been done to some experimental mice.\nthey showed various ill effects and eventually died.  the reason is\nthat deuterium does not have exactly the same reaction rates as\nhydrogen due to its extra mass (which causes lower velocity, boltzman\nconstant, mumble).  this throws various bits of body biochemistry out\nof kilter, and you get sick and die.\n\ni\'ve never heard of anyone being poisened this way, in or out of real\nlife.  the process takes quite a while.  if anyone wants to write this\nbook, i would imagine you would have to:\n\n1: replace a significant fraction of the water in the body with heavy\n   water.\n\n2: wait while normal breakdown and repair processes cause other\n   molecules in the body to be synthesised using the deuterium.\n\nduring this process the victim would gradually deteriorate and\neventually die, but i

In [None]:
uml_html_lower_raw_data = []

for row in lower_raw_data:
  html = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
  html_row = re.sub(html, "", row)
  uml_html_row = re.sub(r"https?://\S+|www\.\S+", "",html_row)
  uml_html_lower_raw_data.append(uml_html_row)

uml_html_lower_raw_data[2]

'\nno one else seems to know, so i\'ll post this.\n\nthis topic came up on sci.physics.fusion shortly after the cold-fusion\nflap started.  as i recall, its been done to some experimental mice.\nthey showed various ill effects and eventually died.  the reason is\nthat deuterium does not have exactly the same reaction rates as\nhydrogen due to its extra mass (which causes lower velocity, boltzman\nconstant, mumble).  this throws various bits of body biochemistry out\nof kilter, and you get sick and die.\n\ni\'ve never heard of anyone being poisened this way, in or out of real\nlife.  the process takes quite a while.  if anyone wants to write this\nbook, i would imagine you would have to:\n\n1: replace a significant fraction of the water in the body with heavy\n   water.\n\n2: wait while normal breakdown and repair processes cause other\n   molecules in the body to be synthesised using the deuterium.\n\nduring this process the victim would gradually deteriorate and\neventually die, but i

remove asciicode

In [None]:
pattern = r'[^\x00-\x7f]'
findRegex(pattern)

665


['ÿ']

In [None]:
uml_html_lower_raw_data[665]

'hello,  i am not sure if this is the right conference to ask this\nquestion, however, here i go..  i am a commercial fisherman and i \nfell about 3 weeks ago down into the hold of the boat and broke or\ncracked a rib and wrenched and bruised my back and left arm.\n  my question,  i have been to a doctor and was told that it was \nbest to do nothing and it would heal up with no long term effect, and \nindeed i am about 60 % better, however, the work i do is very \nhard and i am still not able to go back to work.  the thing that worries me\nis the movement or "clunking" i feel and hear back there when i move \ncertain ways...  i heard some one talking about the rib they broke \nyears ago and that it still bothers them.ÿ.  any opinions?\nthanx and cheers'

In [None]:
ascii_uml_html_lower_raw_data = []

for row in uml_html_lower_raw_data:
  ascii_uml_html_lower_raw_data.append(re.sub(r'[^\x00-\x7f]', "", row))

ascii_uml_html_lower_raw_data[665]

'hello,  i am not sure if this is the right conference to ask this\nquestion, however, here i go..  i am a commercial fisherman and i \nfell about 3 weeks ago down into the hold of the boat and broke or\ncracked a rib and wrenched and bruised my back and left arm.\n  my question,  i have been to a doctor and was told that it was \nbest to do nothing and it would heal up with no long term effect, and \nindeed i am about 60 % better, however, the work i do is very \nhard and i am still not able to go back to work.  the thing that worries me\nis the movement or "clunking" i feel and hear back there when i move \ncertain ways...  i heard some one talking about the rib they broke \nyears ago and that it still bothers them..  any opinions?\nthanx and cheers'

remove punctuations

In [None]:
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
puncts_ascii_uml_html_lower_raw_data = []
# = [x for x in ascii_uml_html_lower_raw_data if x not in punctuations]

for row in ascii_uml_html_lower_raw_data:
  pattern = '[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
  puncts_ascii_uml_html_lower_raw_data.append(re.sub(pattern, "",row))

puncts_ascii_uml_html_lower_raw_data[:2]

['\n\nthere are but not any that would help texans in many states\nsuch laws have been found to violate the state constitution \nbut the federal second amendment does not apply directly to the\nstates it was written to limit the federal government only \nthe fourteenth amendment was written to extend the restrictions\nof the bill of rights to the state level however the exact\nwording of the fourteenth amendment is very vague the supreme\ncourt has been dancing around the issue without facing it\ndirectly for over 100 years in practice the bill of right\nindirectly applies through the fourteenth applies to the\nstate governments only if the supreme court has ruled that \nparticular provision the court has made no such rulings on\nthe second amendment',
 ' a cash award is ok a time limit would be nice you cant give away\nmining rights assuming theres anything to mine because you dont own\nthem\n \n sig files are like strings  every yoyos got one']

remove escape \n \t

In [None]:
line = "own\nthem\n \n sig files \tare \tlike st"
display(re.sub('[\\n\\t]', "",line))


'ownthem  sig files are like st'

In [None]:
escape_puncts_ascii_uml_html_lower_raw_data = []

for row in puncts_ascii_uml_html_lower_raw_data:
  escape_puncts_ascii_uml_html_lower_raw_data.append(re.sub('[\\n\\t]', "",row))

escape_puncts_ascii_uml_html_lower_raw_data[:2]

['there are but not any that would help texans in many statessuch laws have been found to violate the state constitution but the federal second amendment does not apply directly to thestates it was written to limit the federal government only the fourteenth amendment was written to extend the restrictionsof the bill of rights to the state level however the exactwording of the fourteenth amendment is very vague the supremecourt has been dancing around the issue without facing itdirectly for over 100 years in practice the bill of rightindirectly applies through the fourteenth applies to thestate governments only if the supreme court has ruled that particular provision the court has made no such rulings onthe second amendment',
 ' a cash award is ok a time limit would be nice you cant give awaymining rights assuming theres anything to mine because you dont ownthem  sig files are like strings  every yoyos got one']

remove stopwords, tokenizer

In [None]:
#stopwords_escape_puncts_ascii_uml_html_lower_raw_data = [x for x in escape_puncts_ascii_uml_html_lower_raw_data if x not in eng_stopwords]

df_newsgroups_train = pd.DataFrame(escape_puncts_ascii_uml_html_lower_raw_data, columns =['CleanText'])
df_newsgroups_train.head(10)

df_newsgroups_train['TokenizedText'] = df_newsgroups_train['CleanText'].apply(nltk.word_tokenize)
df_newsgroups_train.head()

Unnamed: 0,CleanText,TokenizedText
0,there are but not any that would help texans i...,"[there, are, but, not, any, that, would, help,..."
1,a cash award is ok a time limit would be nice...,"[a, cash, award, is, ok, a, time, limit, would..."
2,no one else seems to know so ill post thisthis...,"[no, one, else, seems, to, know, so, ill, post..."
3,dcx as is today isnt suitable for this however...,"[dcx, as, is, today, isnt, suitable, for, this..."
4,yes i remember that now well in that case the...,"[yes, i, remember, that, now, well, in, that, ..."


In [None]:
stop = set(stopwords.words('english'))
df_newsgroups_train['RemoveStopWords'] = df_newsgroups_train['TokenizedText'].apply(lambda x: [word for word in x if word not in stop])
df_newsgroups_train.head()


Unnamed: 0,CleanText,TokenizedText,RemoveStopWords
0,there are but not any that would help texans i...,"[there, are, but, not, any, that, would, help,...","[would, help, texans, many, statessuch, laws, ..."
1,a cash award is ok a time limit would be nice...,"[a, cash, award, is, ok, a, time, limit, would...","[cash, award, ok, time, limit, would, nice, ca..."
2,no one else seems to know so ill post thisthis...,"[no, one, else, seems, to, know, so, ill, post...","[one, else, seems, know, ill, post, thisthis, ..."
3,dcx as is today isnt suitable for this however...,"[dcx, as, is, today, isnt, suitable, for, this...","[dcx, today, isnt, suitable, however, followon..."
4,yes i remember that now well in that case the...,"[yes, i, remember, that, now, well, in, that, ...","[yes, remember, well, case, cones, indeedcolor..."


LemmatizedText

In [None]:
df_newsgroups_train['LemmatizedText'] = df_newsgroups_train['RemoveStopWords'].apply(lambda x: doLemmatizeWord(x))
df_newsgroups_train.head()


Unnamed: 0,CleanText,TokenizedText,RemoveStopWords,LemmatizedText
0,there are but not any that would help texans i...,"[there, are, but, not, any, that, would, help,...","[would, help, texans, many, statessuch, laws, ...","[would, help, texan, many, statessuch, law, fo..."
1,a cash award is ok a time limit would be nice...,"[a, cash, award, is, ok, a, time, limit, would...","[cash, award, ok, time, limit, would, nice, ca...","[cash, award, ok, time, limit, would, nice, ca..."
2,no one else seems to know so ill post thisthis...,"[no, one, else, seems, to, know, so, ill, post...","[one, else, seems, know, ill, post, thisthis, ...","[one, else, seems, know, ill, post, thisthis, ..."
3,dcx as is today isnt suitable for this however...,"[dcx, as, is, today, isnt, suitable, for, this...","[dcx, today, isnt, suitable, however, followon...","[dcx, today, isnt, suitable, however, followon..."
4,yes i remember that now well in that case the...,"[yes, i, remember, that, now, well, in, that, ...","[yes, remember, well, case, cones, indeedcolor...","[yes, remember, well, case, cone, indeedcolor,..."


In [None]:
df_newsgroups_train['FINAL']=df_newsgroups_train['LemmatizedText'].apply(lambda x: ''.join(i+' ' for i in x))
df_newsgroups_train.head()

Unnamed: 0,CleanText,TokenizedText,RemoveStopWords,LemmatizedText,FINAL
0,there are but not any that would help texans i...,"[there, are, but, not, any, that, would, help,...","[would, help, texans, many, statessuch, laws, ...","[would, help, texan, many, statessuch, law, fo...",would help texan many statessuch law found vio...
1,a cash award is ok a time limit would be nice...,"[a, cash, award, is, ok, a, time, limit, would...","[cash, award, ok, time, limit, would, nice, ca...","[cash, award, ok, time, limit, would, nice, ca...",cash award ok time limit would nice cant give ...
2,no one else seems to know so ill post thisthis...,"[no, one, else, seems, to, know, so, ill, post...","[one, else, seems, know, ill, post, thisthis, ...","[one, else, seems, know, ill, post, thisthis, ...",one else seems know ill post thisthis topic ca...
3,dcx as is today isnt suitable for this however...,"[dcx, as, is, today, isnt, suitable, for, this...","[dcx, today, isnt, suitable, however, followon...","[dcx, today, isnt, suitable, however, followon...",dcx today isnt suitable however followon sdiof...
4,yes i remember that now well in that case the...,"[yes, i, remember, that, now, well, in, that, ...","[yes, remember, well, case, cones, indeedcolor...","[yes, remember, well, case, cone, indeedcolor,...",yes remember well case cone indeedcolor sensit...


CountVectorizer

In [None]:
# Convert to document-term matrix
# Next, the raw documents are converted into document-term matrix, possibly as raw counts or in TF-IDF form.

tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5,
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(df_newsgroups_train['FINAL'])
print(dtm_tf.shape)

(1733, 2156)


TfidfVectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(df_newsgroups_train['FINAL'])
print(dtm_tfidf.shape)

(1733, 2156)




In [None]:
# Fit Latent Dirichlet Allocation models - this step takes a while!

# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tf.fit(dtm_tf)

# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tfidf.fit(dtm_tfidf)

'''
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_components=20, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)'''


"\nLatentDirichletAllocation(batch_size=128, doc_topic_prior=None,\n             evaluate_every=-1, learning_decay=0.7,\n             learning_method='online', learning_offset=10.0,\n             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,\n             n_jobs=1, n_components=20, perp_tol=0.1, random_state=0,\n             topic_word_prior=None, total_samples=1000000.0, verbose=0)"

In [None]:
# Visualizing the models with pyLDAvis, using the term frequency (tf) vectors
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
# Visualizing the models with pyLDAvis, using the tf-idf vectors
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


Compared to Componenet5

In [None]:
lda_tf_c5 = LatentDirichletAllocation(n_components=5, random_state=0)
lda_tf_c5.fit(dtm_tf)

# for TFIDF DTM
lda_tfidf_c5 = LatentDirichletAllocation(n_components=5, random_state=0)
lda_tfidf_c5.fit(dtm_tfidf)

'''
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_components=20, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)'''


"\nLatentDirichletAllocation(batch_size=128, doc_topic_prior=None,\n             evaluate_every=-1, learning_decay=0.7,\n             learning_method='online', learning_offset=10.0,\n             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,\n             n_jobs=1, n_components=20, perp_tol=0.1, random_state=0,\n             topic_word_prior=None, total_samples=1000000.0, verbose=0)"

In [None]:
# Visualizing the models with pyLDAvis, using the term frequency (tf) vectors
pyLDAvis.sklearn.prepare(lda_tf_c5, dtm_tf, tf_vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
# Visualizing the models with pyLDAvis, using the tf-idf vectors
pyLDAvis.sklearn.prepare(lda_tfidf_c5, dtm_tfidf, tfidf_vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


Compared 'learning_decay'

In [None]:
lda_tf_c5_ld01 = LatentDirichletAllocation(n_components=5, random_state=0, learning_decay=0.01)
lda_tf_c5_ld01.fit(dtm_tf)

lda_tf_c5_ld1 = LatentDirichletAllocation(n_components=5, random_state=0, learning_decay=1)
lda_tf_c5_ld1.fit(dtm_tf)

LatentDirichletAllocation(learning_decay=1, n_components=5, random_state=0)

In [None]:
pyLDAvis.sklearn.prepare(lda_tf_c5_ld01, dtm_tf, tf_vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
pyLDAvis.sklearn.prepare(lda_tf_c5_ld1, dtm_tf, tf_vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
lda_tf_example = LatentDirichletAllocation(n_components=5, random_state=0, learning_decay=0.1, max_doc_update_iter=1000)
lda_tf_example.fit(dtm_tf)

'''
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_components=20, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)'''

pyLDAvis.sklearn.prepare(lda_tf_example, dtm_tf, tf_vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
