In [55]:
%matplotlib inline
import numpy as np
import pandas as pd
from time import time
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [56]:
%%time
# create a variable for the 20newsgroups dataset training set
newsgroups_train = fetch_20newsgroups(subset='train')

CPU times: user 309 ms, sys: 128 ms, total: 437 ms
Wall time: 632 ms


In [57]:
type(newsgroups_train)

sklearn.utils.Bunch

# pyLDAvis Analysis

In [58]:
# new variable for LDA analysis, removing meta information and pesky footers
news = fetch_20newsgroups(subset='train', shuffle=True, random_state=28,
                remove=('headers', 'footers', 'quotes'))
news_raw = news.data
print(len(news_raw))

11314


In [59]:
type(news_raw)

list

In [60]:
# shows what variables are associated with an object
dir(news)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [61]:
# The publications 
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [62]:
%%time
## create a limited set of categories of the 20 for analysis
categories = ['misc.forsale', 'talk.politics.mideast', 
              'sci.space', 'rec.sport.baseball', 'comp.graphics']

# create a new variable for the reduced dataset
# to 4 categories for faster analysis testing
news_reduced = fetch_20newsgroups(subset='train', shuffle=True, random_state=28,
                remove=('headers', 'footers', 'quotes'), categories=categories)

CPU times: user 1.97 s, sys: 155 ms, total: 2.13 s
Wall time: 2.69 s


In [63]:
type(news_reduced)

sklearn.utils.Bunch

In [64]:
# no need to reduce the news to 5 categories
len(news_reduced)

5

## Vectorizing the documents

Count Vectorizer:

In [68]:
# sklearn turns a collection of text documents to a matrix of token counts.
# This allows sentences or small paragraphs to become rows.
# TF vecotrizer
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                preprocessor=None,
                                binary=False,
                                # gets ride of a lot of bad characters
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10,
                                encoding='utf-8',)
# fit the model
dtm_tf = tf_vectorizer.fit_transform(news_raw)
# print the shape
print(dtm_tf.shape)

# different token patterns i tried:
# (?u)\\b\\w\\w+\\b
# OG: \b[^\d\W]+\b/g
# "[^a-zA-Z' ]+"
# ^([a-zA-Z]+|\d+|\W)$
# [ \t\n\r\f\v]

(11314, 9144)


In [69]:
news_raw[4]

'I have the following items for sale.  The highest bid for each to arrive\nin my email box by 5:00 pm EDT Wednesday April 21, 1993 gets the item.\n\n1] Skillcraft Senior Chemlab Set 4581\n        Safe for Ages 10 and Up\n        Used little\n        25 bottles of different Chemicals, Plastic Balance, Alcohol Lamp,\n        Test Tube, Litmus Paper\n        Manual with "Over 1100 Experiments"\n        $4 shipping will be added to your bid\n\n2] Eye of the Beholder II\n        Fun game for the IBM PC\n        In original box, with original media and Manual\n        $3 shipping will be added to your bid\n\n3] Clue Book for Eye of the Beholder II\n\tSolve your dilemmas in a hurry and find all the loot\n\t$1 shipping will be added to your bid\n\tFree shipping on this item if you bid highest on this and item 2].\n\n4] Ethernet Transceiver (ST-500 With LanView (AUI to 50 Ohm coaxial))\n        Works fine\n        Has nifty blinking leds for send/receive/collision/power, etc.\n        Built-in 

In [70]:
# news_raw is a list 
type(news_raw)

list

documentation for Count vectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

TF-IDF Vectorizer:

In [71]:
# tf-idf vectorizer
# Converts a collection of raw documents to a matrix of TF-IDF features.
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
# fit the model 
dtm_tfidf = tfidf_vectorizer.fit_transform(news_raw)
# print shape
print(dtm_tfidf.shape)



(11314, 9144)


documentation for tf-idf vectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

# Count Vectorizer Document Visualization

In [87]:
# for TF DTM
lda_tf20 = LatentDirichletAllocation(n_topics=20, random_state=36)
lda_tf20.fit(dtm_tf)

lda_tf10 = LatentDirichletAllocation(n_topics=10, random_state=36)
lda_tf10.fit(dtm_tf)

lda_tf5 = LatentDirichletAllocation(n_topics=5, random_state=36)
lda_tf5.fit(dtm_tf)




LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=5, perp_tol=0.1,
             random_state=36, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

# 20 Topics

In [85]:
#call the model
pyLDAvis.sklearn.prepare(lda_tf20, dtm_tf, tf_vectorizer)

# 10 Topics

In [88]:
pyLDAvis.sklearn.prepare(lda_tf10, dtm_tf, tf_vectorizer)

# 5 Topics

In [89]:
pyLDAvis.sklearn.prepare(lda_tf5, dtm_tf, tf_vectorizer)

# TF-IDF Document Visualization 

In [90]:
# for TFIDF DTM
# 20 topics
lda_tfidf20 = LatentDirichletAllocation(n_topics=20, random_state=36)
lda_tfidf20.fit(dtm_tfidf)

# 10 topics
lda_tfidf10 = LatentDirichletAllocation(n_topics=10, random_state=36)
lda_tfidf10.fit(dtm_tfidf)

# 5 topics
lda_tfidf5 = LatentDirichletAllocation(n_topics=5, random_state=36)
lda_tfidf5.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=5, perp_tol=0.1,
             random_state=36, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

## 20 Topics

In [76]:
pyLDAvis.sklearn.prepare(lda_tfidf20, dtm_tfidf, tfidf_vectorizer)

# 10 Topics

In [91]:
pyLDAvis.sklearn.prepare(lda_tfidf10, dtm_tfidf, tfidf_vectorizer)

# 5 Topics

In [92]:
pyLDAvis.sklearn.prepare(lda_tfidf5, dtm_tfidf, tfidf_vectorizer)

## Insert segue to model evaluation

## Intro Jensen-Shannon?

## Jensen-Shannon Divergence & PCA  -- NEEDS MORE WORK!!

In [None]:
lda_tfidf20, dtm_tfidf, tfidf_vectorizer

In [98]:
type(lda_tfidf20)

sklearn.decomposition.online_lda.LatentDirichletAllocation

In [99]:
type(news_raw)

list

In [115]:
from scipy.spatial.distance import pdist, squareform
from scipy.stats import entropy

def _jensen_shannon(_P, _Q):
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

def js_PCoA(distributions):
    """Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis
    (aka Classical Multidimensional Scaling)

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distributions probabilities.

    Returns
    -------
    pcoa : array, shape (`n_dists`, 2)
    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    return _pcoa(dist_matrix)

In [119]:
js = np.array(news_raw)

In [120]:
type(js)

numpy.ndarray

In [118]:
js = js_PCoA(news_raw)

ValueError: A 2-dimensional array must be passed.

In [None]:
jensen-shannon divergence

In [109]:
pyLDAvis.sklearn.prepare(lda_tfidf5, dtm_tfidf, tfidf_vectorizer, mds='pcoa')

A function that takes topic_term_dists as an input and outputs a n_topics by 2 distance matrix. The output approximates the distance between topics. See js_PCoA() for details on the default function. A string representation currently accepts pcoa (or upper case variant), mmds (or upper case variant) and tsne (or upper case variant), if sklearn package is installed for the latter two.

In [107]:
pyLDAvis.js_PCoA()

ValueError: A 2-dimensional array must be passed.

## Traditional NLP??

In [25]:
## import 20newsgorups as a list?

In [4]:
%%time
from pprint import pprint
# view the dictionary keys
pprint(list(newsgroups_train))

['data', 'filenames', 'target_names', 'target', 'DESCR']
CPU times: user 576 µs, sys: 1.19 ms, total: 1.77 ms
Wall time: 990 µs


In [5]:
%%time
# understanding the length and shape of the file
print(newsgroups_train.filenames.shape)
print(newsgroups_train.target.shape)
print(newsgroups_train.target[:10])

(11314,)
(11314,)
[ 7  4  4  1 14 16 13  3  2  4]
CPU times: user 608 µs, sys: 878 µs, total: 1.49 ms
Wall time: 683 µs


In [59]:
%%time
#from collections import Counter
from sklearn.datasets import fetch_20newsgroups
import re

# define text cleaner function for the dataset
def text_cleaner(line:str):
    '''define the function for a text cleaner to run the news through'''
        # replaces new line with a space
    line = re.sub('\n', ' ', line)
        # removes characters outside of a-z w/ a space
    line = re.sub('[^a-z ]', ' ', line)
        # removes things within a [] bracket and replaces w/ a space
    line = re.sub("[[].*?[]]", ' ', line)
        # removes -- and replaces w/ a space
    line = re.sub(r'--', ' ', line)
        # splits sentences and makes it a list 
    line = ' '.join(line.split())
    return line

# deprecation warning had me remove \ in line 14, "[\[].*?[\]]"
# source: https://github.com/pallets/jinja/issues/646

CPU times: user 12 µs, sys: 1e+03 ns, total: 13 µs
Wall time: 16.9 µs


In [21]:
# not necessary due to tfidf vectorization?

#news_reduced = text_cleaner(news_reduced)

In [13]:
%%time
import spacy
nlp = spacy.load('en')
nlp.max_length = 23065807
news_doc = nlp(" ".join(news_reduced["data"]))

CPU times: user 1min 49s, sys: 1min 2s, total: 2min 52s
Wall time: 4min 8s


In [15]:
%%time
news_sents = [[sent, "data"] for sent in news_doc.sents]

# Create one data frame.
sentences = pd.DataFrame(news_sents)
sentences.head()

CPU times: user 263 ms, sys: 428 ms, total: 691 ms
Wall time: 962 ms


In [6]:
%%time
#from collections import Counter
from sklearn.datasets import fetch_20newsgroups
import re

# define text cleaner function for the dataset
def text_cleaner(line:str):
    '''define the function for a text cleaner to run the news through'''
        # replaces new line with a space
    line = re.sub('\n', ' ', line)
        # removes characters outside of a-z w/ a space
    line = re.sub('[^a-z ]', ' ', line)
        # removes things within a [] bracket and replaces w/ a space
    line = re.sub("[[].*?[]]", ' ', line)
        # removes -- and replaces w/ a space
    line = re.sub(r'--', ' ', line)
        # splits sentences and makes it a list 
    line = ' '.join(line.split())
    return line

# deprecation warning had me remove \ in line 14, "[\[].*?[\]]"
# source: https://github.com/pallets/jinja/issues/646

CPU times: user 14 µs, sys: 1e+03 ns, total: 15 µs
Wall time: 19.1 µs


In [7]:
# saving for pythonic example of a good for loop
#tokens = [token for token in tokens if len(token) > 0]

In [16]:
# all the text is being labeled "data." We need it to say the proper publication
sentences.head()

Unnamed: 0,0,1
0,"(Yes, ,, the, Phobos, mission, did, return, so...",data
1,"(The, best, I, 've, seen, had, a, surface, res...",data
2,"(By, \n, the, way, ,, the, new, book, entitled...",data
3,"(The, chapter, is, co, -, authored, by, V.I., ...",data
4,"(Do, n't, \n, know, of, any, ftp, sites, with,...",data


In [25]:
%%time
# Mike--> code from our session last week
from typing import List
def bag_of_words_spacy(doc:spacy.tokens.doc.Doc)->List:
    """takes spacy doc, returns list of most common 2000 lemmas"""
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in doc
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

# Mike--> code from our session last week
# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features_df(sentences:List, common_words)->pd.DataFrame:
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['label'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(sentences[0]):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [span.lemma_
                 for span in sentence
                 if (
                     not span.is_punct
                     and not span.is_stop
                     and span.lemma_ in common_words
                 )
                ]
        

         #Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df

CPU times: user 8 µs, sys: 1 µs, total: 9 µs
Wall time: 14.3 µs
