In [9]:
%matplotlib inline
import numpy as np
import pandas as pd
from time import time
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [10]:
%%time
newsgroups_train = fetch_20newsgroups(subset='train')

CPU times: user 236 ms, sys: 72.2 ms, total: 308 ms
Wall time: 313 ms


In [11]:
type(newsgroups_train)

sklearn.utils.Bunch

In [31]:
news = fetch_20newsgroups(subset='train', shuffle=True, random_state=28,
                remove=('headers', 'footers', 'quotes'))
news_raw = news.data
print(len(news_raw))



11314


In [14]:
%%time
from pprint import pprint
# view the dictionary keys
pprint(list(newsgroups_train))

['data', 'filenames', 'target_names', 'target', 'DESCR']
CPU times: user 425 µs, sys: 316 µs, total: 741 µs
Wall time: 471 µs


In [102]:
# my version of stack overflow
%%time
# np.c_ is the numpy concatenate function
# which is used to concat iris['data'] and iris['target'] arrays 
# for pandas column argument: concat iris['feature_names'] list
# and string list (in this case one string); you can make this anything you'd like..  
# the original dataset would probably call this ['Species']
df_news = pd.DataFrame(data=[newsgroups_train['data'], newsgroups_train['filenames'],
                       newsgroups_train['target_names'], newsgroups_train['target'],
                       newsgroups_train['DESCR']],
                     columns= list(newsgroups_train['data'], newsgroups_train['filenames'],
                       newsgroups_train['target_names'], newsgroups_train['target'],
                       newsgroups_train['DESCR']))

TypeError: list() takes at most 1 argument (5 given)

In [15]:
%%time
# the different article categories
pprint(list(newsgroups_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']
CPU times: user 1.55 ms, sys: 1.84 ms, total: 3.39 ms
Wall time: 2.03 ms


In [16]:
%%time
# understanding the length and shape of the file
print(newsgroups_train.filenames.shape)
print(newsgroups_train.target.shape)
print(newsgroups_train.target[:10])


(11314,)
(11314,)
[ 7  4  4  1 14 16 13  3  2  4]
CPU times: user 885 µs, sys: 1.17 ms, total: 2.05 ms
Wall time: 1.02 ms


In [23]:
%%time
#from collections import Counter
from sklearn.datasets import fetch_20newsgroups
import re

# define text cleaner function for the dataset
def text_cleaner(line:str):
    '''define the function for a text cleaner to run the news through'''
        # replaces new line with a space
    line = re.sub('\n', ' ', line)
        # removes characters outside of a-z w/ a space
    line = re.sub('[^a-z ]', ' ', line)
        # removes things within a [] bracket and replaces w/ a space
    line = re.sub("[[].*?[]]", ' ', line)
        # removes -- and replaces w/ a space
    line = re.sub(r'--', ' ', line)
        # splits sentences and makes it a list 
    line = ' '.join(line.split())
    return line

# deprecation warning had me remove \ in line 14, "[\[].*?[\]]"
# source: https://github.com/pallets/jinja/issues/646

CPU times: user 20 µs, sys: 1e+03 ns, total: 21 µs
Wall time: 28.1 µs


In [25]:
%%time
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text:str):
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in line
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

CPU times: user 8 µs, sys: 1 µs, total: 9 µs
Wall time: 14.3 µs


In [40]:
# saving for pythonic example of a good for loop
#tokens = [token for token in tokens if len(token) > 0]

In [27]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
## create categories 
categories = ['misc.forsale', 'talk.politics.mideast', 
              'sci.space', 'rec.sport.baseball']

# create a new variable for the reduced dataset
# to 4 categories for faster NLP
news_reduced = fetch_20newsgroups(subset='train', shuffle=True, random_state=28,
                remove=('headers', 'footers', 'quotes'), categories=categories)

CPU times: user 1.39 s, sys: 72.3 ms, total: 1.46 s
Wall time: 1.47 s


In [36]:
# sklearn turns a collection of text documents to a matrix of token counts
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                preprocessor=None,
                                binary=False,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10,
                                encoding='utf-8',)
dtm_tf = tf_vectorizer.fit_transform(news_raw)
print(dtm_tf.shape)

# documentation: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

(11314, 9144)


In [34]:
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(news_raw)
print(dtm_tfidf.shape)



(11314, 9144)


In [40]:
type(news_reduced)

sklearn.utils.Bunch

In [39]:
news_reduced = text_cleaner(news_reduced)

TypeError: expected string or bytes-like object

In [41]:
%%time
import spacy
nlp = spacy.load('en')
nlp.max_length = 23065807
news_doc = nlp(" ".join(news_reduced["data"]))

CPU times: user 1min 50s, sys: 52.7 s, total: 2min 42s
Wall time: 3min 59s


In [42]:
type(nlp)

spacy.lang.en.English

In [43]:
%%time
news_sents = [[sent, "data"] for sent in news_doc.sents]

# Create one data frame.
sentences = pd.DataFrame(news_sents)
sentences.head()

CPU times: user 287 ms, sys: 637 ms, total: 924 ms
Wall time: 1.94 s


In [44]:
# all the text is being labeled "data." We need it to say the proper publication
sentences.head()

Unnamed: 0,0,1
0,"(Yes, ,, the, Phobos, mission, did, return, so...",data
1,"(The, best, I, 've, seen, had, a, surface, res...",data
2,"(By, \n, the, way, ,, the, new, book, entitled...",data
3,"(The, chapter, is, co, -, authored, by, V.I., ...",data
4,"(Do, n't, \n, know, of, any, ftp, sites, with,...",data
