In [7]:
%pylab inline
import nltk
import json
import re
import time
import sqlalchemy
import pandas as pd
import warnings
import isbnlib
from bs4 import BeautifulSoup
import requests
from html.parser import HTMLParser
import urllib
from urllib import urlopen

warnings.filterwarnings('ignore')

# record linkage package
import recordlinkage as rl
from recordlinkage.preprocessing import clean, phonenumbers, phonetic

from __future__ import print_function
from six.moves import zip, range 
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, roc_auc_score, auc
from sklearn import preprocessing
from collections import Counter, OrderedDict
from nltk.corpus import stopwords
from nltk import SnowballStemmer

nltk.download('stopwords')

Populating the interactive namespace from numpy and matplotlib
[nltk_data] Downloading package stopwords to
[nltk_data]     /nfshome/ag4215/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
url = 'https://www.consumerreports.org/cro/index.htm'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
data = soup.findAll(text=True)

def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', str(element.encode('utf-8'))):
        return False
    return True

result = filter(visible, data)

In [9]:
RE_PREPROCESS = re.compile( r'\W+|\d+' )

In [10]:
processed_corpus = np.array( [ re.sub( RE_PREPROCESS, ' ', description ).lower() for description in result ] )

In [11]:
eng_stopwords = stopwords.words('english')

In [12]:
def create_bag_of_words( corpus,
                         NGRAM_RANGE = ( 0, 1 ),
                         stop_words = None,
                         stem = False,
                         MIN_DF = 0.005,
                         MAX_DF = 0.95,
                         USE_IDF = False ):

    """
    Turn a corpus of text into a bag-of-words.
    
    Parameters
    -----------
    corpus: ls
        test of documents in corpus    
    NGRAM_RANGE: tuple
        range of N-gram. Default (0,1)
    stop_words: ls
        list of commonly occuring words that have little semantic
        value
    stem: bool
        use a stemmer to stem words
    MIN_DF: float
       exclude words that have a frequency less than the threshold
    MAX_DF: float
        exclude words that have a frequency greater than the threshold
    
    
    Returns
    -------
    bag_of_words: scipy sparse matrix
        scipy sparse matrix of text
    features:
        ls of words
    """
    #parameters for vectorizer 
    ANALYZER = "word" #unit of features are single words rather then phrases of words 
    STRIP_ACCENTS = 'unicode'
    stemmer = nltk.SnowballStemmer("english")

    if stem:
        tokenize = lambda x: [stemmer.stem(i) for i in x.split()]
    else:
        tokenize = None
    vectorizer = CountVectorizer(analyzer=ANALYZER,
                                tokenizer=tokenize, 
                                ngram_range=NGRAM_RANGE,
                                stop_words = stop_words,
                                strip_accents=STRIP_ACCENTS,
                                min_df = MIN_DF,
                                max_df = MAX_DF)
    
    bag_of_words = vectorizer.fit_transform( corpus ) #transform our corpus is a bag of words 
    features = vectorizer.get_feature_names()

    if USE_IDF:
        NORM = None #turn on normalization flag
        SMOOTH_IDF = True #prvents division by zero errors
        SUBLINEAR_IDF = True #replace TF with 1 + log(TF)
        transformer = TfidfTransformer(norm = NORM,smooth_idf = SMOOTH_IDF,sublinear_tf = True)
        #get the bag-of-words from the vectorizer and
        #then use TFIDF to limit the tokens found throughout the text 
        tfidf = transformer.fit_transform(bag_of_words)
        
        return tfidf, features
    else:
        return bag_of_words, features

In [13]:
#create a bag of words with PROCESSED corpus
bag_of_words, features = create_bag_of_words(processed_corpus)

In [14]:
features

[u'all',
 u'and',
 u'best',
 u'car',
 u'cars',
 u'consumer',
 u'cr',
 u'empty',
 u'for',
 u'health',
 u'in',
 u'insurance',
 u'member',
 u'more',
 u'new',
 u'news',
 u'of',
 u'on',
 u'product',
 u'products',
 u'rated',
 u'ratings',
 u'react',
 u'reports',
 u'reviews',
 u'safety',
 u'text',
 u'the',
 u'to',
 u'top',
 u'you']

In [15]:
def get_word_counts( bag_of_words, feature_names ):

    """
    Get the ordered word counts from a bag_of_words
    
    Parameters
    ----------
    bag_of_words: obj
        scipy sparse matrix from CounterVectorizer
    feature_names: ls
        list of words
        
    Returns
    -------
    word_counts: dict
        Dictionary of word counts
    """

    # convert bag of words to array
    np_bag_of_words = bag_of_words.toarray()
    
    # calculate word count.
    word_count = np.sum(np_bag_of_words,axis=0)
    
    # convert to flattened array.
    np_word_count = np.asarray(word_count).ravel()
    
    # create dict of words mapped to count of occurrences of each word.
    dict_word_counts = dict( zip(feature_names, np_word_count) )
    
    # Create ordered dictionary
    orddict_word_counts = OrderedDict( sorted(dict_word_counts.items(), key=lambda x: x[1], reverse=True), )
    
    return orddict_word_counts

In [16]:
get_word_counts(bag_of_words, features)

OrderedDict([(u'and', 44),
             (u'react', 42),
             (u'the', 41),
             (u'to', 33),
             (u'text', 32),
             (u'of', 32),
             (u'cr', 29),
             (u'all', 22),
             (u'for', 21),
             (u'on', 21),
             (u'car', 19),
             (u'in', 18),
             (u'best', 18),
             (u'cars', 16),
             (u'more', 15),
             (u'reviews', 14),
             (u'rated', 13),
             (u'health', 13),
             (u'new', 13),
             (u'products', 13),
             (u'consumer', 13),
             (u'you', 12),
             (u'product', 12),
             (u'ratings', 11),
             (u'top', 10),
             (u'member', 10),
             (u'empty', 10),
             (u'reports', 9),
             (u'insurance', 8),
             (u'safety', 8),
             (u'news', 8)])

In [17]:
processed_bag_of_words, processed_features = create_bag_of_words(processed_corpus, stop_words = eng_stopwords)
dict_processed_word_counts = get_word_counts(processed_bag_of_words, processed_features)
dict_processed_word_counts

OrderedDict([(u'react', 42),
             (u'text', 32),
             (u'cr', 29),
             (u'car', 19),
             (u'best', 18),
             (u'cars', 16),
             (u'reviews', 14),
             (u'rated', 13),
             (u'products', 13),
             (u'new', 13),
             (u'health', 13),
             (u'consumer', 13),
             (u'product', 12),
             (u'ratings', 11),
             (u'top', 10),
             (u'member', 10),
             (u'empty', 10),
             (u'reports', 9),
             (u'safety', 8),
             (u'news', 8),
             (u'insurance', 8)])

In [18]:
df = pd.DataFrame.from_dict(dict_processed_word_counts, orient='index')

In [19]:
df.index.rename('Words', inplace=True)
df.columns = ['Frequency']

In [20]:
df

Unnamed: 0_level_0,Frequency
Words,Unnamed: 1_level_1
react,42
text,32
cr,29
car,19
best,18
cars,16
reviews,14
rated,13
products,13
new,13
