# In [1]:
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, word_tokenize


# In [2]:
class clean():
    ''' Chainable helpers for cleaning text scraped from the web.

    Every cleaning method rewrites ``self.text`` in place and returns
    ``self``, so calls can be chained, e.g.
    ``clean(raw).remove_html().make_lower_case().text``.
    '''
    def __init__(self, text):
        ''' Store the text that the cleaning methods operate on.

        text: the raw string to clean.
        '''
        self.text = text

    def removeNonAscii(self):
        ''' Drop every non-ASCII character (code point 128 and above).

        Returns self so that calls can be chained.
        '''
        self.text = "".join(ch for ch in self.text if ord(ch) < 128)
        return self

    def make_lower_case(self):
        ''' Convert the text to lower case.

        Returns self so that calls can be chained.
        '''
        self.text = self.text.lower()
        return self

    def remove_stop_words(self):
        ''' Remove English stop words (NLTK's "english" stop word list).

        The text is split on whitespace and re-joined with single spaces, so
        runs of whitespace are collapsed as a side effect. Matching against
        the stop word list is exact (case-sensitive).

        Returns self so that calls can be chained.
        '''
        # Build the set once so each membership test is O(1).
        stops = set(stopwords.words("english"))
        kept = [w for w in self.text.split() if w not in stops]
        self.text = " ".join(kept)
        return self

    def remove_html(self):
        ''' Strip every html tag while keeping the content between tags.

        Returns self so that calls can be chained.
        '''
        # Non-greedy match so each "<...>" tag is removed on its own instead
        # of swallowing everything between the first "<" and the last ">".
        html_pattern = re.compile(r'<.*?>')
        self.text = html_pattern.sub('', self.text)
        return self

    def remove_punctuation(self):
        ''' Remove punctuation by keeping only runs of word characters.

        Tokens are the matches of the regex ``\\w+`` (so underscores are
        kept) and are re-joined with single spaces.

        Returns self so that calls can be chained.
        '''
        tokenizer = RegexpTokenizer(r'\w+')
        self.text = " ".join(tokenizer.tokenize(self.text))
        return self

# In [3]:
class prep():
    ''' Prepares a piece of text for downstream applications by exposing
    tokenizing, stemming, lemmatizing and part-of-speech tagging helpers.
    '''
    def __init__(self, text):
        ''' Keep a reference to the text that every helper works on.
        '''
        self.text = text

    def tokenize(self):
        ''' Split the stored text into a list of tokens with NLTK's
        word_tokenize.
        '''
        tokens = word_tokenize(self.text)
        return tokens

    def stemming(self):
        ''' Return the Porter stem of every token, in token order.
        '''
        to_stem = PorterStemmer().stem
        return list(map(to_stem, self.tokenize()))

    def lemmatizing(self):
        ''' Return the WordNet lemma of every token, in token order. Each
        token is lemmatized with the lemmatizer's default settings.
        '''
        to_lemma = WordNetLemmatizer().lemmatize
        return list(map(to_lemma, self.tokenize()))

    def word_tagging(self):
        ''' Return the result of nltk.pos_tag applied to the tokens, i.e. the
        tokens labelled with their part of speech (noun, verb, etc.).
        '''
        tokens = self.tokenize()
        return nltk.pos_tag(tokens)

# In [4]:
class visual():
    ''' A class to visualize text using the bag of words.

    NOTE(review): ``sns``, ``WordCloud`` and ``STOPWORDS`` are used below but
    are not imported in the visible part of this file -- presumably seaborn
    and the wordcloud package; confirm they are imported elsewhere.
    '''
    def __init__(self, tokens):
        ''' Store the bag of words.

        tokens: an iterable of hashable tokens (e.g. a list of words).
        '''
        self.tokens = tokens

    def word_freq(self, reverse=True):
        ''' Count how often each token occurs and order the counts.

        reverse: True (default) orders from most to least frequent, False
                 from least to most frequent.

        The ordered mapping is also stored on ``self.sorted_freq``.
        Returns a dict mapping token -> count.
        '''
        freq = Counter(self.tokens)
        self.sorted_freq = dict(
            sorted(freq.items(), key=lambda item: item[1], reverse=reverse)
        )
        return self.sorted_freq

    def _frequencies(self):
        ''' Return the stored frequencies, computing them on first use so the
        plotting methods do not fail when word_freq() was never called.
        '''
        cached = getattr(self, 'sorted_freq', None)
        return self.word_freq() if cached is None else cached

    def freq_plot(self, top=25):
        ''' Show a horizontal bar chart of the most frequent words and their
        counts within the text.

        top: how many of the most frequent words to plot.
        '''
        # Rank by count here so the chart always shows the most frequent
        # words, even if word_freq(reverse=False) stored an ascending order.
        ranked = sorted(
            self._frequencies().items(), key=lambda item: item[1], reverse=True
        )[:top]
        top_words = [word for word, _ in ranked]
        top_freq = [count for _, count in ranked]
        sns.barplot(y=top_words, x=top_freq)
        plt.show()

    def word_cloud(self):
        ''' Draw a word cloud from the word frequencies of the bag of words.

        The image is drawn on a new matplotlib figure with the axes hidden.
        This method does not call plt.show() itself.
        '''
        wordcloud = WordCloud(width=3000, height=2000, random_state=1,
                              background_color='black', colormap='Set2',
                              collocations=False, stopwords=STOPWORDS)
        wordcloud.generate_from_frequencies(self._frequencies())
        plt.figure(figsize=(12, 8))
        plt.imshow(wordcloud)
        plt.axis("off")