In [4]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as BSoup
from newspaper import Article

class HtmlPageLoader(object):
    """Static helpers that fetch a URL and return it as raw HTML, a parsed
    ``newspaper`` article, or a list of pandas DataFrames (one per table)."""

    @staticmethod
    def getPageHtml(url, timeout=30):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before ``requests``
                raises a timeout error (previously the call could block
                indefinitely).

        Returns:
            The decoded response body as ``str``.
        """
        header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        response = requests.get(url, headers=header, timeout=timeout)
        raw = response.content
        try:
            # UTF-8 first, as before.
            return raw.decode()
        except UnicodeDecodeError:
            # Page is not valid UTF-8: fall back to the encoding detected by
            # requests instead of crashing, replacing undecodable bytes.
            return raw.decode(response.apparent_encoding or 'utf-8', errors='replace')

    @staticmethod
    def getPageArticle(url):
        """Download and parse ``url`` with ``newspaper``.

        Returns:
            The parsed ``Article`` object itself (not just its text), so the
            caller can read ``.text`` or any other attribute.
        """
        art = Article(url)
        art.download()
        art.parse()
        return art

    @staticmethod
    def getPageTable(url, tbl_id=None):
        """Return the HTML tables found at ``url`` as a list of DataFrames.

        Args:
            url: Address of the page to fetch.
            tbl_id: Optional HTML ``id``. When given, only markup inside the
                element(s) carrying that id is handed to ``pd.read_html``.

        Returns:
            The list produced by ``pd.read_html``.

        Raises:
            ValueError: If ``tbl_id`` is given but no element has that id
                (``pd.read_html`` itself also raises ``ValueError`` when no
                table is found).
        """
        html = HtmlPageLoader.getPageHtml(url)
        if tbl_id:
            soup = BSoup(html, 'html.parser')
            matches = soup.find_all(id=tbl_id)
            if not matches:
                raise ValueError("No element with id %r found at %s" % (tbl_id, url))
            # read_html needs markup text, not a bs4 ResultSet.
            html = "".join(str(node) for node in matches)

        # Wrap in StringIO: passing literal HTML strings to read_html is
        # deprecated in recent pandas releases.
        dfs = pd.read_html(StringIO(html))

        return dfs

In [3]:
from nltk import tokenize, FreqDist, pos_tag, pos_tag_sents
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
import string, re

##TODO: compare with nltk sentiment analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as Sentimentor 

class MyCorpora(object):
    """Text statistics over a single document stored in ``self.text``.

    This base class never assigns ``self.text``; a subclass (or the caller)
    must set it before any text-derived property or method is used.

    Args:
        lemmatize: Stored on the instance; not consulted by this class.
        remove_numbers: When true, ``clean_nonwords`` keeps only tokens for
            which ``str.isalpha()`` is true.
        stop_words: Falsy disables stop-word removal. A string is used as the
            NLTK stop-word language name. ``True`` means ``'english'``. Any
            other iterable is treated as an explicit collection of stop words.
    """

    def __init__(self, lemmatize=False, remove_numbers=False, stop_words='english'):
        self.lemmatize = lemmatize
        self.remove_numbers = remove_numbers
        self.stop_words = stop_words

    def _stop_word_set(self):
        """Return the active stop words as a lowercase set (empty if disabled)."""
        if not self.stop_words:
            return set()
        if self.stop_words is True:
            return set(stopwords.words('english'))
        if isinstance(self.stop_words, str):
            return set(stopwords.words(self.stop_words))
        return {str(w).lower() for w in self.stop_words}

    ###--- OPTS: TODO: regex
    ## 1. remove puncts in all    TODO: keep - and _ iff join words
    ## 2. remove numbers
    def clean_nonwords(self, input_txt):
        """Filter a token list down to word-like tokens.

        Drops empty tokens and tokens made up solely of punctuation
        characters, then (optionally) non-alphabetic tokens and stop words.
        Stop words are matched case-insensitively. The surviving tokens are
        returned with their original casing.

        Args:
            input_txt: Iterable of string tokens.

        Returns:
            A new list with the surviving tokens, in input order.
        """
        punct = set(string.punctuation)
        res = [w for w in input_txt if w and not all(ch in punct for ch in w)]
        if self.remove_numbers:
            res = [w for w in res if w.isalpha()]
        # Build the stop-word set once per call rather than once per token.
        stops = self._stop_word_set()
        if stops:
            res = [w for w in res if w.lower() not in stops]
        return res

    @property
    def sentences(self):
        """List of sentences produced by NLTK's sentence tokenizer."""
        return tokenize.sent_tokenize(self.text)

    @property
    def words(self):
        """Lower-cased, stripped tokens of the text after ``clean_nonwords``."""
        return [w.lower().strip() for w in self.clean_nonwords(tokenize.word_tokenize(self.text))]

    @property
    def lemmas(self):
        """WordNet lemmas of ``words``, keeping only lemmas longer than two characters."""
        lemm = WordNetLemmatizer()
        return [wl for wl in (lemm.lemmatize(w) for w in self.words) if len(wl) > 2]

    @property
    def vocab(self):
        """Sorted list of the distinct entries in ``words``."""
        return sorted(set(self.words))

    @property
    def lexical_diversity(self):
        """Ratio of distinct words to total words; ``0.0`` when there are no words."""
        words = self.words  # tokenise once
        if not words:
            return 0.0
        return len(set(words)) / len(words)

    @property
    def mean_word_length(self):
        """Mean character length over the distinct words; ``0.0`` when there are none."""
        vocab = self.vocab
        if not vocab:
            return 0.0
        return np.mean([len(w) for w in vocab])

    @property
    def mean_sentence_length(self):
        """Mean ``len()`` of the sentence strings; ``0.0`` when there are no sentences."""
        sentences = self.sentences
        if not sentences:
            return 0.0
        return np.mean([len(s) for s in sentences])

    @property
    def freq_dist(self):
        """``FreqDist`` built from ``words``."""
        return FreqDist(self.words)

    def top_n_words(self, top_n=None):
        """Return ``(word, count)`` pairs, most frequent first.

        Args:
            top_n: Maximum number of pairs; ``None`` returns all of them.
        """
        return FreqDist(self.words).most_common(top_n)

    def long_words(self, min_len=7, sort_by_freq=False):
        """Return ``(word, count)`` for distinct words of at least ``min_len`` characters.

        Args:
            min_len: Minimum word length (inclusive).
            sort_by_freq: When false (default) the result is alphabetical.
                When true it is ordered by descending count, alphabetical
                within equal counts.
        """
        words = self.words  # tokenise once
        fdist = FreqDist(words)
        res = [(w, fdist[w]) for w in sorted(set(words)) if len(w) >= min_len]
        if sort_by_freq:
            # list.sort is stable, so ties keep their alphabetical order.
            res.sort(key=lambda pair: pair[1], reverse=True)
        return res

    def common_words(self, min_freq=30):
        """Return ``(word, count)`` for distinct words occurring at least ``min_freq`` times, alphabetically."""
        words = self.words  # tokenise once
        fdist = FreqDist(words)
        return [(w, fdist[w]) for w in sorted(set(words)) if fdist[w] >= min_freq]

    def sentences_with_word(self, word):
        """Return the sentences that contain ``word`` as a token (case-insensitive).

        Args:
            word: The word to look for; surrounding whitespace is ignored.

        Returns:
            A list of sentence strings, in document order.
        """
        target = word.lower().strip()
        return [
            sent for sent in self.sentences
            if target in (tok.lower() for tok in tokenize.word_tokenize(sent))
        ]

    # ref: https://github.com/cjhutto/vaderSentiment
    # ref: https://medium.com/analytics-vidhya/simplifying-social-media-sentiment-analysis-using-vader-in-python-f9e6ec6fc52f
    @property
    def sentiment_score(self):
        """Result of VADER's ``polarity_scores`` for the whole text."""
        sz = Sentimentor()
        return sz.polarity_scores(self.text)

    ## TODO: is naive at present
    def pos_tag_stats(self):
        """Compute per-sentence part-of-speech statistics, averaged over sentences.

        For each sentence the lower-cased tokens are POS-tagged, then:
          * ``i_counts``  - number of tokens ``'i'`` tagged ``'NN'``
          * ``axn_ratio`` - share of tokens tagged ``'VBD'``
          * ``adv_ratio`` - share of tokens tagged ``'RB'``

        The result is stored in ``self.pos_stats`` and returned. Sentences
        that tokenise to nothing are skipped; with no usable sentence all
        three values are ``0.0``.
        """
        per_sentence = []
        for sent in self.sentences:
            tokenz = [w.lower() for w in tokenize.word_tokenize(sent)]
            slen = len(tokenz)
            if slen == 0:
                continue  # nothing to tag; avoids dividing by zero
            tagz = pos_tag(tokenz)
            # NOTE(review): 'i' is only counted when tagged 'NN' - presumably
            # because the lower-cased pronoun gets that tag; confirm.
            iz = sum(1 for w, t in tagz if t == 'NN' and w == 'i')
            axnz = sum(1 for w, t in tagz if t == 'VBD') / slen
            advz = sum(1 for w, t in tagz if t == 'RB') / slen
            per_sentence.append((iz, axnz, advz))

        if per_sentence:
            i_counts, axn_ratio, adv_ratio = (np.mean(col) for col in zip(*per_sentence))
        else:
            i_counts = axn_ratio = adv_ratio = 0.0

        self.pos_stats = {
            'i_counts': i_counts,
            'axn_ratio': axn_ratio,
            'adv_ratio': adv_ratio,
        }
        return self.pos_stats
        
class UrlMixin():
    """Mixin that records a source URL and runs ``parseText`` on construction.

    Intended to be listed before the real base class so that
    ``super().__init__`` reaches that base through the MRO.

    Args:
        url: Stored as ``self.url`` before ``parseText`` is called.
        **kwargs: Forwarded unchanged to the next ``__init__`` in the MRO.
    """
    def __init__(self, url, **kwargs):
        # Initialise the cooperating base class with its own keyword
        # arguments only; passing the instance positionally would land in the
        # base's first parameter (and raises TypeError when the base is object).
        super().__init__(**kwargs)
        self.url = url
        self.parseText()

    def parseText(self):
        """Hook for subclasses; the default does nothing."""
        pass
            


class MyCorporaText(MyCorpora):
    """Corpus built directly from an in-memory string.

    Args:
        plain_text: The document text; stored as ``self.text``.
        lemmatize, remove_numbers, stop_words: Passed through to ``MyCorpora``.
    """
    def __init__(self, plain_text, lemmatize=False, remove_numbers=False, stop_words='english'):
        # Forward the caller's stop_words instead of a hard-coded 'english',
        # so stop_words=None (or another value) is honoured.
        super(MyCorporaText, self).__init__(lemmatize=lemmatize, remove_numbers=remove_numbers, stop_words=stop_words)
        self.text = plain_text
        self.url = 'txt'  # placeholder source marker for plain-text corpora

        
        
class MyCorporaArticle(UrlMixin, MyCorpora):
    """Corpus whose text comes from an article page.

    ``src_text`` is handed to the parent initialiser chain, which stores it
    as ``self.url`` and then invokes ``parseText``.
    """

    def __init__(self, src_text):
        super().__init__(src_text)

    def parseText(self):
        """Fetch the article for ``self.url``; keep it and its text on the instance."""
        fetched = HtmlPageLoader.getPageArticle(self.url)
        self.article = fetched
        self.text = fetched.text

        
        
class MyCorporaTable(UrlMixin, MyCorpora):
    """Corpus whose text is assembled from the HTML tables of a page.

    Args:
        src_text: Page address, passed to the parent initialiser chain
            (which stores it as ``self.url`` and calls ``parseText``).
        tbl_id: Optional HTML id forwarded to ``HtmlPageLoader.getPageTable``.
        txt_col: Optional column label, or list of labels, to take text from.
            When falsy, every column is used.
    """
    def __init__(self, src_text, tbl_id=None, txt_col=None):
        # These must exist before the parent initialiser runs, because it
        # triggers parseText(), which reads them.
        self.tbl_id = tbl_id
        self.txt_col = txt_col
        super(MyCorporaTable, self).__init__(src_text)

    def parseText(self):
        """Load the tables into ``self.dframe`` and flatten them into ``self.text``.

        All tables returned for the page are concatenated. Each row becomes
        one line of text: its cells are stringified and joined with spaces,
        skipping cells whose string form is ``nan``. Finally brackets and
        commas are replaced by spaces.
        """
        tables = HtmlPageLoader.getPageTable(self.url, tbl_id=self.tbl_id)
        self.dframe = pd.concat(tables)  ##concat Vs use first tbl

        if self.txt_col:
            # Always select with a list so each row is a list of cells. A bare
            # label would give scalar cells, which the join below would then
            # iterate character by character (or fail on for NaN).
            cols = self.txt_col if isinstance(self.txt_col, list) else [self.txt_col]
            rows = self.dframe[cols].values.tolist()
        else:
            rows = self.dframe.values.tolist()

        lines = [
            " ".join(str(cell) for cell in row if str(cell).lower() not in ['nan',])
            for row in rows
        ]  ##TODO better
        # Raw string: "\[" in a normal string literal is an invalid escape.
        self.text = re.sub(r"[ \[\], ]", " ", "\n".join(lines))
  
        