In [None]:
#instructions for how to build this using nbdev at https://nbdev.fast.ai/

In [None]:
#default_exp describe

# Describe loaded articles

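> `describer` extends `loader.article_holder` with vectorizer-based summaries of the loaded articles (top words and term co-occurrences).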

In [None]:
#export
from newstrends import loader

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from collections import Counter
import numpy as np

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
class describer(loader.article_holder):
    """Article holder specialised for describing the loaded articles.

    Everything, including `__init__`, is inherited from `loader.article_holder`.
    """

    # Stays None until a vectorizer is assigned to the instance
    vectorizer = None
    # Tag reporting which class in the hierarchy this is
    subclass = "describer"
        

In [None]:
# The plain loader must still construct as the exact base type
tmp = loader.article_holder()
assert type(tmp) == loader.article_holder

In [None]:
# Display the `subclass` tag of the base article_holder instance
tmp.subclass

'article_holder'

In [None]:
# A describer instance should report the overridden `subclass` tag
tmp = describer()
tmp.subclass

'describer'

## Making sure that I can still use article_holder functionality

In [None]:
test_ah = describer()
test_ah.set_articleDir("../CoverageTrends")
# Let a loading error surface with its own traceback instead of masking it
# behind a bare `except: assert False`; the cell still fails either way.
test_ah.load_articles(publications=["newyorktimes"])
assert "quickReplace" in test_ah.df.columns

In [None]:
#export
class describer(describer):
    "Add vectorizer fitting and top-word counting"

    def fitVectorizer(self, vectorizer:CountVectorizer=CountVectorizer, ngram_range=(1,2), max_features=10000):
        """Fit a text vectorizer on `self.df.quickReplace` and store it in `self.vectorizer`.

        Args:
            vectorizer: vectorizer *class* (not an instance); it is constructed with
                `stop_words=self.stopwords`, `ngram_range` and `max_features`.
            ngram_range: forwarded to the vectorizer constructor.
            max_features: forwarded to the vectorizer constructor.

        Raises:
            Exception: "No article data found" when `self.df` has no readable
                `quickReplace` column.
        """
        try:
            # Cheap probe: fails if df is missing, not sliceable, or lacks the column
            _ = self.df[:1].quickReplace
        except Exception as err:
            raise Exception("No article data found") from err

        self.vectorizer = vectorizer(
            stop_words=self.stopwords,
            ngram_range=ngram_range,
            max_features=max_features,
        ).fit(self.df.quickReplace)

    def getTopNWords(self, topN = 10, lastDate=None, window=None, source=None):
        """Return the `topN` highest-scoring vocabulary terms over the selected articles.

        Scores are the column sums of the fitted vectorizer's output, aggregated
        over all selected sources together (not per publication).

        Args:
            topN: number of (term, score) pairs to return.
            lastDate: currently unused.
            window: currently unused.
            source: a single source name or an iterable of source names to keep;
                None keeps every article.

        Returns:
            list of (term, score) tuples, highest score first.

        Raises:
            Exception: if `self.df` is not a DataFrame or no vectorizer was fitted.
        """
        # Check that the holder is properly set up
        if not isinstance(self.df, pd.DataFrame):
            raise Exception("Dataframe not loaded")
        if self.vectorizer is None:
            raise Exception("No vectorizer found")

        # Restrict to the requested sources; a bare string is treated as one source
        if source is None:
            df = self.df
        else:
            sources = [source] if isinstance(source, str) else list(source)
            df = self.df[self.df.source.isin(sources)]

        # Sum each feature column of the vectorized text
        X = self.vectorizer.transform(df.quickReplace)
        # get_feature_names() was removed in scikit-learn 1.2; prefer the new name when present
        names_fn = getattr(self.vectorizer, "get_feature_names_out", None) or self.vectorizer.get_feature_names
        vocab = list(names_fn())
        counts = np.asarray(X.sum(axis=0)).ravel()

        return Counter(dict(zip(vocab, counts))).most_common(topN)


Make sure an exception is thrown when no articles have been loaded

In [None]:
# Fitting without loaded articles must raise. The flag is checked outside the
# try block: a bare `except` around `assert False` would swallow the
# AssertionError too, so the test could never fail.
test_ah = describer()
raised = False
try:
    test_ah.fitVectorizer()
except Exception:
    raised = True
assert raised, "fitVectorizer should raise when no articles are loaded"

Try loading the New York Times data to test the vectorizer

In [None]:
# Load only the newyorktimes articles and fit the vectorizer with its default arguments
test = describer()
test.set_articleDir(path="../CoverageTrends")
test.load_articles(publications=["newyorktimes"])
test.fitVectorizer()

In [None]:
# Called with defaults: no source filter; the output below lists ten (term, count) pairs
test.getTopNWords()


[('trump', 3378),
 ('time', 2846),
 ('citi', 2801),
 ('protest', 2312),
 ('coronavirus', 1773),
 ('pandem', 1610),
 ('presid', 1589),
 ('press', 1458),
 ('polic', 1428),
 ('death', 1403)]

In [None]:
#export
class describer(describer):
    "Add term co-occurrence extraction"

    def generateCoOccurances(self, pubList = ["newyorktimes", "foxnews", "washingtonpost", "cnn", "breitbart", "abcnews", "dailycaller"], verbose=False, topK:"int<100" = 20):
        """Return the term pairs that co-occur in the most rows of `self.df.quickReplace`.

        A term pair co-occurs in a row when both terms appear at least once in
        that row's text. Each unordered pair is reported once, most frequent first.

        Args:
            pubList: currently unused.
            verbose: currently unused.
            topK: number of pairs wanted; capped at 100.

        Returns:
            list of `[term_a, term_b]` lists, at most `min(topK, 100)` long.
            Fewer are returned when fewer pairs ever co-occur.

        Raises:
            Exception: if `self.df` is not a DataFrame or no vectorizer was fitted.
        """
        # Local import: scipy is not among this notebook's top-level imports
        from scipy import sparse

        if not isinstance(self.df, pd.DataFrame):
            raise Exception("Dataframe not loaded")
        if self.vectorizer is None:
            raise Exception("No vectorizer found")

        # A dedicated unigram vectorizer is fitted here; self.vectorizer is only
        # checked for presence above.
        vectorizer = CountVectorizer(stop_words=self.stopwords, max_features=10000).fit(self.df.quickReplace)

        # Reduce counts to presence/absence so each row counts a pair once
        X = vectorizer.transform(self.df.quickReplace)
        X = (X > 0).astype(np.int64)

        # X.T @ X is symmetric with self-counts on the diagonal. The strict upper
        # triangle keeps each unordered pair once and drops the diagonal, and
        # staying sparse avoids materialising a dense vocab x vocab matrix.
        pair_counts = sparse.triu(X.T @ X, k=1).tocoo()

        limit = min(topK, 100)
        if limit <= 0 or pair_counts.nnz == 0:
            return []

        # Stable sort keeps ties in a deterministic order
        best = np.argsort(-pair_counts.data, kind="stable")[:limit]

        # get_feature_names() was removed in scikit-learn 1.2; prefer the new name when present
        names_fn = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
        keys = list(names_fn())

        return [[keys[r], keys[c]] for r, c in zip(pair_counts.row[best], pair_counts.col[best])]


In [None]:
# TODO: getRecentInterestingGroups below has not yet been adapted to work with this framework.

def getRecentInterestingGroups(self, pubList = ["newyorktimes", "foxnews", "washingtonpost", "cnn", "breitbart", "abcnews", "dailycaller"], outdir = "docs"):
        """Group co-occurring term pairs and write a time series and plot per small group.

        Steps, as written:
          1. Get term pairs from `self.generateCoOccurances`.
          2. Merge pairs that share a term into sets of terms.
          3. Keep the sets with fewer than 4 terms as targets.
          4. For each target, select the rows of `self.bigdf` whose `tokens`
             contain every term in the target, count rows per (source, date),
             pickle the unstacked counts under `{outdir}/timeseries` and save a
             line plot under `{outdir}/img`.

        NOTE(review): this is a module-level function taking `self`, not a method
        of `describer`. It uses `datetime`, `timezone`, `plt` and `os`, none of
        which are imported in the import cell visible in this notebook, and
        `self.loadArticles`, `self.buildBigDF` and `self.bigdf`, which are not
        defined in the visible code; presumably they come from an earlier
        version of the project. Confirm before use.
        """
        # NOTE(review): `generateCoOccurances` as defined in this notebook has no
        # `dateStart` parameter, so this call would raise a TypeError.
        vals =self.generateCoOccurances(dateStart=(datetime.datetime.today()-datetime.timedelta(days=1)).strftime("%Y%m%d"), )

        # Merge pairs into groups keyed by a running integer index
        grps = {}
        idx = 0
        for val in vals:
            found = False
            for grp in grps:
                # `continue` only moves on to the next group, so one pair can
                # extend several groups that each contain one of its terms.
                if val[0] in grps[grp]:
                    grps[grp].add(val[1])
                    found=True
                    continue
                elif val[1] in grps[grp]:
                    grps[grp].add(val[0])
                    found=True
                    continue

            # No existing group contains either term: start a new group
            if not found:
                grps[idx] = set()
                grps[idx].add(val[0])
                grps[idx].add(val[1])
                idx +=1

        # Only groups with fewer than 4 terms are used as targets
        myTargets = [x[1] for x in grps.items() if len(x[1]) < 4]
        print("targets: {}".format(myTargets))

        self.loadArticles(pubList=pubList)
        print("building bigdf2")
        self.buildBigDF()

        # UTC timestamp with the last minute digit replaced by "0"
        # (i.e. rounded down to a multiple of ten minutes)
        myTime = datetime.datetime.now(tz=timezone.utc).strftime('%Y%m%d-%H%M')
        myTime = myTime[:-1]
        myTime +="0"

        plt.close('all') #in case of zombies or something
        os.makedirs("{}/img".format(outdir), exist_ok=True)
        os.makedirs("{}/timeseries".format(outdir), exist_ok=True)

        for target_words in myTargets:
            print("making df for {}".format(target_words))
            # The union with target_words has the same size as set(x) exactly
            # when every target word is already among the row's tokens.
            tmp = self.bigdf[self.bigdf.tokens.apply(lambda x: len(set(x))==len(target_words|set(x)))]

            print(len(tmp))

            # Count rows per (source, date)
            tmp.date = pd.to_datetime(tmp.date)
            tmp = tmp.groupby(["source", "date"]).count()["quickReplace"]

            print("making source series for {}".format(target_words))
            tmp.unstack(level=0).fillna(0).to_pickle("{}/timeseries/{}.pkl".format(outdir, "+".join(target_words)))

            print("making plot for {}".format(target_words))
            ax = tmp.unstack(level=0).fillna(0).plot(title="Frontpage mentions of {}".format("+".join(target_words)), figsize=(8,8))
            ax.set_ylabel("frontpage mentions at time")

            # Remove earlier plots for the same target before saving the new one.
            # NOTE(review): the listing uses `outdir` but the removal path is
            # hard-coded to "docs/img"; these differ when outdir != "docs".
            deleteMe = [oldFile for oldFile in os.listdir("{}/img".format(outdir)) if oldFile.endswith("+".join(target_words)+".jpg")]
            for oldFile in deleteMe:
                os.remove("docs/img/{}".format(oldFile))

            ax.figure.savefig("{}/img/{}_{}.jpg".format(outdir, myTime, "+".join(target_words)))
            plt.close('all') #close all figures


In [None]:
# NOTE(review): earlier cells point set_articleDir at "../CoverageTrends";
# confirm that "." is the intended article directory here.
test = describer()
test.set_articleDir(path=".")
test.load_articles(publications=["newyorktimes"])
test.fitVectorizer()

# Asking for 15 pairs should give back exactly 15
pairs = test.generateCoOccurances(topK=15)
assert len(pairs) == 15