In [None]:
#instructions for how to build this using nbdev at https://nbdev.fast.ai/

In [None]:
# default_exp loader

# Load articles

> Loads and holds news articles

In [None]:
#export
import pandas as pd
import os, datetime, re
from nltk.stem.snowball import SnowballStemmer


In [None]:
#hide
from nbdev.showdoc import *
import unittest

# Load News Articles

- Default position is that the news articles are in https://github.com/brockmanmatt/CoverageTrends
- However, additional capabilities should be added to pull different sets of articles

## I can add new methods to my class by just inheriting and overwriting the old class essentially, cool!

In [None]:
#export
class article_holder:
    """Basic unit that loads articles and keeps them around for analysis.

    A new holder starts out with `articleDir` set to None.
    """

    def __init__(self):
        # No article directory is known until one is assigned.
        self.articleDir = None

In [None]:
# A freshly constructed holder has no article directory yet.
test_ah = article_holder()
assert test_ah.articleDir is None

In [None]:
#export
class article_holder(article_holder):
    "extends article_holder with a setter and a getter for articleDir"

    def set_articleDir(self, path):
        "store `path` as this holder's article directory"
        self.articleDir = path

    def get_articleDir(self):
        "return the article directory currently stored on this holder"
        return self.articleDir



In [None]:
# Whatever goes in through the setter must come back unchanged from the getter.
test_ah = article_holder()
test_ah.set_articleDir(path=".")
assert test_ah.get_articleDir() == "."

## Available publications at the moment

In [None]:
# Show the entries of archived_links in the local CoverageTrends checkout.
# The listing is unfiltered, so non-publication entries (e.g. .DS_Store) can appear too.
os.listdir("../CoverageTrends/archived_links")

['yahoonews',
 'chicagotribune',
 'nbcnews',
 'foxnews',
 'forbes',
 'cnbc',
 'sanfransiscochronicle',
 'bostonglobe',
 '.DS_Store',
 'newyorktimes',
 'nydailynews',
 'reuters',
 'bbc',
 'arstechnica',
 'breitbart',
 'washingtonpost',
 'nypost',
 'dailycaller',
 'aljazeera',
 'npr',
 'rt',
 'slate',
 'sputnik',
 'politico',
 'cnn',
 'buzzfeed',
 'abcnews',
 'livescience',
 'techcrunch',
 'dailybeast',
 'newyorker',
 'axios',
 'nationalreview',
 'businessinsider',
 'theatlantic',
 'fortune']

In [None]:
#export
def CoverageTrendsLoader(publications:[str] = [], dateStart:str=None, dateEnd:str=None, lastN:int=None, verbose=False) -> pd.DataFrame:
    """
    Turns CSV of scraped headlines from CoverageTrends into a Pandas Dataframe.
    Expects that CoverageTrends (https://github.com/brockmanmatt/CoverageTrends) is cloned into ../CoverageTrends

    Parameters

    publications: list of publications to try to pull from CoverageTrends CSV, all if [] (or None).
        Requested names that have no folder in the archive are ignored.

    dateStart: String YYYYMMDD for first date of CSV to load for each publication

    dateEnd: String YYYYMMDD for last date of CSV to load for each publication

    lastN: keep only the last lastN CSV files per publication (after date filtering and sorting),
        i.e. min(available files, lastN)

    verbose: accepted for interface compatibility; currently unused

    Returns

    One DataFrame holding the rows of every loaded CSV, with a "source" column naming the
    publication and missing values replaced by "".

    Raises

    Exception: ../CoverageTrends does not exist
    ValueError: no CSV file matched the requested publications / dates
    """

    archive_root = "../CoverageTrends/archived_links"

    # The archive is read from a sibling checkout; fail early with instructions if it is absent.
    if "CoverageTrends" not in os.listdir(".."):
        missingCoverageTrends="CoverageTrends engine requires CoverageTrends"
        missingCoverageTrends+="\nPlease clone https://github.com/brockmanmatt/CoverageTrends to use this option"
        raise Exception(missingCoverageTrends)

    # Publication folders carry no "." in their name; this drops entries such as .DS_Store.
    availablePublications = [x for x in os.listdir(archive_root) if "." not in x]

    # If publications are limited, only go with those (truthiness also tolerates None).
    if publications:
        availablePublications = [x for x in publications if x in availablePublications]

    # Parse the date bounds once instead of once per file.
    first_day = int(dateStart) if dateStart is not None else None
    last_day = int(dateEnd) if dateEnd is not None else None

    loaded_articles = []

    # Loop through each publisher in CoverageTrends and load each day.
    for publisher in availablePublications:

        csvPaths = []

        pubPath = "{}/{}".format(archive_root, publisher)
        for month in os.listdir(pubPath):
            if "." in month:
                continue
            monthPath = "{}/{}".format(pubPath, month)
            for day in os.listdir(monthPath):
                # Only CSV files are headline dumps; stray files (e.g. .DS_Store) would
                # otherwise break the date parsing below or be handed to read_csv.
                if not day.endswith(".csv"):
                    continue
                if first_day is not None or last_day is not None:
                    # File names are expected to look like <name>_<YYYYMMDD>.csv
                    stamp = int(day.split("_")[1][:-4])
                    if first_day is not None and stamp < first_day:
                        continue
                    if last_day is not None and stamp > last_day:
                        continue
                csvPaths.append("{}/{}".format(monthPath, day))

        # Lexicographic path order; assumes month folders and file names sort chronologically.
        csvPaths.sort()

        if lastN is not None:
            # NOTE(review): lastN=0 keeps every file because [-0:] is the full slice.
            csvPaths = csvPaths[-lastN:]

        # A publisher with nothing in the requested window must not abort the whole load
        # (pd.concat raises on an empty list).
        if not csvPaths:
            continue

        publisher_df = pd.concat([pd.read_csv(x) for x in csvPaths], ignore_index=True)
        publisher_df["source"] = publisher
        loaded_articles.append(publisher_df)

    # Same exception type pd.concat would raise on an empty list, but with a usable message.
    if not loaded_articles:
        raise ValueError("No CoverageTrends articles matched the requested publications/dates")

    return pd.concat(loaded_articles).fillna("")
    

In [None]:
# Asking for the last 3 files of one publication should yield exactly 3 distinct
# 8-character date prefixes in the `date` column.
recent_articles = CoverageTrendsLoader(publications=["newyorktimes"], lastN=3)
distinct_days = {stamp[:8] for stamp in recent_articles.date.unique()}
assert len(distinct_days) == 3

In [None]:
#export
class article_holder(article_holder):
    "extends article_holder with load_articles, which fetches articles through an engine and adds stemmed-text columns"

    def load_articles(self, engine=CoverageTrendsLoader, publications:[str] = [], dateStart:str=None, dateEnd:str=None, lastN:int=None, verbose=False) -> []:
        """
        Load articles into self.df and attach normalised text columns.

        All keyword arguments except `engine` are forwarded unchanged to `engine`, whose
        result becomes self.df. Two columns are then merged in, computed once per distinct
        value of the `text` column:

        tokens: the text lower-cased, runs of non a-z characters replaced by a space,
            split on whitespace and each word passed through a Snowball English stemmer

        quickReplace: those tokens joined back together with single spaces

        Raises Exception("holder missing path") when articleDir has not been set.
        """
        if self.articleDir is None:
            raise Exception("holder missing path")

        self.df = engine(publications=publications, dateStart=dateStart, dateEnd=dateEnd, lastN=lastN, verbose=verbose)

        # One row per distinct headline, so each headline is stemmed only once.
        headlines = pd.DataFrame([self.df.text.unique()]).T
        headlines.columns = ["text"]

        stemmer = SnowballStemmer("english", ignore_stopwords=True)

        def stem_headline(headline):
            # Normalise to lower-case letters separated by spaces, then stem word by word.
            words = re.sub('[^a-z]+', " ", headline.lower()).split()
            return [stemmer.stem(word) for word in words]

        stems = headlines["text"].fillna("").apply(stem_headline)
        # quickReplace is assigned before tokens to keep the resulting column order.
        headlines["quickReplace"] = stems.apply(lambda words: " ".join(words))
        headlines["tokens"] = stems

        # Merge on the shared column(s) to spread the per-headline results over every row.
        self.df = self.df.merge(headlines)

In [None]:
# Loading must be refused while the holder has no articleDir.
# The previous version put `assert False` inside the try with a bare `except:`,
# which caught the AssertionError itself, so the cell could never fail.
test_ah = article_holder()
try:
    test_ah.load_articles(publications=["newyorktimes"])
except Exception as err:
    # The articleDir guard in load_articles raises with this message.
    assert str(err) == "holder missing path"
else:
    # Raised outside the try block so nothing can swallow it.
    raise AssertionError("load_articles should raise when articleDir is unset")

In [None]:
# With a path set, loading should succeed. The call is made directly so that any
# failure surfaces with its real exception and traceback instead of a bare `assert False`.
test_ah.set_articleDir(".")
test_ah.load_articles(publications=["newyorktimes"])
# load_articles merges these two derived columns into the loaded frame.
assert {"quickReplace", "tokens"} <= set(test_ah.df.columns)