In [77]:
import pandas as pd
from datetime import datetime
import calendar
import json
import os
from collections import defaultdict

In [53]:
# Show full cell contents (article pids are long). `None` means "no limit";
# the older sentinel -1 was deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)

The structure of data is as follows:

        ROOT/
            [MONTH]/
                [DAY]/
                    [COMPANY]/
                        [ARTICLE].txt

In [20]:
def listjoin(*paths):
    """Yield the entry names of the directory obtained by joining ``paths``.

    Parameters
    ----------
    *paths : str
        Path components, combined with ``os.path.join``.

    Yields
    ------
    str
        Bare entry names (not full paths), in sorted order.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` (e.g. missing path or not a directory)
        when the generator is first iterated.
    """
    path = os.path.join(*paths)
    # os.listdir returns entries in arbitrary order; sorting makes the
    # traversal (and anything built from it) reproducible across runs.
    yield from sorted(os.listdir(path))

# Top-level folder of the article corpus.
ROOT = "data"

# Walk the four directory levels (month / day / company / article file) and
# collect one (month, day, company, file name) tuple per article.
records = [
    (month, day, company, article)
    for month in listjoin(ROOT)
    for day in listjoin(ROOT, month)
    for company in listjoin(ROOT, month, day)
    for article in listjoin(ROOT, month, day, company)
]

len(records)

136566

In [17]:
# Per-article feature table, read from the CSV in the working directory.
METADATA_CSV = "_all_features_NELA2017dataset.csv"
metadata = pd.read_csv(METADATA_CSV)
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135602 entries, 0 to 135601
Columns: 284 entries, pid to  MoralityGeneral_title
dtypes: float64(279), int64(2), object(3)
memory usage: 293.8+ MB


In [57]:
# Preview the first rows of the feature table.
metadata.head()

Unnamed: 0,pid,source,date,Happiness,HarmVirtue,HarmVice,FairnessVirtue,FairnessVice,IngroupVirtue,IngroupVice,...,HarmVice_title,FairnessVirtue_title,FairnessVice_title,IngroupVirtue_title,IngroupVice_title,AuthorityVirtue_title,AuthorityVice_title,PurityVirtue_title,PurityVice_title,MoralityGeneral_title
11995,AP--2017-04-17--Absences fitness atmosphere _ new ways to track schools,AP,4/17/2017,5.646,0.001107,0.0,0.0,0.0,0.002215,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11996,AP--2017-04-17--After US strikes Trumps Syria plan starts coming into view,AP,4/17/2017,5.324444,0.005714,0.013333,0.0,0.0,0.007619,0.000952,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11997,AP--2017-04-17--Gorsuch dives into the fray at first Supreme Court arguments,AP,4/17/2017,5.04,0.0,0.0,0.0,0.0,0.00161,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11998,AP--2017-04-17--Pentagons Mattis discussing war aims in Mideast this week,AP,4/17/2017,4.83625,0.0,0.013746,0.0,0.0,0.005155,0.0,...,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11999,AP--2017-04-17--Trump warns North Korea Gotta behave,AP,4/17/2017,5.102222,0.0,0.006431,0.0,0.0,0.002144,0.004287,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


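There are some articles for which there is no metadata: the crawl above found 136,566 article files, but the metadata table has only 135,602 rows.
To identify these articles, we first put the files into a DataFrame and compare its `pid` values with the metadata's.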

In [54]:
# One row per article file found on disk, ordered by pid so the listing
# can be compared side by side with the metadata table.
articles = pd.DataFrame(
    records, columns=["month", "day", "company", "pid"]
).sort_values(by="pid")
articles.pid.head()

11995    AP--2017-04-17--Absences fitness atmosphere _ new ways to track schools.txt     
11996    AP--2017-04-17--After US strikes Trumps Syria plan starts coming into view.txt  
11997    AP--2017-04-17--Gorsuch dives into the fray at first Supreme Court arguments.txt
11998    AP--2017-04-17--Pentagons Mattis discussing war aims in Mideast this week.txt   
11999    AP--2017-04-17--Trump warns North Korea Gotta behave.txt                        
Name: pid, dtype: object

In [55]:
# Order the metadata by pid as well, rebinding the name instead of sorting in place.
metadata = metadata.sort_values(by="pid")
metadata["pid"].head()

11995    AP--2017-04-17--Absences fitness atmosphere _ new ways to track schools     
11996    AP--2017-04-17--After US strikes Trumps Syria plan starts coming into view  
11997    AP--2017-04-17--Gorsuch dives into the fray at first Supreme Court arguments
11998    AP--2017-04-17--Pentagons Mattis discussing war aims in Mideast this week   
11999    AP--2017-04-17--Trump warns North Korea Gotta behave                        
Name: pid, dtype: object

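So the article pids are file names that still carry the `.txt` extension, while the metadata pids do not. To make the two comparable, we strip the extension from the article pids: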

In [59]:
# Remove only the trailing ".txt" extension. The pattern is anchored to the end
# of the string so a ".txt" occurring inside a title is left untouched
# (a plain str.replace(".txt", "") would delete every occurrence).
articles.pid = articles.pid.str.replace(r"\.txt$", "", regex=True)
articles.pid.head(50)

11995    AP--2017-04-17--Absences fitness atmosphere _ new ways to track schools     
11996    AP--2017-04-17--After US strikes Trumps Syria plan starts coming into view  
11997    AP--2017-04-17--Gorsuch dives into the fray at first Supreme Court arguments
11998    AP--2017-04-17--Pentagons Mattis discussing war aims in Mideast this week   
11999    AP--2017-04-17--Trump warns North Korea Gotta behave                        
12000    AP--2017-04-17--Trump welcomes 21000 to his first Easter Egg Roll           
12001    AP--2017-04-17--White House defends transparency after visitor log reversal 
13082    AP--2017-04-18--Ivankas biz prospers as politics mixes with business        
13084    AP--2017-04-18--Pence US wont rest until N Korea gives up nuclear weapons   
13083    AP--2017-04-18--Pence says US stands by ally Japan on North Korea problem   
13085    AP--2017-04-18--Retailers try to grapple with polarizing Ivanka Trump brand 
13086    AP--2017-04-18--The Latest Japan says differe

Now the article pids match the metadata pids, so a pid can be used to locate the article's file on disk:

In [90]:
def article_text(pid, root=None):
    """Return the ``"content"`` field of the article identified by ``pid``.

    Parameters
    ----------
    pid : str
        Identifier of the form ``"<source>--<YYYY-MM-DD>--<title>"``
        (without the ``.txt`` extension).
    root : str, optional
        Top-level data folder. Defaults to the module-level ``ROOT``.

    Returns
    -------
    The value stored under the ``"content"`` key of the article's JSON file.

    Raises
    ------
    ValueError
        If ``pid`` does not have three ``--``-separated parts or the date
        part does not parse as ``%Y-%m-%d``.
    OSError
        If the article file cannot be opened.
    KeyError
        If the JSON document has no ``"content"`` key.
    """
    # maxsplit=2: only the first two "--" are separators, so a title that
    # itself contains "--" no longer breaks the unpacking.
    source, date, _title = pid.split("--", 2)
    dt = datetime.strptime(date, "%Y-%m-%d")
    # Month folders are numbered starting from April: "1_April", "2_May", ...
    month_ix = dt.month
    month_name = str(month_ix - 3) + "_" + calendar.month_name[month_ix]
    base = ROOT if root is None else root
    file = os.path.join(base, month_name, date, source, pid + ".txt")
    # The files have a .txt extension but are parsed as JSON.
    with open(file, "rb") as f:
        text = json.load(f)

    return text["content"]

In [94]:
# Rows published by CNN. The column is looked up as " source", with a leading space.
is_cnn = metadata[" source"] == "CNN"
CNN = metadata[is_cnn]
CNN.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3707 entries, 102 to 134546
Columns: 284 entries, pid to  MoralityGeneral_title
dtypes: float64(279), int64(2), object(3)
memory usage: 8.1+ MB


In [95]:
# Rows published by Fox News. The column is looked up as " source", with a leading space.
is_fox = metadata[" source"] == "Fox News"
FOX = metadata[is_fox]
FOX.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1069 entries, 170 to 93432
Columns: 284 entries, pid to  MoralityGeneral_title
dtypes: float64(279), int64(2), object(3)
memory usage: 2.3+ MB


In [100]:
# Give FOX a fresh 0..n-1 index, keeping the old labels in an "index" column.
# The guard makes the cell safe to re-run: an unconditional in-place reset
# inserts another index column on every run and eventually raises.
# Rebinding the name also avoids mutating a filtered slice of `metadata` in place.
if "index" not in FOX.columns:
    FOX = FOX.reset_index()

# Print every Fox News pid, iterating the values directly rather than
# looking each one up by integer label.
for pid in FOX.pid:
    print(pid)

Fox News--2017-04-01--Company linked to Trump-Russian dossier has its own Kremlin connection US senator says
Fox News--2017-04-01--GOP congressman booed at Salt Lake City town hall
Fox News--2017-04-01--Hannity Media distracting Americans by going after Nunes
Fox News--2017-04-01--Jared Kushner and Ivanka Trump worth up to 700M in combined assets
Fox News--2017-04-01--Putin spokesman Moscows relationship with US may be worse than Cold War
Fox News--2017-04-01--Without Obama once-booming gun industry poised to shrink
Fox News--2017-04-02--EPAs Pruitt Trump will end overreach Obama made up regulatory power
Fox News--2017-04-02--Flynn initially failed to disclose Russia-linked payments on ethics form
Fox News--2017-04-02--Golf Summit Trump tees up talks ObamaCare with Paul Mulvaney
Fox News--2017-04-02--LGBTQ activists held dance party protest outside Ivanka Trumps DC home
Fox News--2017-04-02--McConnell vows Gorsuch confirmation this week says nuclear option in hands of Democrats
Fox New