In [1]:
import os

In [5]:
# The archive has to be downloaded manually using the link in the README.
if not os.path.exists('data/aylien_covid_news_data.tar.gz'):
    # Raise instead of calling exit(): inside a Jupyter kernel exit() ends the
    # session, while an exception stops the run and keeps the message visible.
    raise FileNotFoundError('Please download the data first.')

# tar is invoked without a target directory, so the members land in the current
# working directory (not in data/). Check for the extracted file there;
# otherwise the archive is unpacked again on every run.
if not os.path.exists('aylien_covid_news_data.jsonl'):
    status = os.system('tar -xvf data/aylien_covid_news_data.tar.gz')
    if status != 0:
        raise RuntimeError('Failed to extract data/aylien_covid_news_data.tar.gz')
    # The archive also contains a README.txt that is not needed afterwards;
    # os.remove works on every platform, unlike shelling out to rm.
    if os.path.exists('README.txt'):
        os.remove('README.txt')

aylien_covid_news_data.jsonl
README.txt


In [9]:
import jsonlines
import pandas as pd
import numpy as np

# Keep only the articles published by the six outlets listed below.
target_sources = ['CNN', 'Fox News', 'Huffington Post US',
                  'Breitbart', 'The New York Times', 'New York Post']

aylien_json_path = 'aylien_covid_news_data.jsonl'
articles = []
i = 0  # number of articles kept so far
with jsonlines.open(aylien_json_path) as reader:
    for obj in reader:
        source = obj['source']['name']
        if source not in target_sources:
            continue
        summary = obj['summary']['sentences']
        # every retained article gets the same fixed country label
        country = 'US'
        articles.append((
            obj['title'],
            source,
            obj['published_at'],
            obj['language'],
            country,
            tuple(summary),  # stored as a tuple rather than a list
        ))
        i += 1
        # progress report after every 2000 retained articles
        if i % 2000 == 0:
            print(i, source)

# one row per retained article
data_raw = pd.DataFrame(
    articles,
    columns=['title', 'source', 'date', 'language', 'country', 'summary'],
)

2000 CNN
4000 CNN
6000 New York Post
8000 CNN
10000 The New York Times
12000 Breitbart
14000 The New York Times
16000 CNN
18000 Fox News
20000 CNN
22000 CNN
24000 The New York Times
26000 Fox News
28000 CNN
30000 Huffington Post US
32000 CNN
34000 CNN
36000 Fox News
38000 Fox News
40000 Fox News
42000 New York Post
44000 CNN
46000 Huffington Post US
48000 CNN
50000 Fox News
52000 CNN
54000 New York Post
56000 New York Post
58000 CNN
60000 New York Post
62000 Huffington Post US
64000 CNN
66000 The New York Times
68000 Fox News
70000 CNN
72000 Fox News
74000 New York Post


In [10]:
import copy
# Work on an independent copy so that data_raw itself is never modified.
data = data_raw.copy(deep=True)
data.head()

Unnamed: 0,title,source,date,language,country,summary
0,Celebrity livestreams you can watch while soci...,Fox News,2020-04-05 23:55:51+00:00,en,US,(REESE WITHERSPOON SAYS SHE'S TRYING 'TO BE PA...
1,US stock futures rise and oil drops 8.5% after...,CNN,2020-04-05 23:49:03+00:00,en,US,"(US stock futures were up Sunday evening., But..."
2,Washington governor slams Trump administration...,CNN,2020-04-05 23:49:03+00:00,en,US,((CNN) Washington state Gov. Jay Inslee decrie...
3,Rita Wilson performs for first time after coro...,CNN,2020-04-05 23:49:03+00:00,en,US,"(Wilson was filmed singing ""The Star-Spangled ..."
4,Lesson of Maurice Stokes-Jack Twyman bond reso...,New York Post,2020-04-05 23:42:10+00:00,en,US,(The celebration was going to culminate in the...


In [11]:
# display the (rows, columns) dimensions of the working dataframe
data.shape

(75198, 6)

In [12]:
# rename the sources to short identifiers
source_map = {'CNN': 'cnn',
             'Fox News': 'fox',
             'Huffington Post US':'huff',
             'Breitbart': 'breit',
             'The New York Times': 'nyt',
             'New York Post': 'nyp'}
# Fall back to the current value when a name has no entry in source_map.
# A plain .map(source_map) yields NaN for unknown keys, so re-running this cell
# (when the column already holds the short names) would wipe the whole column.
data['source'] = data['source'].map(lambda name: source_map.get(name, name))

In [13]:
# preview the first rows to check the renamed source column
data.head()

Unnamed: 0,title,source,date,language,country,summary
0,Celebrity livestreams you can watch while soci...,fox,2020-04-05 23:55:51+00:00,en,US,(REESE WITHERSPOON SAYS SHE'S TRYING 'TO BE PA...
1,US stock futures rise and oil drops 8.5% after...,cnn,2020-04-05 23:49:03+00:00,en,US,"(US stock futures were up Sunday evening., But..."
2,Washington governor slams Trump administration...,cnn,2020-04-05 23:49:03+00:00,en,US,((CNN) Washington state Gov. Jay Inslee decrie...
3,Rita Wilson performs for first time after coro...,cnn,2020-04-05 23:49:03+00:00,en,US,"(Wilson was filmed singing ""The Star-Spangled ..."
4,Lesson of Maurice Stokes-Jack Twyman bond reso...,nyp,2020-04-05 23:42:10+00:00,en,US,(The celebration was going to culminate in the...


In [14]:
# remove repeated articles: first those sharing a title, then, among the
# remaining rows, those sharing a summary (the first occurrence is kept)
data = data.drop_duplicates(subset=['title'])
data = data.drop_duplicates(subset=['summary'])

In [15]:
# normalize the text
def prepocess_summary(a):
    """Join the sentences of a summary into one normalized string.

    Args:
        a: An iterable of sentence strings, or an already joined string
            (which is normalized as-is rather than joined again).

    Returns:
        str: The sentences separated by single spaces, NFKD-normalized, with
        newlines turned into spaces and runs of spaces collapsed to one.
    """
    import re
    import unicodedata

    # A str is itself iterable: joining it would put a space between every
    # character, e.g. when the cell applying this function is run twice.
    if not isinstance(a, str):
        a = ' '.join(a)
    # Normalize and replace newlines first; both steps can introduce spaces
    # (NFKD maps e.g. a no-break space to a plain space), so the collapsing
    # of repeated spaces has to come last.
    a = unicodedata.normalize("NFKD", a)
    a = a.replace('\n', ' ')
    a = re.sub(' +', ' ', a)
    return a
data['summary'] = data['summary'].apply(prepocess_summary)

# keep only the rows whose summary and title are both non-empty strings
data = data[(data['summary'] != '') & (data['title'] != '')]

In [16]:
# preview the first rows to check the flattened summary text
data.head()

Unnamed: 0,title,source,date,language,country,summary
0,Celebrity livestreams you can watch while soci...,fox,2020-04-05 23:55:51+00:00,en,US,REESE WITHERSPOON SAYS SHE'S TRYING 'TO BE PAT...
1,US stock futures rise and oil drops 8.5% after...,cnn,2020-04-05 23:49:03+00:00,en,US,US stock futures were up Sunday evening. But o...
2,Washington governor slams Trump administration...,cnn,2020-04-05 23:49:03+00:00,en,US,(CNN) Washington state Gov. Jay Inslee decried...
3,Rita Wilson performs for first time after coro...,cnn,2020-04-05 23:49:03+00:00,en,US,"Wilson was filmed singing ""The Star-Spangled B..."
4,Lesson of Maurice Stokes-Jack Twyman bond reso...,nyp,2020-04-05 23:42:10+00:00,en,US,The celebration was going to culminate in the ...


In [17]:
# index the articles with a running number 0..n-1 in a leading 'idx' column
# Drop an 'idx' column left over from an earlier run of this cell first:
# DataFrame.insert raises ValueError when the column already exists.
data = data.drop(columns=['idx'], errors='ignore')
data.insert(0, 'idx', np.arange(data.shape[0]))
data.head()

Unnamed: 0,idx,title,source,date,language,country,summary
0,0,Celebrity livestreams you can watch while soci...,fox,2020-04-05 23:55:51+00:00,en,US,REESE WITHERSPOON SAYS SHE'S TRYING 'TO BE PAT...
1,1,US stock futures rise and oil drops 8.5% after...,cnn,2020-04-05 23:49:03+00:00,en,US,US stock futures were up Sunday evening. But o...
2,2,Washington governor slams Trump administration...,cnn,2020-04-05 23:49:03+00:00,en,US,(CNN) Washington state Gov. Jay Inslee decried...
3,3,Rita Wilson performs for first time after coro...,cnn,2020-04-05 23:49:03+00:00,en,US,"Wilson was filmed singing ""The Star-Spangled B..."
4,4,Lesson of Maurice Stokes-Jack Twyman bond reso...,nyp,2020-04-05 23:42:10+00:00,en,US,The celebration was going to culminate in the ...


In [18]:
# parse the publication timestamps and derive calendar features from them
data['date'] = pd.to_datetime(data['date'])
data['month'] = data['date'].dt.month
# Series.dt.week is deprecated (and removed in pandas 2.x); the ISO calendar
# week is available through dt.isocalendar() instead.
week = data['date'].dt.isocalendar().week
# isocalendar() returns a nullable UInt32 column; cast it the way the old
# accessor did: int64 normally, float64 when missing dates are present.
data['week'] = week.astype('float64' if week.hasnans else 'int64')
data.head()

  data['week'] = data['date'].dt.week


Unnamed: 0,idx,title,source,date,language,country,summary,month,week
0,0,Celebrity livestreams you can watch while soci...,fox,2020-04-05 23:55:51+00:00,en,US,REESE WITHERSPOON SAYS SHE'S TRYING 'TO BE PAT...,4,14
1,1,US stock futures rise and oil drops 8.5% after...,cnn,2020-04-05 23:49:03+00:00,en,US,US stock futures were up Sunday evening. But o...,4,14
2,2,Washington governor slams Trump administration...,cnn,2020-04-05 23:49:03+00:00,en,US,(CNN) Washington state Gov. Jay Inslee decried...,4,14
3,3,Rita Wilson performs for first time after coro...,cnn,2020-04-05 23:49:03+00:00,en,US,"Wilson was filmed singing ""The Star-Spangled B...",4,14
4,4,Lesson of Maurice Stokes-Jack Twyman bond reso...,nyp,2020-04-05 23:42:10+00:00,en,US,The celebration was going to culminate in the ...,4,14


In [19]:
# write the cleaned articles to data/df_news.csv, creating data/ if needed;
# the dataframe's row index is not written to the file
os.makedirs(name='data', exist_ok=True)
data.to_csv(path_or_buf='data/df_news.csv', index=False)

In [20]:
# count the # of articles in each month
# (Counter maps each value of the 'month' column to its number of rows)
from collections import Counter
Counter(data['month'])

Counter({4: 17702, 3: 15231, 2: 2171, 1: 736, 5: 12606, 7: 9706, 6: 8216})

In [21]:
# count the # of articles from each source
# (Counter maps each value of the 'source' column to its number of rows)
Counter(data['source'])

Counter({'fox': 18314,
         'cnn': 13425,
         'nyp': 13245,
         'nyt': 8423,
         'breit': 7469,
         'huff': 5492})

In [17]:
# count the # of articles from each source in each month
for source in ('cnn', 'fox', 'huff', 'breit', 'nyt', 'nyp'):
    # the source mask does not change across months, so build it once
    from_source = data['source'] == source
    for month in range(1, 8):
        in_month = data['month'] == month
        print(source, month, (from_source & in_month).sum())

cnn 1 232
cnn 2 589
cnn 3 2958
cnn 4 3293
cnn 5 2444
cnn 6 1789
cnn 7 2120
fox 1 156
fox 2 504
fox 3 3938
fox 4 5148
fox 5 3616
fox 6 2367
fox 7 2585
huff 1 17
huff 2 74
huff 3 1237
huff 4 1450
huff 5 1185
huff 6 701
huff 7 828
breit 1 93
breit 2 240
breit 3 1918
breit 4 2164
breit 5 1353
breit 6 777
breit 7 924
nyt 1 94
nyt 2 359
nyt 3 2117
nyt 4 2177
nyt 5 1730
nyt 6 849
nyt 7 1097
nyp 1 144
nyp 2 405
nyp 3 3063
nyp 4 3470
nyp 5 2278
nyp 6 1733
nyp 7 2152
