In [1]:
import os
import re

import json
import glob
import time
import pandas as pd
from collections import Counter
from tqdm import tqdm
from zipfile import ZipFile

In [2]:
# Input archive and extraction target, both located in the user's Downloads folder.
downloads_dir = os.path.join(os.path.expanduser('~'), 'Downloads')
zipfile_news = os.path.join(downloads_dir, 'archive.zip')
decompress_dir = os.path.join(downloads_dir, 'news')

In [3]:
# Extract the archive into the 'news' folder inside the Downloads folder.
# The folder check comes first so the archive is only opened when extraction
# is actually needed (the zip may have been deleted after an earlier run).
if os.path.exists(decompress_dir):
    print('Folder already exists. skip decompressing')
else:
    with ZipFile(zipfile_news, 'r') as zipObj:
        zipObj.extractall(decompress_dir)

Folder already exists. skip decompressing


In [4]:
# List every entry inside the extraction folder; built from `decompress_dir`
# rather than a hard-coded user path so the notebook runs on any machine.
lst_newsjson_dir = glob.glob(os.path.join(decompress_dir, '*'))

In [5]:
def get_words(text):
    """Split `text` into word tokens.

    A token is a maximal run of word characters (letters, digits and
    underscore); everything else acts as a separator.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the tokens in order of appearance (empty if none are found).
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    return re.findall(r'\w+', text)

In [16]:
# Load the news JSON files of the FIRST directory only and split each record
# into separate per-uuid dictionaries (news metadata, thread, social counts,
# entities, raw text).
# The inner loop stops right after the file with this 0-based index has been
# processed, i.e. at most MAX_FILE_INDEX + 1 files are loaded.
MAX_FILE_INDEX = 10000

for newsjson_dir in lst_newsjson_dir:
    lst_jsonfile = glob.glob(os.path.join(newsjson_dir, '*.json'))

    dic_all_news = {}
    dic_all_thread = {}
    dic_all_social = {}
    dic_all_entity = {}
    dic_all_news_text = {}
    for i, jsonfile in enumerate(tqdm(lst_jsonfile)):

        # Context manager so the file handle is closed after each read.
        with open(jsonfile, 'rt', encoding='utf-8') as f:
            dic_news = json.load(f)

        # Pull the nested / bulky parts out of the record; what remains in
        # dic_news is the flat news-level metadata.
        uuid = dic_news.pop('uuid')
        dic_thread = dic_news.pop('thread')
        news_text = dic_news.pop('text')

        # Word count = number of \w+ tokens in the article text.
        dic_news['word_cnt'] = len(get_words(news_text))

        dic_all_social[uuid] = dic_thread.pop('social')
        dic_all_thread[uuid] = dic_thread
        dic_all_entity[uuid] = dic_news.pop('entities')
        dic_all_news[uuid] = dic_news
        dic_all_news_text[uuid] = {'text': news_text}

        if i == MAX_FILE_INDEX:
            break

    # Only the first directory is processed.
    break

 17%|█▋        | 10000/57802 [00:30<02:27, 324.73it/s]


### News publication summary

In [93]:
# uuid -> publication date formatted as 'YYYY-MM-DD'
dic_uuid_date = {
    uuid: pd.to_datetime(dic_news['published']).strftime('%Y-%m-%d')
    for uuid, dic_news in dic_all_news.items()
}

# One [published, language] pair per article, in the same order as dic_all_news
lst_publish_date = [
    [dic_news['published'], dic_news['language']]
    for dic_news in dic_all_news.values()
]

In [88]:
# One row per article, with a constant `count` column of 1.
df_publish = pd.DataFrame(lst_publish_date)
df_publish.columns = ['pubdate', 'language']
df_publish['count'] = 1

# Index the rows by publication day, then drop the now-redundant pubdate column.
idx_pubdate = pd.PeriodIndex(df_publish['pubdate'], freq='D')
ts_publish_date = df_publish.set_index(idx_pubdate).drop(columns='pubdate')

#### Number of news articles by month

In [71]:
# Group the rows into monthly bins and count the non-null entries of each column per month
ts_publish_date.resample('M').count()

Unnamed: 0_level_0,language,count
pubdate,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-12,65,65
2018-01,9936,9936


#### Number of news articles by language

In [72]:
# Number of rows (one per loaded article) for each value of the language column
ts_publish_date['language'].value_counts()

english    10001
Name: language, dtype: int64

### Social media summary

In [94]:
# Accumulate social counters as pubdate -> network -> event type -> total.
dic_social_cnt = {}
for uuid, dic_social in dic_all_social.items():
    pubdate = dic_uuid_date[uuid]
    for sns, dic_event in dic_social.items():
        for event_type, event_cnt in dic_event.items():
            # Create the nested levels on first sight, then add to the running total.
            dic_sns_cnt = dic_social_cnt.setdefault(pubdate, {}).setdefault(sns, {})
            dic_sns_cnt[event_type] = dic_sns_cnt.get(event_type, 0) + event_cnt

In [159]:
# Build one small frame per publication date (rows: event type, columns: network),
# tagged with its date.
lst_df_social = []
for pubdate, dic_sns_event in dic_social_cnt.items():
    df_one_date = pd.DataFrame(dic_sns_event).assign(pubdate=pubdate)
    lst_df_social.append(df_one_date)

# Stack the per-date frames, move the event type from the index into a column,
# and index the result by publication day.
df_social = (
    pd.concat(lst_df_social)
    .rename_axis('event_type')
    .reset_index()
    .set_index('pubdate')
)
df_social.index = pd.PeriodIndex(df_social.index, freq='D')


In [149]:
# Preview the ten earliest rows by publication date
df_social.sort_index().iloc[:10]

Unnamed: 0_level_0,event_type,gplus,pinterest,vk,linkedin,facebook,stumbledupon
pubdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-12-07,comments,,,,,0,
2017-12-07,likes,,,,,0,
2017-12-07,shares,0.0,0.0,0.0,0.0,0,0.0
2017-12-08,shares,0.0,0.0,0.0,0.0,0,0.0
2017-12-08,likes,,,,,0,
2017-12-08,comments,,,,,0,
2017-12-14,shares,0.0,0.0,0.0,0.0,0,0.0
2017-12-14,likes,,,,,0,
2017-12-14,comments,,,,,0,
2017-12-15,shares,0.0,0.0,0.0,0.0,0,0.0


#### Number of shares by week

In [163]:
# Keep only the 'shares' rows, then total every numeric column per week
df_shares = df_social.loc[df_social['event_type'] == 'shares']
df_shares.resample('W').sum(numeric_only=True)

Unnamed: 0_level_0,gplus,pinterest,vk,linkedin,facebook,stumbledupon
pubdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-12-04/2017-12-10,0.0,0.0,0.0,0.0,0,0.0
2017-12-11/2017-12-17,0.0,0.0,0.0,0.0,0,0.0
2017-12-18/2017-12-24,0.0,0.0,0.0,0.0,0,0.0
2017-12-25/2017-12-31,0.0,0.0,0.0,0.0,0,0.0
2018-01-01/2018-01-07,0.0,53.0,7.0,10759.0,89138,2.0
2018-01-08/2018-01-14,0.0,7.0,3.0,2440.0,7464,0.0
2018-01-15/2018-01-21,0.0,19.0,2.0,822.0,9923,47.0
2018-01-22/2018-01-28,0.0,15.0,5.0,2009.0,13432,1.0
2018-01-29/2018-02-04,0.0,7.0,0.0,965.0,3770,0.0


### Thread summary

In [167]:
# One row per article thread, in the insertion order of dic_all_thread;
# show the first row as an example record.
lst_thread = list(dic_all_thread.values())
df_thread = pd.DataFrame(lst_thread)
df_thread.iloc[0]

site_full                                                  www.cnbc.com
main_image            https://fm.cnbc.com/applications/cnbc.com/reso...
site_section          http://www.cnbc.com/id/19746125/device/rss/rss...
section_title                               Top News and Analysis (pro)
url                   https://www.cnbc.com/2018/01/03/emerging-marke...
country                                                              US
domain_rank                                                       767.0
title                 Emerging markets are set for an even bigger ra...
performance_score                                                     0
site                                                           cnbc.com
participants_count                                                    0
title_full                                                             
spam_score                                                          0.0
site_type                                                       

In [168]:
# Frequency of each performance_score value
df_thread['performance_score'].value_counts()

0     9786
1       90
2       29
3       21
10      20
5       17
4       17
6        7
7        5
9        5
8        4
Name: performance_score, dtype: int64

In [169]:
# Frequency of each spam_score value
df_thread['spam_score'].value_counts()

0.000    8068
0.001     166
0.002      79
1.000      59
0.003      53
         ... 
0.803       1
0.818       1
0.202       1
0.428       1
0.889       1
Name: spam_score, Length: 591, dtype: int64

In [170]:
# Frequency of each site_type value
df_thread['site_type'].value_counts()

news     7913
blogs    2088
Name: site_type, dtype: int64

In [171]:
# Frequency of each site value
df_thread['site'].value_counts()

reuters.com    4488
cnbc.com       4464
wsj.com         683
fortune.com     366
Name: site, dtype: int64

In [172]:
# Frequency of each country value
df_thread['country'].value_counts()

US    10001
Name: country, dtype: int64

### Entity summary

In [174]:
# Key views of every article's entity dict (one element per article)
foo = [dic_entity.keys() for dic_entity in dic_all_entity.values()]

In [178]:
# Show the entities of the most recently loaded article. The key is looked up
# explicitly instead of relying on a `uuid` variable left over from an earlier
# loop, so this cell does not depend on which loop happened to run last.
sample_uuid = next(reversed(dic_all_entity))
dic_all_entity[sample_uuid]

{'persons': [{'name': 'fred katayama', 'sentiment': 'none'},
  {'name': 'macy', 'sentiment': 'none'}],
 'locations': [],
 'organizations': [{'name': 'j.c. penney', 'sentiment': 'negative'},
  {'name': 'sears', 'sentiment': 'negative'},
  {'name': 'reuters news agency | brand attribution guidelines | careers  reuters',
   'sentiment': 'none'},
  {'name': 'reuters.com', 'sentiment': 'none'},
  {'name': 'thomson reuters', 'sentiment': 'none'},
  {'name': 'reuters tv', 'sentiment': 'none'},
  {'name': 'reuters', 'sentiment': 'none'}]}