In [26]:
import os
import re

import json
import glob
import time
import pickle
import pandas as pd
from collections import Counter
from tqdm import tqdm
from zipfile import ZipFile

In [2]:
# Both paths live in the current user's Downloads folder.
downloads_dir = os.path.join(os.path.expanduser('~'), 'Downloads')
zipfile_news = os.path.join(downloads_dir, 'archive.zip')  # archive to unpack
decompress_dir = os.path.join(downloads_dir, 'news')       # extraction target

In [3]:
# Extract the archive into the 'news' folder under Downloads.
# The destination is checked first so the archive is only opened when an
# extraction is actually needed; a re-run therefore still works after the
# archive itself has been removed.
if os.path.exists(decompress_dir):
    print('Folder already exists. skip decompressing')
else:
    with ZipFile(zipfile_news, 'r') as zipObj:
        zipObj.extractall(decompress_dir)

Folder already exists. skip decompressing


In [2]:
# Collect every entry directly under the news data root; the loading loop
# below globs '*.json' inside each of these entries.
# Alternative location (the folder under the user's Downloads directory):
# lst_newsjson_dir = glob.glob(r'C:\Users\byeun\Downloads\news\*')

# Data root on the high-spec desktop PC.
# NOTE(review): hard-coded absolute path that differs from `decompress_dir`
# defined earlier in this notebook -- confirm which copy should be read.
lst_newsjson_dir = glob.glob(r'D:\data\kaggle_news\*')


In [3]:
# Compiled once at definition time; the raw string avoids the invalid
# escape sequence warning that the plain literal '\w+' triggers.
_WORD_PATTERN = re.compile(r'\w+')


def get_words(text):
    """Split ``text`` into word tokens.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str
        Every maximal run of ``\\w`` characters (letters, digits and
        underscore) in order of appearance; an empty list when there are none.
    """
    return _WORD_PATTERN.findall(text)

In [24]:
# Per-article lookup tables, all keyed by the article's uuid.
dic_all_news = {}       # remaining top-level fields plus 'word_cnt'
dic_all_thread = {}     # the 'thread' sub-dict without its 'social' entry
dic_all_social = {}     # the 'social' sub-dict taken out of 'thread'
dic_all_entity = {}     # the 'entities' field
dic_all_news_text = {}  # {'text': <article text>}

for newsjson_dir in lst_newsjson_dir:
    # os.path.join keeps the pattern valid on any OS, not only Windows.
    lst_jsonfile = glob.glob(os.path.join(newsjson_dir, '*.json'))

    for jsonfile in tqdm(lst_jsonfile):
        # Close each file right after parsing; leaving hundreds of
        # thousands of handles to the garbage collector can exhaust them.
        with open(jsonfile, 'rt', encoding='utf-8') as fp:
            dic_news = json.load(fp)

        uuid = dic_news.pop('uuid')

        # Move the bulky / nested parts out of the news record.
        dic_thread = dic_news.pop('thread')
        news_text = dic_news.pop('text')
        dic_all_entity[uuid] = dic_news.pop('entities')
        dic_all_social[uuid] = dic_thread.pop('social')
        dic_all_thread[uuid] = dic_thread
        dic_all_news_text[uuid] = {'text': news_text}

        # Keep only the word count of the text alongside the metadata.
        dic_news['word_cnt'] = len(get_words(news_text))
        dic_all_news[uuid] = dic_news

100%|██████████| 57802/57802 [16:26<00:00, 58.57it/s]
100%|██████████| 64592/64592 [17:39<00:00, 60.94it/s]
100%|██████████| 57456/57456 [15:24<00:00, 62.13it/s]
100%|██████████| 63245/63245 [17:06<00:00, 61.62it/s] 
100%|██████████| 63147/63147 [17:09<00:00, 61.31it/s]


In [25]:
# Number of distinct uuids loaded into dic_all_news.
len(dic_all_news)

306188

In [29]:
# Directory on the data drive (same path as the raw news data root).
# NOTE(review): this variable is not referenced by any cell visible in this
# notebook -- the pickle files are written with relative file names. Confirm
# whether they were meant to be saved under this directory.
processed_dir = r'D:\data\kaggle_news'

In [28]:
# Save dic_all_news, dic_all_thread, dic_all_social, dic_all_entity and
# dic_all_news_text as pickle files. The file names are relative, so the
# files are written to the current working directory.
dic_pickle_targets = {
    'dic_all_news.pkl': dic_all_news,
    'dic_all_thread.pkl': dic_all_thread,
    'dic_all_social.pkl': dic_all_social,
    'dic_all_entity.pkl': dic_all_entity,
    'dic_all_news_text.pkl': dic_all_news_text,
}

for pkl_name, pkl_obj in dic_pickle_targets.items():
    # The context manager flushes and closes each file deterministically.
    with open(pkl_name, 'wb') as fp:
        pickle.dump(pkl_obj, fp)

### News publication summary

In [7]:
# dic_uuid_date    : uuid -> publication day formatted as 'YYYY-MM-DD'
# lst_publish_date : one [published, language] pair per article
dic_uuid_date = {}
lst_publish_date = []
for uuid, dic_news in dic_all_news.items():
    published = dic_news['published']
    lst_publish_date.append([published, dic_news['language']])
    dic_uuid_date[uuid] = pd.to_datetime(published).strftime('%Y-%m-%d')

In [8]:
# One row per article with a constant 'count' column of 1.
ts_publish_date = (
    pd.DataFrame(lst_publish_date)
    .set_axis(['pubdate', 'language'], axis=1)
    .assign(count=1)
)

# Index the rows by daily period, then drop the now-redundant pubdate column.
daily_period = pd.PeriodIndex(ts_publish_date['pubdate'], freq='D')
ts_publish_date = ts_publish_date.set_index(daily_period).drop(columns='pubdate')

#### Number of news by month

In [9]:
# Count the rows per calendar month by resampling the daily period index
# to monthly periods.
ts_publish_date.resample('M').count()

Unnamed: 0_level_0,language,count
pubdate,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-12,75,75
2018-01,57704,57704
2018-02,23,23


#### Number of news by language

In [10]:
# Number of rows for each distinct value of the 'language' column.
ts_publish_date['language'].value_counts()

language
english    57802
Name: count, dtype: int64

### Social media summary

In [11]:
# Accumulate event counts into a nested mapping:
#   publication date -> social network -> event type -> summed count
dic_social_cnt = {}
for uuid, dic_social in dic_all_social.items():
    pubdate = dic_uuid_date[uuid]
    for sns, dic_event in dic_social.items():
        for event_type, event_cnt in dic_event.items():
            # Create the intermediate levels only when an event exists.
            dic_sns_cnt = dic_social_cnt.setdefault(pubdate, {}).setdefault(sns, {})
            # A missing or falsy running total restarts from integer 0.
            dic_sns_cnt[event_type] = (dic_sns_cnt.get(event_type) or 0) + event_cnt

In [12]:
# One frame per publication date: rows are event types, columns are the
# social networks, plus a 'pubdate' column carrying the date.
lst_df_social = []
for pubdate, x in dic_social_cnt.items():
    df = pd.DataFrame(x).assign(pubdate=pubdate)
    lst_df_social.append(df)

# Stack the frames, expose the event type as a regular column and index
# the result by publication date.
df_social = (
    pd.concat(lst_df_social)
    .rename_axis('event_type')
    .reset_index()
    .set_index('pubdate')
)
# Convert the date-string index into daily periods.
df_social.index = pd.PeriodIndex(df_social.index, freq='D')


In [13]:
# Preview the first ten rows after sorting by the pubdate index.
df_social.sort_index().head(10)

Unnamed: 0_level_0,event_type,gplus,pinterest,vk,linkedin,facebook,stumbledupon
pubdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-12-07,comments,,,,,0,
2017-12-07,likes,,,,,0,
2017-12-07,shares,0.0,0.0,0.0,0.0,0,0.0
2017-12-08,shares,0.0,0.0,0.0,0.0,0,0.0
2017-12-08,comments,,,,,0,
2017-12-08,likes,,,,,0,
2017-12-10,comments,,,,,0,
2017-12-10,likes,,,,,0,
2017-12-10,shares,0.0,0.0,0.0,0.0,0,0.0
2017-12-13,likes,,,,,0,


#### Number of shares by week

In [14]:
# Keep only the 'shares' rows and sum each numeric column per week.
df_social[df_social.event_type == 'shares'].resample('W').sum(numeric_only=True)

Unnamed: 0_level_0,gplus,pinterest,vk,linkedin,facebook,stumbledupon
pubdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-12-04/2017-12-10,0.0,0.0,0.0,0.0,0,0.0
2017-12-11/2017-12-17,0.0,0.0,0.0,0.0,0,0.0
2017-12-18/2017-12-24,0.0,0.0,0.0,0.0,0,0.0
2017-12-25/2017-12-31,0.0,0.0,0.0,0.0,0,0.0
2018-01-01/2018-01-07,0.0,54.0,7.0,11129.0,95653,2.0
2018-01-08/2018-01-14,0.0,89.0,14.0,17916.0,121651,6.0
2018-01-15/2018-01-21,0.0,122.0,5.0,17728.0,102892,63.0
2018-01-22/2018-01-28,0.0,115.0,5.0,18063.0,129129,1.0
2018-01-29/2018-02-04,0.0,99.0,2.0,12879.0,62587,0.0


### Thread summary

In [15]:
# One row per article, built from the stored thread dicts.
lst_thread = list(dic_all_thread.values())
df_thread = pd.DataFrame(lst_thread)

# Show the first row as an example of the available fields.
df_thread.iloc[0]

site_full                                                  www.cnbc.com
main_image            https://fm.cnbc.com/applications/cnbc.com/reso...
site_section          http://www.cnbc.com/id/19746125/device/rss/rss...
section_title                               Top News and Analysis (pro)
url                   https://www.cnbc.com/2018/01/03/emerging-marke...
country                                                              US
domain_rank                                                       767.0
title                 Emerging markets are set for an even bigger ra...
performance_score                                                     0
site                                                           cnbc.com
participants_count                                                    0
title_full                                                             
spam_score                                                          0.0
site_type                                                       

In [16]:
# Number of rows for each distinct performance_score value.
df_thread.performance_score.value_counts()

performance_score
0     56933
1       341
2       151
10      100
3        89
4        60
5        41
6        30
8        20
7        20
9        17
Name: count, dtype: int64

In [17]:
# Number of rows for each distinct spam_score value.
df_thread.spam_score.value_counts()

spam_score
0.000    39798
0.001     1393
0.002      729
1.000      618
0.003      485
         ...  
0.754        2
0.549        1
0.674        1
0.395        1
0.843        1
Name: count, Length: 1000, dtype: int64

In [18]:
# Number of rows for each distinct site_type value.
df_thread.site_type.value_counts()

site_type
news     55714
blogs     2088
Name: count, dtype: int64

In [19]:
# Number of rows for each distinct site value.
df_thread.site.value_counts()

site
reuters.com    37262
cnbc.com       16019
wsj.com         3412
fortune.com     1109
Name: count, dtype: int64

In [20]:
# Number of rows for each distinct country value.
df_thread.country.value_counts()

country
US    57802
Name: count, dtype: int64

### Entity summary

In [21]:
# Key view of every entity record, in dictionary order.
foo = [dic_entity.keys() for dic_entity in dic_all_entity.values()]

In [22]:
# Show the entity record stored under the last key of dic_all_entity.
# The key is looked up explicitly rather than through the `uuid` variable
# left behind by an earlier loop, which only exists if those cells have
# already been run in this kernel session.
sample_uuid = next(reversed(dic_all_entity))
dic_all_entity[sample_uuid]

{'persons': [{'name': 'jia', 'sentiment': 'none'},
  {'name': 'leshi', 'sentiment': 'none'},
  {'name': 'jia yueting', 'sentiment': 'none'}],
 'locations': [{'name': 'china', 'sentiment': 'none'},
  {'name': 'sunac china', 'sentiment': 'none'},
  {'name': 'leshi', 'sentiment': 'none'},
  {'name': 'hong kong', 'sentiment': 'none'},
  {'name': 'shenzhen', 'sentiment': 'none'}],
 'organizations': [{'name': 'leshi internet', 'sentiment': 'negative'},
  {'name': 'leeco', 'sentiment': 'negative'},
  {'name': 'reuters staff', 'sentiment': 'none'},
  {'name': 'leshi internet information & technology', 'sentiment': 'none'},
  {'name': 'reuters', 'sentiment': 'none'}]}