# Download news raw text from Eikon for a list of storyID

This notebook loads the previously downloaded Eikon news storyIds, connects to the Eikon API and requests the raw news text for each of them.

### Imports

In [1]:
import datetime
import pandas as pd
import time
import numpy as np
import dateutil
from tqdm import tqdm
import glob
import os
from IPython.display import HTML
from bs4 import BeautifulSoup
import pickle
import string
import random
import html5lib
import matplotlib
from matplotlib.ticker import PercentFormatter

### Paths setup

If working locally

In [2]:
# Local (Google Drive) folder that holds the Eikon news data.
# The trailing separator is kept so file names can be appended directly.
data_path = r"G:\My Drive\PhD\Research\Projects\Portfolio Management and Sentiment views\Data\News\Eikon" + "\\"

If working at the Eikon's workstation

In [3]:
# Download folder on the Eikon workstation.
# NOTE: this assignment overrides any data_path set by an earlier cell,
# so run it only when working at the workstation.
data_path = r"C:\Users\cityu_local\Downloads" + "\\"

### Eikon API setup

In [4]:
# Read the Eikon application key from a local text file so that it is not
# hard-coded in the notebook.
# Surrounding whitespace is stripped: a trailing newline left by a text editor
# would otherwise become part of the key passed to the API.
with open('Eikon_Api_Key.txt', 'r') as f:
    cookie = f.read().strip()

In [5]:
# The eikon package is imported here, next to the key setup that uses it.
import eikon as ek
# Register the application key (held in `cookie`, read from Eikon_Api_Key.txt) with the eikon package.
ek.set_app_key(cookie)

### Defining the tickers list to use

In [6]:
# Ticker universe; the first CSV column is used as the row index.
# Every column is cast to str, so the 'selected' flag is compared as text further down.
tickers_csv = "myTickerList-Eikon.csv"
tickers = pd.read_csv(tickers_csv, index_col=0, encoding='utf-8')
tickers = tickers.astype(str)
tickers

Unnamed: 0,ticker,company,exchange,market_cap,ISIN,RIC,selected
0,MSFT,Microsoft Corporation,NASDAQ,779000000000.0,US5949181045,MSFT.O,0
1,AAPL,Apple Inc,NASDAQ,744000000000.0,US0378331005,AAPL.O,1
2,AMZN,Amazoncom Inc,NASDAQ,714000000000.0,US0231351067,AMZN.O,1
3,GOOGL,Alphabet Inc,NASDAQ,712000000000.0,US02079K3059,GOOGL.O,1
4,FB,Facebook Inc,NASDAQ,383000000000.0,US30303M1027,FB.O,0
5,JNJ,Johnson & Johnson,NYSE,344000000000.0,US4781601046,JNJ,1
6,JPM,J P Morgan Chase & Co,NYSE,321000000000.0,US46625H1005,JPM,1
7,V,Visa Inc,NYSE,293000000000.0,US92826C8394,V,0
8,XOM,Exxon Mobil Corporation,NYSE,291000000000.0,US30231G1022,XOM,1
9,WMT,Walmart Inc,NYSE,254000000000.0,US9311421039,WMT,1


In [7]:
# Keep only the tickers flagged as selected ('selected' is text after the astype(str) cast).
is_selected = tickers['selected'].eq('1')
tickers = tickers.loc[is_selected]
tickers

Unnamed: 0,ticker,company,exchange,market_cap,ISIN,RIC,selected
1,AAPL,Apple Inc,NASDAQ,744000000000.0,US0378331005,AAPL.O,1
2,AMZN,Amazoncom Inc,NASDAQ,714000000000.0,US0231351067,AMZN.O,1
3,GOOGL,Alphabet Inc,NASDAQ,712000000000.0,US02079K3059,GOOGL.O,1
5,JNJ,Johnson & Johnson,NYSE,344000000000.0,US4781601046,JNJ,1
6,JPM,J P Morgan Chase & Co,NYSE,321000000000.0,US46625H1005,JPM,1
8,XOM,Exxon Mobil Corporation,NYSE,291000000000.0,US30231G1022,XOM,1
9,WMT,Walmart Inc,NYSE,254000000000.0,US9311421039,WMT,1
17,UNP,Union Pacific Corporation,NYSE,99230893092.0,US9078181081,UNP,1
19,LIN,Linde plc,NYSE,85982612516.0,IE00BZ12WP82,LIN,1
20,NEE,NextEra Energy Inc,NYSE,83597404902.0,US65339F1012,NEE,1


In [8]:
# Sanity check: (rows, columns) of the selected-ticker table.
tickers.shape

(11, 7)

### Loading the storyID

In [9]:
# Load the storyId table that was downloaded beforehand into the data folder.
storyId_csv = data_path + "storyId\\storyId.csv"
storyId = pd.read_csv(storyId_csv, encoding='utf-8')
storyId

Unnamed: 0,index,versionCreated,text,storyId,sourceCode,ticker,date
0,2019-06-12 23:22:40.536,2019-06-12 23:22:40.536000+00:00,Quisitive Technology Solutions Acquires 100% S...,urn:newsml:reuters.com:20190612:nNRA8y1imv:1,NS:DATMTR,MSFT,2019-06-12 00:00:00.000
1,2019-06-12 23:13:47.773,2019-06-12 23:13:47.773000+00:00,Reuters Insider - Cramer: Here's how to spot t...,urn:newsml:reuters.com:20190612:nRTV1KKFwz:1,NS:CNBC,MSFT,2019-06-12 00:00:00.000
2,2019-06-12 22:35:50.594,2019-06-12 22:35:50.594000+00:00,Microsoft Corporation - Microsoft announces qu...,urn:newsml:reuters.com:20190612:nNDL1XwdmZ:1,NS:PUBT,MSFT,2019-06-12 00:00:00.000
3,2019-06-12 22:21:42.000,2019-06-12 22:21:42+00:00,BRIEF-Microsoft Announces Quarterly Dividend,urn:newsml:reuters.com:20190612:nFWN23J0UQ:2,NS:RTRS,MSFT,2019-06-12 00:00:00.000
4,2019-06-12 22:21:14.000,2019-06-12 22:21:16.580000+00:00,MICROSOFT CORP <MSFT.O> SETS QUARTERLY DIVIDEN...,urn:newsml:reuters.com:20190612:nFWN23J0UQ:2,NS:RTRS,MSFT,2019-06-12 00:00:00.000
...,...,...,...,...,...,...,...
164870,2020-06-01 06:00:10.000,2020-06-01 06:00:10+00:00,REG - GS ActiveBeta US - Net Asset Value(s),urn:newsml:reuters.com:20200601:nRSA4576Oa:1,NS:LSE,GS,2020-06-01 00:00:00.000
164871,2020-06-01 06:00:05.000,2020-06-01 06:00:05+00:00,REG - Goldman Sachs ETF - Net Asset Value(s),urn:newsml:reuters.com:20200601:nRSA4469Oa:1,NS:LSE,GS,2020-06-01 00:00:00.000
164872,2020-06-09 23:20:42.895,2020-06-10 07:17:27.709000+00:00,Dow Jones Selected Stocks - June 10,urn:newsml:newsroom:20200609:nNRAc0ncwa:0,NS:AAP,GS,2020-06-09 00:00:00.000
164873,2020-06-10 14:38:20.000,2020-06-10 14:52:16+00:00,BREAKINGVIEWS-SoftBank could end up making lem...,urn:newsml:reuters.com:20200610:nL8N2DN53P:1,NS:RTRS,GS,2020-06-10 00:00:00.000


In [10]:
# Parse the two timestamp columns as datetimes.
for ts_col in ('index', 'versionCreated'):
    storyId[ts_col] = pd.to_datetime(storyId[ts_col])
# 'date' is truncated to midnight so that it identifies the calendar day only.
storyId['date'] = pd.to_datetime(storyId['date']).dt.normalize()
storyId

Unnamed: 0,index,versionCreated,text,storyId,sourceCode,ticker,date
0,2019-06-12 23:22:40.536,2019-06-12 23:22:40.536000+00:00,Quisitive Technology Solutions Acquires 100% S...,urn:newsml:reuters.com:20190612:nNRA8y1imv:1,NS:DATMTR,MSFT,2019-06-12
1,2019-06-12 23:13:47.773,2019-06-12 23:13:47.773000+00:00,Reuters Insider - Cramer: Here's how to spot t...,urn:newsml:reuters.com:20190612:nRTV1KKFwz:1,NS:CNBC,MSFT,2019-06-12
2,2019-06-12 22:35:50.594,2019-06-12 22:35:50.594000+00:00,Microsoft Corporation - Microsoft announces qu...,urn:newsml:reuters.com:20190612:nNDL1XwdmZ:1,NS:PUBT,MSFT,2019-06-12
3,2019-06-12 22:21:42.000,2019-06-12 22:21:42+00:00,BRIEF-Microsoft Announces Quarterly Dividend,urn:newsml:reuters.com:20190612:nFWN23J0UQ:2,NS:RTRS,MSFT,2019-06-12
4,2019-06-12 22:21:14.000,2019-06-12 22:21:16.580000+00:00,MICROSOFT CORP <MSFT.O> SETS QUARTERLY DIVIDEN...,urn:newsml:reuters.com:20190612:nFWN23J0UQ:2,NS:RTRS,MSFT,2019-06-12
...,...,...,...,...,...,...,...
164870,2020-06-01 06:00:10.000,2020-06-01 06:00:10+00:00,REG - GS ActiveBeta US - Net Asset Value(s),urn:newsml:reuters.com:20200601:nRSA4576Oa:1,NS:LSE,GS,2020-06-01
164871,2020-06-01 06:00:05.000,2020-06-01 06:00:05+00:00,REG - Goldman Sachs ETF - Net Asset Value(s),urn:newsml:reuters.com:20200601:nRSA4469Oa:1,NS:LSE,GS,2020-06-01
164872,2020-06-09 23:20:42.895,2020-06-10 07:17:27.709000+00:00,Dow Jones Selected Stocks - June 10,urn:newsml:newsroom:20200609:nNRAc0ncwa:0,NS:AAP,GS,2020-06-09
164873,2020-06-10 14:38:20.000,2020-06-10 14:52:16+00:00,BREAKINGVIEWS-SoftBank could end up making lem...,urn:newsml:reuters.com:20200610:nL8N2DN53P:1,NS:RTRS,GS,2020-06-10


In [11]:
# Confirm that 'index', 'versionCreated' and 'date' now carry datetime dtypes.
storyId.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164875 entries, 0 to 164874
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype              
---  ------          --------------   -----              
 0   index           164875 non-null  datetime64[ns]     
 1   versionCreated  164875 non-null  datetime64[ns, UTC]
 2   text            164875 non-null  object             
 3   storyId         164875 non-null  object             
 4   sourceCode      164875 non-null  object             
 5   ticker          164875 non-null  object             
 6   date            164875 non-null  datetime64[ns]     
dtypes: datetime64[ns, UTC](1), datetime64[ns](2), object(4)
memory usage: 8.8+ MB


### Keeping only the storyId for tickers in the tickers list

In [12]:
# Count how many stories belong (True) or do not belong (False) to the selected tickers.
in_ticker_list = storyId['ticker'].isin(tickers['ticker'].tolist())
in_ticker_list.value_counts()

False    88623
True     76252
Name: ticker, dtype: int64

In [13]:
# Restrict the stories to the selected tickers and renumber the rows from 0.
selected_tickers = tickers['ticker'].tolist()
storyId = storyId.loc[storyId['ticker'].isin(selected_tickers)].reset_index(drop=True)
storyId

Unnamed: 0,index,versionCreated,text,storyId,sourceCode,ticker,date
0,2019-06-12 23:00:11.983,2019-06-12 23:00:11.983000+00:00,Amazon Boosts Cashierless Initiatives With 2nd...,urn:newsml:reuters.com:20190612:nNRA8y19v4:1,NS:ZACKSC,AMZN,2019-06-12
1,2019-06-12 21:57:27.059,2019-06-12 21:57:27.059000+00:00,Reuters Insider - Facebook is sinking back int...,urn:newsml:reuters.com:20190612:nRTV1vZVFx:1,NS:CNBC,AMZN,2019-06-12
2,2019-06-12 21:32:15.868,2019-06-12 21:32:15.868000+00:00,Reuters Insider - The question is how will tar...,urn:newsml:reuters.com:20190612:nRTV1cdVkS:1,NS:CNBC,AMZN,2019-06-12
3,2019-06-12 21:27:02.000,2019-06-12 21:27:02+00:00,Republican senator criticizes potential dual U...,urn:newsml:reuters.com:20190612:nL2N23J1K6:3,NS:RTRS,AMZN,2019-06-12
4,2019-06-12 20:27:24.000,2019-06-12 20:27:24+00:00,Pompeo to push in India for more U.S. access t...,urn:newsml:reuters.com:20190612:nL2N23J1FG:5,NS:RTRS,AMZN,2019-06-12
...,...,...,...,...,...,...,...
76247,2020-02-18 04:21:35.000,2020-02-18 20:20:37+00:00,UPDATE 9-Oil near flat; virus impact offsets L...,urn:newsml:reuters.com:20200218:nL4N2AI139:16,NS:RTRS,AAPL,2020-02-18
76248,2020-04-30 07:20:46.999,2020-04-30 07:20:49.299000+00:00,Dow Jones Selected Stocks 1710 - April 30,urn:newsml:newsroom:20200430:nNRAbo9mnf:0,NS:AAP,AAPL,2020-04-30
76249,2020-04-30 07:03:12.228,2020-04-30 07:03:12.228000+00:00,"Apple grew by 78% in Q1 2020 in India, major c...",urn:newsml:reuters.com:20200430:nNRAbo9gve:1,NS:HINDUT,AAPL,2020-04-30
76250,2020-04-30 07:00:00.000,2020-04-30 07:00:00+00:00,"EXCLUSIVE-Finance, tech firms on hiring spree ...",urn:newsml:reuters.com:20200430:nL4N2CH2DP:1,NS:RTRS,AAPL,2020-04-30


In [14]:
# Re-check the row count and dtypes after restricting to the selected tickers.
storyId.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76252 entries, 0 to 76251
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   index           76252 non-null  datetime64[ns]     
 1   versionCreated  76252 non-null  datetime64[ns, UTC]
 2   text            76252 non-null  object             
 3   storyId         76252 non-null  object             
 4   sourceCode      76252 non-null  object             
 5   ticker          76252 non-null  object             
 6   date            76252 non-null  datetime64[ns]     
dtypes: datetime64[ns, UTC](1), datetime64[ns](2), object(4)
memory usage: 4.1+ MB


In [15]:
# Stories per ticker (column 0) and the same count divided by 365, i.e. a rough per-day average (column 1).
stories_per_ticker = storyId.groupby(['ticker']).size()
pd.concat([stories_per_ticker, stories_per_ticker / 365], axis=1)

Unnamed: 0_level_0,0,1
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
AAPL,12289,33.668493
AMT,625,1.712329
AMZN,15754,43.161644
GOOGL,8821,24.167123
JNJ,4233,11.59726
JPM,20737,56.813699
LIN,1199,3.284932
NEE,1087,2.978082
UNP,985,2.69863
WMT,6020,16.493151


### Deleting the dates that do not overlap for the 5 tickers and 28 tickers samples

For the initial 5 tickers we collected the storyIds starting from 2019-6-9, whereas for the remaining 28 we started from 2019-6-12. The end date for the initial 5 tickers was 2020-6-12; for the remaining 28 it is 2020-6-16.<br>The overlapping dates therefore run from 2019-6-12 to 2020-6-12.

In [16]:
# Bounds (both inclusive) of the date window shared by the two ticker samples.
start_date = datetime.datetime(year=2019, month=6, day=12)
end_date = datetime.datetime(year=2020, month=6, day=12)

In [17]:
# Number of stories dated before the start of the window.
len(storyId.loc[storyId['date'] < start_date])

85

In [18]:
# Number of stories dated after the end of the window.
len(storyId.loc[storyId['date'] > end_date])

386

In [19]:
# Keep only the stories whose date lies inside the window (both bounds inclusive).
in_window = storyId['date'].between(start_date, end_date)
storyId = storyId[in_window]
storyId

Unnamed: 0,index,versionCreated,text,storyId,sourceCode,ticker,date
0,2019-06-12 23:00:11.983,2019-06-12 23:00:11.983000+00:00,Amazon Boosts Cashierless Initiatives With 2nd...,urn:newsml:reuters.com:20190612:nNRA8y19v4:1,NS:ZACKSC,AMZN,2019-06-12
1,2019-06-12 21:57:27.059,2019-06-12 21:57:27.059000+00:00,Reuters Insider - Facebook is sinking back int...,urn:newsml:reuters.com:20190612:nRTV1vZVFx:1,NS:CNBC,AMZN,2019-06-12
2,2019-06-12 21:32:15.868,2019-06-12 21:32:15.868000+00:00,Reuters Insider - The question is how will tar...,urn:newsml:reuters.com:20190612:nRTV1cdVkS:1,NS:CNBC,AMZN,2019-06-12
3,2019-06-12 21:27:02.000,2019-06-12 21:27:02+00:00,Republican senator criticizes potential dual U...,urn:newsml:reuters.com:20190612:nL2N23J1K6:3,NS:RTRS,AMZN,2019-06-12
4,2019-06-12 20:27:24.000,2019-06-12 20:27:24+00:00,Pompeo to push in India for more U.S. access t...,urn:newsml:reuters.com:20190612:nL2N23J1FG:5,NS:RTRS,AMZN,2019-06-12
...,...,...,...,...,...,...,...
76247,2020-02-18 04:21:35.000,2020-02-18 20:20:37+00:00,UPDATE 9-Oil near flat; virus impact offsets L...,urn:newsml:reuters.com:20200218:nL4N2AI139:16,NS:RTRS,AAPL,2020-02-18
76248,2020-04-30 07:20:46.999,2020-04-30 07:20:49.299000+00:00,Dow Jones Selected Stocks 1710 - April 30,urn:newsml:newsroom:20200430:nNRAbo9mnf:0,NS:AAP,AAPL,2020-04-30
76249,2020-04-30 07:03:12.228,2020-04-30 07:03:12.228000+00:00,"Apple grew by 78% in Q1 2020 in India, major c...",urn:newsml:reuters.com:20200430:nNRAbo9gve:1,NS:HINDUT,AAPL,2020-04-30
76250,2020-04-30 07:00:00.000,2020-04-30 07:00:00+00:00,"EXCLUSIVE-Finance, tech firms on hiring spree ...",urn:newsml:reuters.com:20200430:nL4N2CH2DP:1,NS:RTRS,AAPL,2020-04-30


In [20]:
# Summary of the remaining dates; the first/last values should match start_date and end_date.
storyId['date'].describe()

count                   75781
unique                    367
top       2019-12-03 00:00:00
freq                      794
first     2019-06-12 00:00:00
last      2020-06-12 00:00:00
Name: date, dtype: object

### Creating a list of dates to iter over

In [21]:
# One row per distinct calendar date present in storyId.
date_list = pd.DataFrame(pd.to_datetime(storyId['date'].dt.date.unique()), columns=['date'])
# sort_values returns a new frame, so the result is assigned back. Without the
# assignment date_list keeps the order in which the dates first appear in
# storyId, and the download loop would not walk the dates chronologically.
date_list = date_list.sort_values(by=['date'])
date_list

Unnamed: 0,date
0,2019-06-12
1,2019-06-13
2,2019-06-14
3,2019-06-15
4,2019-06-16
...,...
362,2020-06-08
363,2020-06-09
364,2020-06-10
365,2020-06-11


2020 is a leap year (366 days), plus one extra date (2020-6-12), which is the first day of the next solar year; hence the 367 unique dates.

In [22]:
# Keep only the dates strictly after 2020-02-02.
# NOTE(review): presumably the earlier dates were already downloaded in a previous run - confirm.
resume_after = datetime.datetime(2020, 2, 2)
date_list = date_list.loc[date_list['date'].gt(resume_after)]
date_list

Unnamed: 0,date
236,2020-02-03
237,2020-02-04
238,2020-02-05
239,2020-02-06
240,2020-02-07
...,...
362,2020-06-08
363,2020-06-09
364,2020-06-10
365,2020-06-11


In [23]:
# Number of dates left in date_list and the range they cover.
date_list.describe()

Unnamed: 0,date
count,131
unique,131
top,2020-03-13 00:00:00
freq,1
first,2020-02-03 00:00:00
last,2020-06-12 00:00:00


### Creating date groups in storyId

In [24]:
# Group the stories by calendar date so that each date's storyIds can be retrieved with get_group().
date_group = storyId.groupby('date')

### Collecting the news raw text

In [None]:
# Download the raw text of every story, one date at a time, and write one CSV
# file per date (columns: storyId, story; no header row, no index column).
# Stories that cannot be retrieved are kept in the file with an empty story.

# Upper bound on the number of story requests issued by one run of this cell.
API_REQUEST_LIMIT = 10000

# Create the output folder up front so that a missing folder cannot make the
# write fail after a whole date's worth of API requests has been spent.
story_dir = data_path + 'story\\'
os.makedirs(story_dir, exist_ok=True)

# j counts the API requests issued so far, across all dates.
j = 0
for row in tqdm(date_list.itertuples(), total=len(date_list), desc = 'dates list'):
    # The Timestamp selects the group; its string form is used for messages and the file name.
    date = row.date.strftime('%Y-%m-%d')
    storyId_in_date = date_group.get_group(row.date)['storyId']

    # Stop before starting a date that would push this run over the request limit.
    # The message is printed before the break so that it is actually shown.
    if (j + storyId_in_date.shape[0]) > API_REQUEST_LIMIT:
        print('hit limit of {} API requests'.format(j))
        break

    print('########################## start collecting {} news for the date {}'.format(storyId_in_date.shape[0], date))

    # Rows for the current date are collected in a list and turned into a
    # DataFrame once, instead of growing a DataFrame with concat on every story.
    records = []
    # The loop variable is named story_id so that it does not overwrite the
    # storyId DataFrame used by the earlier cells.
    for i, story_id in enumerate(tqdm(storyId_in_date, desc = 'storyID Loop')):
        j += 1
        try:
            html = ek.get_news_story(story_id)
        except Exception:
            # Exception (not a bare except) so that a manual interrupt still stops the download.
            print('!!!!!!!!!!!!!!!!!!!!! story {} story-date {} story Id: {} NOT FOUND'.format(j, i, story_id))
            # Keep the story in the output with a missing text.
            records.append({'storyId': story_id, 'story': np.nan})
            continue
        # Extract the plain text from the returned HTML.
        story = BeautifulSoup(html, 'html5lib').get_text()
        print('story {} story-date {} story Id: {} story: {}'.format(j, i, story_id, story[0:50]))
        records.append({'storyId': story_id, 'story': story})

    # Write the stories collected for the current date.
    news_text = pd.DataFrame(records, columns = ['storyId', 'story'])
    news_text.to_csv(story_dir + '{}.csv'.format(date), header=False, index=False, encoding = 'utf-8')