# Download storyID from Eikon for certain Tickers

## Imports

In [None]:
import datetime
import pandas as pd
import time
import numpy as np
import dateutil
from tqdm import tqdm
import glob
import os

## Path

In [None]:
# Working folder: the API key file, the ticker list and the "storyID" folder
# are all looked up relative to it. The trailing separator matters because
# file names are appended with plain string concatenation.
working_path = (
    "G:\\My Drive\\PhD\\Research\\Projects\\"
    "Portfolio Management and Sentiment views\\"
    "Data\\News\\Eikon\\test\\"
)

In [None]:
# Base folder for the crawler's output. There is no trailing separator here:
# the crawling loop appends "\\storyId\\" itself when it writes the daily CSVs.
output_data_path = (
    "G:\\My Drive\\PhD\\Research\\Projects\\"
    "Portfolio Management and Sentiment views\\"
    "Data\\News\\Eikon\\test"
)
#data_path = "C:\\Users\\cityu_local\\Downloads\\newsheadline_20190612_20200612_28_tickers"

## Eikon API

In [None]:
# Read the Eikon app key from a text file in the working folder.
# Surrounding whitespace is stripped: a key file saved with a trailing newline
# would otherwise hand "<key>\n" to ek.set_app_key().
with open(working_path + 'Eikon_Api_Key.txt', 'r') as f:
    cookie = f.read().strip()

In [None]:
# Import the Eikon Data API package and register the app key stored in `cookie`.
# Every later `ek.*` call in this notebook relies on this cell having run.
import eikon as ek
ek.set_app_key(cookie)

## Self-defined Functions

The following function converts an Eikon RIC code into its ISIN code.

In [None]:
def get_ISIN(ticker, verbose=False):
    """Look up the ISIN code of a RIC through ``ek.get_symbology``.

    If the first lookup comes back with an ``error`` column, the lookup is
    retried once with ``".?"`` appended to ``ticker``.

    Args:
        ticker: RIC code to convert.
        verbose: When True, print progress messages.

    Returns:
        The value of the ``ISIN`` column for the symbol that resolved, or
        ``''`` when both lookups report an error.
    """
    if verbose:
        print("Getting ISIN code of {}".format(ticker))
    res = ek.get_symbology(ticker,
                           from_symbol_type='RIC',
                           to_symbol_type='ISIN')
    if 'error' in res.columns:
        # Second attempt with the ".?" suffix; from here on `ticker` is the
        # suffixed symbol, which is also the row label of the second result.
        ticker = "{}.?".format(ticker)
        res = ek.get_symbology(ticker,
                               from_symbol_type='RIC',
                               to_symbol_type='ISIN')
        if 'error' in res.columns:
            return ''

    ISIN_code = res.loc[ticker, 'ISIN']
    if verbose:
        # Pass the plain values: wrapping them in braces would build sets and
        # print "{'AAPL.O'}" instead of "AAPL.O".
        print("ISIN Code of {}: {}".format(ticker, ISIN_code))
    return ISIN_code

The following function converts an ISIN code into its Eikon RIC code.

In [None]:
def get_RIC(ISIN_code, verbose=False):
    """Look up the RIC code of an ISIN through ``ek.get_symbology``.

    Args:
        ISIN_code: ISIN code to convert.
        verbose: When True, print progress messages.

    Returns:
        The value of the ``RIC`` column for ``ISIN_code``, or ``''`` when the
        lookup result contains an ``error`` column.
    """
    if verbose:
        print("Getting RIC code of {}".format(ISIN_code))
    res2 = ek.get_symbology(ISIN_code,
                            from_symbol_type='ISIN',
                            to_symbol_type='RIC')
    if 'error' in res2.columns:
        RIC_code = ''
    else:
        RIC_code = res2.loc[ISIN_code, 'RIC']
    if verbose:
        # Report the ISIN that was looked up; this function has no `ticker`
        # variable, so referring to one would raise NameError.
        print("RIC Code of {}: {}".format(ISIN_code, RIC_code))
    return RIC_code

## Stock Tickers

Read the list of tickers from a file. Make sure the path is set correctly.

In [None]:
# Load the ticker table (first CSV column becomes the index, every value is
# cast to str) and pull the 'ticker' column out as a plain Python list.
tickers = pd.read_csv(
    working_path + "myTickerList-Eikon.csv",
    index_col=0,
    encoding='utf-8',
).astype(str)
ticker_list = tickers['ticker'].tolist()
ticker_list

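Alternatively, hardcode the list here. Each ticker must still appear in the `tickers` table loaded above, because the crawling loop looks up its RIC code there.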

In [None]:
# Manual override of the tickers to crawl (replaces the list read from file).
ticker_list = [
    'AAPL',
    'GS',
    'PFE',
    'SBUX',
    'NEM',
]
ticker_list

In [None]:
# Display how many tickers will be crawled
len(ticker_list)

# Crawling the storyID from Eikon with the API

Define how many months of news you want to collect, counting back from today.

In [None]:
# Length of the collection window in months, counted back from the moment the
# next cell runs
months_to_collect=12

In [None]:
# Collection window as "YYYY-MM-DD" strings: from `months_to_collect` months
# ago up to today.
# The submodule is imported explicitly because a bare `import dateutil` does
# not guarantee that `dateutil.relativedelta` has been loaded.
import dateutil.relativedelta

now = datetime.datetime.now()
start_date = (now - dateutil.relativedelta.relativedelta(months=months_to_collect)).strftime("%Y-%m-%d")
end_date = now.strftime("%Y-%m-%d")

As the code below shows, the first date is the oldest: the code collects from the oldest date to the newest.

In [None]:
# Every timestamp from start_date to end_date (both included), oldest first,
# at pd.date_range's default frequency.
date_range = pd.date_range(start_date, end_date)
date_range

Read the storyID files that have already been downloaded in order to determine which days remain to be collected, so that the crawl does not start over every time the code is run. Make sure the path is set correctly.

In [None]:
# Days already on disk: each CSV in the "storyID" folder is named after the
# day it covers, so the file name without '.csv' is the date string.
crawled_date = [
    os.path.basename(csv_path).replace('.csv', '')
    for csv_path in glob.glob(os.path.join(working_path + "storyID\\", '*.csv'))
]
crawled_date

In [None]:
# Crawl the news headlines day by day, ticker by ticker, and write one CSV per
# day. Days whose CSV name is already listed in `crawled_date` are skipped.

# parameters
requests_limit = 100      # passed as `count` to ek.get_news_headlines; a response this large is treated as truncated
delta_hours = 24          # width (hours) of the first request window tried for each ticker and day
min_delta_hours = 1 / 60  # windows are never halved below this width (hours), so the retry loop always ends

# Make sure the folder the daily CSVs are written to exists before crawling.
# NOTE(review): `crawled_date` is read from working_path + "storyID\\"; both
# paths must point at the same folder for resuming to work - confirm.
story_dir = output_data_path + '\\storyId\\'
os.makedirs(story_dir, exist_ok=True)

# daily loop
for date in tqdm(date_range, desc='Daily Loop'):
    # NOTE(review): the original comment marks these timestamps as GMT - confirm
    date_from = date.strftime("%Y-%m-%dT%H:%M:%S")
    date_from_str = date.strftime("%Y-%m-%d")

    if date_from_str in crawled_date:
        print("We already crawled newsheadline for {}".format(date_from_str))
        continue

    print('Getting newsheadline for {}'.format(date_from_str))
    news_headlines = pd.DataFrame()

    for ticker in ticker_list:
        # RIC code of the ticker; raises IndexError if the ticker is missing
        # from the `tickers` table
        RIC_code = tickers['RIC'][tickers['ticker'] == ticker].values[0]
        print("####################################\nStart collecting ticker: {}".format(RIC_code))

        # start again from the widest window for every ticker
        delta_hours_current = delta_hours

        # API calls manager: split the day into equal windows and request each
        # one; whenever a response is full, halve the window and redo the day
        while True:
            frequence = str(delta_hours_current) + 'H'  # only used in the log line below
            periods = int(24 / delta_hours_current)     # pd.date_range expects an integer
            window = pd.Timedelta(hours=delta_hours_current)
            can_split = delta_hours_current / 2 >= min_delta_hours

            # set to True as soon as one response reaches the requests limit
            requests_limit_hit = False

            # headlines collected for the current ticker with the current window
            news_headlines_ticker = pd.DataFrame()

            # generate the start times of the request windows
            print("TRY delta_hours: {}, frequence: {}, periods {}: ".format(delta_hours_current, frequence, periods))
            day_time = pd.date_range(start=date, periods=periods, freq=window)
            print('list of data ranges to try:\n', day_time)

            for day_time_i in day_time:
                # bounds of the current request
                date_from_current_request = day_time_i.strftime("%Y-%m-%dT%H:%M:%S")
                date_to_current_request = (day_time_i + window).strftime("%Y-%m-%dT%H:%M:%S")

                print("current start from: {}, current to: {}".format(date_from_current_request, date_to_current_request))
                tmp = ek.get_news_headlines("R:{} IN ENGLISH".format(RIC_code),
                                            date_from=date_from_current_request,
                                            date_to=date_to_current_request,
                                            count=requests_limit)
                # pause after every request, including the ones that trigger a retry
                time.sleep(0.2)
                tmp['ticker'] = ticker
                print("received {} news for {}".format(tmp.shape[0], ticker))

                # a full response may hide further headlines in the same window
                if tmp.shape[0] >= requests_limit:
                    if can_split:
                        requests_limit_hit = True
                        print('!!!!!! REQUESTS LIMIT HIT !!!!!!')
                        # narrow the window and restart the day for this ticker
                        delta_hours_current = delta_hours_current / 2
                        break
                    # narrowest window reached: keep what was received instead
                    # of halving forever
                    print("WARNING: requests limit still hit at the minimum window for {} "
                          "({} - {}); headlines may be missing".format(
                              ticker, date_from_current_request, date_to_current_request))

                # append this window's headlines to the ticker's headlines
                news_headlines_ticker = pd.concat([news_headlines_ticker, tmp])

            # every window stayed below the limit: the ticker is complete
            if not requests_limit_hit:
                break

        # add the ticker's headlines to the day's headlines
        print("Add {} news for {} in {}".format(news_headlines_ticker.shape[0], ticker, date_from))
        news_headlines = pd.concat([news_headlines, news_headlines_ticker])
        time.sleep(0.2)

    # write the day's headlines
    print('We have crawled {} newsheadline for {}'.format(news_headlines.shape[0], date_from_str))
    news_headlines.to_csv(os.path.join(story_dir,
                                       '{}.csv'.format(date_from_str)), encoding='utf-8')
    time.sleep(0.2)