In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs
from pandas.io.json import json_normalize
import urllib.request as urlreq
import json
import time, sys
from IPython.display import clear_output
import datetime as dt
from yahoofinancials import YahooFinancials
import os

def update_progress(progress, date_str):
    """Redraw a 90-character text progress bar in the cell output.

    Parameters
    ----------
    progress : int or float
        Completed fraction. Ints are converted to float, any other
        non-float value is treated as 0, and the result is clamped to
        the range [0, 1].
    date_str : str
        Text printed after ``Status:`` (callers pass a date string or an
        elapsed-time string).
    """
    bar_length = 90

    # Normalise the input type: ints become floats, anything that is not
    # a float afterwards counts as "no progress".
    if isinstance(progress, int):
        fraction = float(progress)
    elif isinstance(progress, float):
        fraction = progress
    else:
        fraction = 0

    # Clamp to [0, 1].
    if fraction < 0:
        fraction = 0
    elif fraction >= 1:
        fraction = 1

    filled = int(round(bar_length * fraction))
    bar = "#" * filled + "-" * (bar_length - filled)

    # Clear the previously printed bar before printing the refreshed one.
    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}% Status: {2}".format(bar, fraction * 100, date_str))


def scrape_table_helper(earnings_soup):
    """Convert one page of the earnings-calendar HTML table into a DataFrame.

    Parameters
    ----------
    earnings_soup : parsed HTML element
        Must expose ``find``; it has to contain the header row (the ``tr``
        tagged ``data-reactid="19"``) and a ``tbody`` holding the data rows.

    Returns
    -------
    pandas.DataFrame
        One column per header cell and one row per ``tr`` in the ``tbody``
        that has at least one ``td``. Values are the raw cell text.

    Raises
    ------
    AttributeError
        If the header row or the ``tbody`` is not found (``find`` returns
        ``None``).

    Notes
    -----
    Rows whose cell count differs from the header count used to crash
    (IndexError for surplus cells, ragged columns for missing cells).
    Missing trailing cells are now filled with ``None`` and surplus cells
    are ignored, so one malformed row no longer fails the whole page.
    """
    # NOTE(review): the header row is located through a hard-coded
    # data-reactid, which presumably changes whenever the page markup does.
    header_row = earnings_soup.find('tr', {'data-reactid': '19'})
    headers = [th.text for th in header_row.find_all('th')]
    columns = {header: [] for header in headers}

    for row in earnings_soup.find('tbody').find_all('tr'):
        cells = [td.text for td in row.find_all('td')]

        # Rows without any <td> contribute nothing.
        if not cells:
            continue

        # Walk the headers (not the cells) so every column receives exactly
        # one value per row, whatever the row's own length is.
        for position, header in enumerate(headers):
            columns[header].append(cells[position] if position < len(cells) else None)

    return pd.DataFrame(columns)

def scrape_table(earnings_url, timeout = 30):
    """Scrape every result page behind one earnings-calendar URL.

    Parameters
    ----------
    earnings_url : str
        Calendar URL for a single day. Further pages are requested by
        appending ``&offset=<n>&size=100`` to it.
    timeout : int or float, optional
        Seconds passed to ``requests.get`` for each page, so a stalled
        connection raises instead of blocking forever.

    Returns
    -------
    pandas.DataFrame or bool
        All pages concatenated, de-duplicated and re-indexed; ``False``
        when the result counter on the first page is not numeric.

    Raises
    ------
    AttributeError
        If the ``fin-cal-table`` container or the result counter is missing
        from a page (``find`` returns ``None``).
    requests.exceptions.RequestException
        On network failure or timeout.
    """
    page_size = 100

    def _fetch_calendar_div(url):
        # Download one page and return its ``fin-cal-table`` container.
        response = requests.get(url, timeout = timeout)
        return bs(response.text, 'lxml').find("div", {"id": "fin-cal-table"})

    # The original called ``earnings_url.format(dt, 0)`` here, feeding the
    # datetime module into a URL without placeholders; the URL is used as is.
    first_page = _fetch_calendar_div(earnings_url)

    # The counter text is stripped of the word "results" and its last
    # space-separated token is taken as the total number of rows.
    total_names = first_page.find('span', {'data-reactid':'8'}).text.replace('results','').strip().split(' ')[-1]

    if not total_names.isnumeric():
        return False
    total_names = int(total_names)

    earnings_dfs = [scrape_table_helper(first_page)]

    # Offsets page_size, 2*page_size, ... strictly below the total. The old
    # ``total // 100`` count requested one page past the end whenever the
    # total was an exact multiple of 100.
    # NOTE(review): assumes the first, size-less request returns the first
    # 100 rows — confirm against the live page.
    for offset in range(page_size, total_names, page_size):
        offset_url = earnings_url + '&offset={}&size=100'.format(offset)
        earnings_dfs.append(scrape_table_helper(_fetch_calendar_div(offset_url)))

    return pd.concat(earnings_dfs, axis = 0).drop_duplicates().reset_index(drop = True)

def ticker_earnings(ticker_check, curr_prices, earnings_df = None):
    """Attach the prices surrounding each earnings date of one ticker.

    Parameters
    ----------
    ticker_check : str
        Symbol whose rows are selected from the earnings table.
    curr_prices : pandas.DataFrame
        Price history with ``formatted_date`` and ``adjclose`` columns.
        ``formatted_date`` is converted to datetime in place, as before.
    earnings_df : pandas.DataFrame, optional
        Earnings table with ``Symbol`` and ``Earnings Date`` columns.
        Defaults to the notebook-level ``all_earnings_df``.

    Returns
    -------
    pandas.DataFrame
        The ticker's earnings rows plus three columns:

        * ``Price_Before`` - ``adjclose`` of the second-latest price row
          dated on or before the earnings date.
        * ``Price_After`` - ``adjclose`` of the second-earliest price row
          dated on or after the earnings date.
        * ``Earnings_Change`` - ``Price_After / Price_Before - 1``.

        A value stays NaN when fewer than two price rows exist on that side
        of the earnings date. This used to raise KeyError, and the caller's
        ``except`` then dropped every earnings row for the ticker.
    """
    if earnings_df is None:
        earnings_df = all_earnings_df

    curr_prices['formatted_date'] = pd.to_datetime(curr_prices['formatted_date'])

    # Sort once up front instead of re-sorting the frame per earnings row.
    ordered_prices = curr_prices.sort_values('formatted_date').reset_index(drop = True)
    price_dates = ordered_prices['formatted_date']
    closes = ordered_prices['adjclose']

    curr_earnings = earnings_df[earnings_df.Symbol == ticker_check].reset_index(drop = True)
    curr_earnings['Price_Before'] = np.nan
    curr_earnings['Price_After'] = np.nan
    curr_earnings['Earnings_Change'] = np.nan

    for idx, earnings_date in curr_earnings['Earnings Date'].items():
        on_or_before = closes[price_dates <= earnings_date]
        on_or_after = closes[price_dates >= earnings_date]

        # Second row counted from the earnings date outwards on each side.
        price_before = on_or_before.iloc[-2] if len(on_or_before) >= 2 else np.nan
        price_after = on_or_after.iloc[1] if len(on_or_after) >= 2 else np.nan

        curr_earnings.loc[idx, 'Price_Before'] = price_before
        curr_earnings.loc[idx, 'Price_After'] = price_after
        # NaN on either side propagates to a NaN change.
        curr_earnings.loc[idx, 'Earnings_Change'] = price_after/price_before - 1

    return curr_earnings

In [2]:
# Scrape window: 1 Jan 2017 up to the moment this cell runs.
start_date = dt.datetime(2017, 1, 1)
end_date = dt.datetime.today()

# One 'YYYY-MM-DD' string per calendar day in the window, keeping Monday to
# Friday only (weekday() 5 and 6 are Saturday and Sunday).
date_list = [
    candidate.strftime('%Y-%m-%d')
    for candidate in (
        start_date + dt.timedelta(days = offset)
        for offset in range((end_date - start_date).days + 1)
    )
    if candidate.weekday() < 5
]

In [3]:
# NOTE(review): machine-specific absolute path. Later cells write their CSVs
# relative to this working directory, so it is kept unchanged here.
os.chdir(r'C:\Users\Fang\Desktop\Python Trading\Trading\Data\Historical Queries\Yahoo Earnings Calendar')

# Retry policy for a single calendar day. The original retried forever inside
# a bare ``except``, which hung on any persistent failure and also swallowed
# KeyboardInterrupt, so the loop could not be interrupted.
MAX_SCRAPE_ATTEMPTS = 10
SCRAPE_RETRY_DELAY = 0.5  # seconds between attempts

if 'hist_earnings.csv' in os.listdir():
    # Cached result of a previous scrape.
    all_earnings_df = pd.read_csv('hist_earnings.csv', parse_dates = ['Earnings Date'],
                                 index_col = 0)
else:
    base_earnings_url = 'https://finance.yahoo.com/calendar/earnings?day='

    all_earnings = []
    failed_dates = []  # days that still raised after MAX_SCRAPE_ATTEMPTS

    for i, date_str in enumerate(date_list):
        curr_url = base_earnings_url + date_str

        for attempt in range(MAX_SCRAPE_ATTEMPTS):
            try:
                curr_df = scrape_table(curr_url)
            except Exception:
                time.sleep(SCRAPE_RETRY_DELAY)
                continue

            # scrape_table returns False when the page reports no numeric
            # result count; such days are skipped without retrying.
            if isinstance(curr_df, pd.DataFrame):
                curr_df['Earnings Date'] = dt.datetime.strptime(date_str, '%Y-%m-%d')
                all_earnings.append(curr_df)
            break
        else:
            # Every attempt raised: record the day and move on.
            failed_dates.append(date_str)

        update_progress(i / len(date_list), date_str)

    update_progress(1, date_str)

    if failed_dates:
        print('Could not scrape {} day(s): {}'.format(len(failed_dates), ', '.join(failed_dates)))

    all_earnings_df = pd.concat(all_earnings, axis = 0)

    all_earnings_df.to_csv('hist_earnings.csv')

In [4]:
# One row per symbol with its earliest earnings date, oldest first.
batch_names = (
    all_earnings_df[['Symbol','Earnings Date']]
    .groupby('Symbol')
    .min()
    .sort_values('Earnings Date')
    .reset_index()
)

# Positional row numbers of batch_names (name kept, including its spelling,
# in case other cells refer to it).
batch_partions = range(len(batch_names))

# Number of tickers requested per price query.
batch_n = 50

# Consecutive slices of at most batch_n rows. Stepping the start position
# avoids the empty trailing batch that ``len // batch_n + 1`` produced
# whenever the row count was an exact multiple of batch_n.
batch_partitions = [
    batch_names.iloc[batch_start:batch_start + batch_n, :]
    for batch_start in range(0, len(batch_names), batch_n)
]

In [13]:
# Ticker lists whose bulk price query raised.
failed_batches = []

# Individual tickers whose price frame or earnings join raised. These were
# silently dropped before.
failed_tickers = []

# ticker -> DataFrame built from that ticker's 'prices' records.
price_dict = {}

# NOTE: despite the name this is a list of per-ticker earnings DataFrames.
earnings_dict = []

n_batches = len(batch_partitions)

# Batches are walked in reverse order of batch_partitions.
for i, batch in enumerate(batch_partitions[::-1]):
    start_time = time.time()

    tick_batch = batch.Symbol.tolist()

    # An empty batch has nothing to query; it used to end up in failed_batches.
    if tick_batch:
        # Start three days before the earliest earnings date in the batch.
        hist_start_date = batch['Earnings Date'].min() - dt.timedelta(days = 3)

        try:
            yahoo_financials = YahooFinancials(tick_batch)
            historical_stock_prices = yahoo_financials.get_historical_price_data(hist_start_date.strftime('%Y-%m-%d'),
                                                                                 dt.datetime.today().strftime('%Y-%m-%d'),
                                                                                 'daily')
        except Exception:
            # ``except Exception`` (not a bare except) lets KeyboardInterrupt
            # stop the loop.
            failed_batches.append(tick_batch)
            historical_stock_prices = {}

        for ticker in historical_stock_prices.keys():
            try:
                price_dict[ticker] = pd.DataFrame(historical_stock_prices[ticker]['prices'])

                curr_prices = price_dict[ticker]

                earnings_dict.append(ticker_earnings(ticker, curr_prices))
            except Exception:
                failed_tickers.append(ticker)

    end_time = time.time()
    # (i + 1) so the bar reaches 100% after the last batch, and the update
    # also runs for batches whose query failed. The status text is the batch
    # duration in minutes.
    update_progress((i + 1) / n_batches, str(round((end_time - start_time)/60, 2)))

Progress: [#########################################################################################-] 99.1% Status: 1.368561085065206


In [14]:
# Stack the per-ticker frames collected by the price loop row-wise (concat's
# default axis) and write them to the working directory.
earnings_hist = pd.concat(earnings_dict)
earnings_hist.to_csv('earnings_results.csv')

In [16]:
# Number of ticker batches whose bulk price query raised in the download loop.
print(len(failed_batches))

0


In [5]:
# Client for three sample tickers.
# NOTE(review): this rebinds the `yahoo_financials` name that the batch
# download loop also assigns.
yahoo_financials = YahooFinancials(['TSLA','NVDA','AMD'])

In [7]:
# Fetch earnings data through the client; the result is indexed by ticker
# symbol in the following cell.
earnings_data_json = yahoo_financials.get_stock_earnings_data()

In [10]:
# Inspect the 'earningsData' entry for TSLA (rendered as the cell output).
earnings_data_json['TSLA']['earningsData']

{'quarterly': [{'date': '4Q2018', 'actual': 1.93, 'estimate': 2.2},
  {'date': '1Q2019', 'actual': -2.9, 'estimate': -0.69},
  {'date': '2Q2019', 'actual': -1.12, 'estimate': -0.36},
  {'date': '3Q2019', 'actual': 1.86, 'estimate': -0.42}],
 'currentQuarterEstimate': 1.29,
 'currentQuarterEstimateDate': '4Q',
 'currentQuarterEstimateYear': 2019,
 'earningsDate': [{'raw': 1580169600, 'fmt': '2020-01-28'},
  {'raw': 1580688000, 'fmt': '2020-02-03'}]}

In [None]:
# Scratch export of the frames in `earnings_dict` under a second file name.
# NOTE(review): this cell has no execution count, so it was not run in the
# saved session; the same concat is already written to earnings_results.csv.
pd.concat(earnings_dict, axis = 0).to_csv('test_earnings.csv')

In [21]:
ticker = 'NVDA'

# quoteSummary modules requested in one call, joined with a URL-encoded
# comma (%2C).
modules = '%2C'.join(['assetProfile','incomeStatementHistory', 'balanceSheetHistoryQuarterly',
                              'balanceSheetHistory','cashflowStatementHistory', 'cashflowStatementHistoryQuarterly',
                              'defaultKeyStatistics','financialData','incomeStatementHistoryQuarterly',
                              'calendarEvents','secFilings', 'recommendationTrend', 'institutionOwnership',
                              'fundOwnership', 'majorDirectHolders', 'majorHoldersBreakdown',
                              'insiderTransactions', 'insiderHolders', 'netSharePurchaseActivity',
                              'earnings', 'earningsHistory', 'earningsTrend', 'industryTrend', 'indexTrend',
                              'sectorTrend'])
full_info_url = 'https://query1.finance.yahoo.com/v10/finance/quoteSummary/{0}?modules={1}'.format(ticker, modules)

# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status reports an HTTP error here instead of letting it surface
# as a JSON or KeyError further down.
full_info_response = requests.get(full_info_url, timeout = 30)
full_info_response.raise_for_status()

full_info_json = json.loads(full_info_response.text)
profile_json = full_info_json['quoteSummary']['result'][0]['assetProfile']



In [24]:
# NOTE(review): only the last expression of a cell is rendered, so the
# 'industry' lookup is evaluated and discarded; the recorded output is the
# 'sector' value.
profile_json['industry']
profile_json['sector']

'Technology'