In [1]:
import datetime as dt
import getpass
import io
import os
import re
import subprocess
import sys
import time
import urllib
from operator import itemgetter

import numpy as np
import pandas as pd
import pip
import requests
from bs4 import BeautifulSoup as bs
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from sqlalchemy import create_engine
    
# Import selenium's webdriver, installing selenium on the fly if it is missing.
# NOTE(review): the `selenium.webdriver...` imports earlier in this cell run
# before this fallback, so a missing selenium package fails there first.
try:
    from selenium import webdriver
except ImportError:
    # `pip.main` is no longer part of pip's public API (removed in pip 10).
    # Running pip as a module of the current interpreter installs into the
    # same environment this kernel is using.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'selenium'])
    from selenium import webdriver

from IPython.display import clear_output

# Show floats in DataFrame output with thousands separators and no decimals.
pd.options.display.float_format = '{:,.0f}'.format

# Progress bar function unrelated to web scraping
def update_progress(progress, desc_string):
    """Redraw a one-line text progress bar in the cell output.

    Parameters
    ----------
    progress : int or float
        Completed fraction. Values below 0 are shown as 0, values of 1 or
        more are shown as 1, and anything that is neither an int nor a float
        is treated as 0.
    desc_string : object
        Text rendered after the "Last Post:" label.
    """
    bar_length = 60

    # Normalise the input to a float fraction clamped to [0, 1].
    fraction = float(progress) if isinstance(progress, (int, float)) else 0.0
    fraction = min(max(fraction, 0.0), 1.0)

    filled = int(round(bar_length * fraction))
    bar = "#" * filled + "-" * (bar_length - filled)

    # wait=True defers clearing until the new line is printed, avoiding flicker.
    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}% Last Post: {2}".format(bar, fraction * 100, desc_string))

# Local data locations used throughout the notebook.
#   db_dir       - working directory for the SQLite database files
#   chrome_dir   - folder that holds chromedriver.exe
#   universe_dir - folder scanned for downloaded constituent .txt files
db_dir = r'C:\Users\Fang\Desktop\Python Trading\Trading\Data\DBs'
chrome_dir = r'C:\Users\Fang\Desktop\Python Trading\Trading\Data\Stock Universe'
universe_dir = r'C:\Users\Fang\Desktop\Python Trading\Trading\Data\Stock Universe'

# Chrome starts maximized and saves downloads into universe_dir, the same
# folder update_index_constituents() later reads the .txt files from.
prefs = {"download.default_directory" : universe_dir}

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_experimental_option("prefs", prefs)

## Getting Universe of Tickers

In [108]:
# Function for logging into EODDATA
def eod_index_login(eod_index_user, eod_index_pwd, browser):
    """Log the given Selenium browser session in to eoddata.com.

    The home page is loaded, refreshed and its cookies cleared, then the
    login form is filled in and submitted. If that attempt raises, the page
    is reset and the login is attempted once more; an error on the second
    attempt propagates to the caller.

    Side effect: the process working directory is changed to `chrome_dir`.

    Parameters
    ----------
    eod_index_user : str
        Value typed into the login form's email/user field.
    eod_index_pwd : str
        Value typed into the login form's password field.
    browser : selenium WebDriver
        Browser session to drive.
    """
    os.chdir(chrome_dir)

    index_data_base_url = 'http://eoddata.com/'

    def open_clean_home_page():
        # Same sequence for the first attempt and for the retry.
        browser.get(index_data_base_url)
        browser.refresh()
        browser.delete_all_cookies()

    def overwrite_field(field, value):
        # Ctrl+A selects whatever is already in the field so typing replaces it.
        field.click()
        field.send_keys(Keys.CONTROL, 'a')
        field.send_keys(value)

    def login():
        email_input = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//input[@name="ctl00$cph1$lg1$txtEmail"]')))
        pwd_input = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//input[@name="ctl00$cph1$lg1$txtPassword"]')))
        submit_login = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//input[@value="Login"]')))

        overwrite_field(email_input, eod_index_user)
        overwrite_field(pwd_input, eod_index_pwd)

        submit_login.click()

    open_clean_home_page()
    try:
        login()
    except Exception:
        # Retry once from a freshly reset page. `Exception` (not a bare
        # except) so KeyboardInterrupt / SystemExit still stop the notebook.
        open_clean_home_page()
        login()

# Function for getting latest EODDATA constituents for AMEX, NYSE, and NASDAQ
# and writing them to the `tickers` table of the SQLite file <index_db_name>.db
def update_index_constituents(browser, index_db_name):
    """Refresh the `tickers` table from the EODData stock-list pages.

    For each exchange page the element matched by `//div[@class="hlink"]` is
    clicked. Every `.txt` file then present in `universe_dir` is read with
    `pd.read_table`, tagged with an `Index` column holding the file name
    without its extension, merged with the rows already in `tickers`,
    de-duplicated, and written back (replacing the table). The ingested text
    files are deleted afterwards.

    Side effect: the working directory is changed to `db_dir` while the
    database is used and is left at `universe_dir` on return.

    Parameters
    ----------
    browser : selenium WebDriver
        Browser session used to open the stock-list pages.
    index_db_name : str
        Base name (without `.db`) of the SQLite file inside `db_dir`.
    """
    sites = {'AMEX':'http://eoddata.com/stocklist/AMEX.htm',
             'NYSE':'http://eoddata.com/stocklist/NYSE.htm',
             'NASDAQ':'http://eoddata.com/stocklist/NASDAQ.htm'}

    for site in sites.values():
        browser.get(site)
        constituents = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//div[@class="hlink"]')))
        constituents.click()

    # NOTE(review): presumably each click starts a file download into
    # universe_dir; nothing here waits for it to finish - confirm whether a
    # wait is needed before the directory is listed.
    txt_paths = [os.path.join(universe_dir, file_name)
                 for file_name in os.listdir(universe_dir)
                 if file_name.split('.')[-1] == 'txt']

    if txt_paths:
        # The sqlite URL is relative, so it resolves against db_dir.
        os.chdir(db_dir)
        index_engine = create_engine('sqlite:///{}.db'.format(index_db_name), echo=False)

        # Ask SQLite whether the table exists instead of swallowing every
        # read error: a failed read followed by if_exists='replace' would
        # otherwise drop the previously stored rows.
        table_lookup = pd.read_sql("SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'tickers'",
                                   con = index_engine)
        frames = []
        if not table_lookup.empty:
            prior_index_cons = pd.read_sql('SELECT * FROM tickers',
                                           con = index_engine)
            if not prior_index_cons.empty:
                frames.append(prior_index_cons)

        for txt_path in txt_paths:
            curr_index_cons = pd.read_table(txt_path)
            # splitext removes only the extension; str.strip('.txt') would
            # also eat leading/trailing '.', 't' and 'x' characters of the name.
            curr_index_cons['Index'] = os.path.splitext(os.path.basename(txt_path))[0]
            frames.append(curr_index_cons)

        # One concat and one table rewrite instead of one per file.
        pd.concat(frames, ignore_index = True).drop_duplicates().to_sql('tickers',
                                                                        con = index_engine,
                                                                        index = False,
                                                                        if_exists = 'replace')

        # Remove only the files that were actually ingested above.
        for txt_path in txt_paths:
            os.remove(txt_path)

    os.chdir(universe_dir)
    return

# YCharts data pull for a single ticker and a single statistic
def get_ychart_data(browser, ticker, info_field):
    """Scrape one statistic's history tables for `ticker` from ycharts.com.

    Parameters
    ----------
    browser : selenium WebDriver
        Browser session used to load the page.
    ticker : str
        Symbol inserted into the YCharts company URL.
    info_field : str
        One of 'BuyBacks', 'GrossMargin', 'NetMargin', 'SharesOutstanding',
        'DebtToEquity', 'CashOnHand', 'MktCap', 'EnterpriseVal', 'FCF'.

    Returns
    -------
    pandas.DataFrame
        Columns `Qtr` (parsed with `pd.to_datetime`), `<info_field>` and
        `Ticker`, with the rows of every table found inside the page's
        `dataTableBox` element. Empty, with the same three columns, when the
        box contains no table.

    Raises
    ------
    KeyError
        If `info_field` is not one of the names listed above.
    Exception
        Whatever `WebDriverWait.until` raises when `dataTableBox` does not
        appear within 3 seconds.
    """
    url_slugs = {'BuyBacks':'stock_buyback',
                 'GrossMargin':'gross_profit_margin',
                 'NetMargin':'profit_margin',
                 'SharesOutstanding':'shares_outstanding',
                 'DebtToEquity':'debt_equity_ratio',
                 'CashOnHand':'cash_on_hand',
                 'MktCap':'market_cap',
                 'EnterpriseVal':'enterprise_value',
                 'FCF':'free_cash_flow'}

    ychart_url = 'https://ycharts.com/companies/{ticker}/{slug}'.format(ticker = ticker,
                                                                        slug = url_slugs[info_field])

    browser.get(ychart_url)
    browser.delete_all_cookies()

    table_box = WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.XPATH, '//div[@id="dataTableBox"]')))
    # find_elements(By.TAG_NAME, ...) works on Selenium 3 and 4; the
    # find_elements_by_* helpers were removed in Selenium 4.3.
    tables = table_box.find_elements(By.TAG_NAME, 'table')

    frames = []
    for table in tables:
        table_html = table.get_attribute('outerHTML')
        # StringIO: passing literal HTML to read_html is deprecated in pandas.
        # The first parsed row is dropped - presumably a header row; confirm
        # against the page layout. copy() makes the slice safe to modify.
        curr_df = pd.read_html(io.StringIO(table_html))[0].iloc[1:,:].copy()
        curr_df.columns = ['Qtr',info_field]
        curr_df['Qtr'] = pd.to_datetime(curr_df['Qtr'])
        frames.append(curr_df)

    if frames:
        ychart_df = pd.concat(frames, ignore_index = True)
    else:
        # Keep the full column set so an empty result still matches the
        # schema of the non-empty frames callers append to the same table.
        ychart_df = pd.DataFrame(columns = ['Qtr', info_field])

    ychart_df['Ticker'] = ticker

    return ychart_df

In [113]:
# Start the Chrome session shared by the cells below; chromedriver.exe is
# expected inside chrome_dir and the notebook-wide `options` are applied.
chromedriver_path = chrome_dir + "\\chromedriver.exe"
browser = webdriver.Chrome(executable_path = chromedriver_path,
                           options = options)

In [25]:
# EODData credentials are taken from the environment, falling back to an
# interactive prompt, so that secrets are never stored in the notebook.
# NOTE(review): credentials that were previously committed in this cell
# should be treated as exposed and rotated.
eod_index_email = os.environ.get('EOD_INDEX_EMAIL', '')
eod_index_user = os.environ.get('EOD_INDEX_USER') or input('EODData username: ')
eod_index_pwd = os.environ.get('EOD_INDEX_PWD') or getpass.getpass('EODData password: ')

# Base names (without ".db") of the SQLite files used by the notebook.
index_db_name = 'index'
buyback_db_name = 'buybacks'

eod_index_login(eod_index_user, eod_index_pwd, browser)
update_index_constituents(browser, index_db_name)

## Getting Share Buyback Data From YCharts

In [105]:
# The sqlite URL below is relative, so work from the database directory.
os.chdir(db_dir)

# Distinct symbols previously stored in the `tickers` table.
index_db_url = 'sqlite:///{}.db'.format(index_db_name)
index_engine = create_engine(index_db_url, echo=False)
tickers = pd.read_sql('SELECT DISTINCT Symbol FROM tickers', con = index_engine)

# Statistics pulled per ticker; each name is a key accepted by
# get_ychart_data() and is also used as the destination table name.
key_stats = [
    'BuyBacks',
    'GrossMargin',
    'NetMargin',
    'SharesOutstanding',
    'DebtToEquity',
    'CashOnHand',
    'MktCap',
    'EnterpriseVal',
    'FCF',
]

In [None]:
# Scrape every statistic in `key_stats` for each ticker and append the rows
# to one table per statistic in the buyback database. Tickers whose YCharts
# page shows "Page Not Found" are recorded in the `nodata` table instead.

# Positional offset into `tickers` at which the loop starts; earlier rows are
# skipped (presumably already processed in a previous run - adjust to resume).
START_ROW = 55
CHROMEDRIVER_PATH = chrome_dir + "\\chromedriver.exe"
NOT_FOUND_XPATH = '//h1[text()="Page Not Found"]'


def _restart_browser():
    """Quit the current `browser` session if possible, then start a new one."""
    global browser
    try:
        browser.quit()
    except Exception:
        # Best effort: the old session may already be dead, or `browser` may
        # never have been created. Quitting avoids leaking Chrome processes.
        pass
    browser = webdriver.Chrome(executable_path = CHROMEDRIVER_PATH, options = options)


buyback_engine = create_engine('sqlite:///{}.db'.format(buyback_db_name), echo=False)

ticker_count = len(tickers.Symbol)
start_time = time.time()

for idx, row in tickers.iloc[START_ROW:,:].iterrows():
    ticker = row.Symbol

    ychart_url = 'https://ycharts.com/companies/{ticker}/stock_buyback'.format(ticker = ticker)

    # A failing refresh is treated as a dead session and replaced.
    # `Exception` rather than a bare except so Ctrl-C still stops the loop.
    try:
        browser.refresh()
    except Exception:
        _restart_browser()

    browser.get(ychart_url)
    browser.delete_all_cookies()

    # Only the page probe sits inside the try block, so a failure while
    # recording "no data" can no longer fall through into the scraping branch.
    try:
        WebDriverWait(browser, 1).until(EC.presence_of_element_located((By.XPATH, NOT_FOUND_XPATH)))
        page_not_found = True
    except Exception:
        # Normally the wait timing out, i.e. the page exists. Other browser
        # errors also land here and are dealt with by the retry below.
        page_not_found = False

    if page_not_found:
        run_time = round(time.time() - start_time, 2)
        update_progress(idx/ticker_count, '{0} Seconds {1} '.format(run_time, ticker + ' No Data'))

        pd.DataFrame([ticker], columns = ['NoData']).to_sql('nodata',
                                                            con = buyback_engine,
                                                            index = False,
                                                            if_exists = 'append')
        continue

    for stat in key_stats:
        try:
            curr_df = get_ychart_data(browser, ticker, stat)
        except Exception:
            # Retry once in a fresh session; a second failure propagates.
            _restart_browser()
            time.sleep(2)
            curr_df = get_ychart_data(browser, ticker, stat)
        curr_df.to_sql(stat,
                       con = buyback_engine,
                       index = False,
                       if_exists = 'append')

    run_time = round(time.time() - start_time, 2)
    update_progress(idx/ticker_count, '{0} Seconds {1} '.format(run_time, ticker))
    

Progress: [#-----------------------------------------------------------] 2.0% Last Post: 619.08 Seconds CCOR No Data 
