# Dependencies

In [37]:
# import libraries
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from bs4 import BeautifulSoup
import requests
import time

import pandas as pd 
import numpy as np

# Extract

## Scrape IPOs

In [22]:
# IPO Scoop Upcoming IPOs: read every HTML table on the IPO calendar page
url = 'https://www.iposcoop.com/ipo-calendar/'
data = pd.read_html(url)

# rename "Symbol proposed" to "Symbol"; rename() returns a new frame, so the raw
# table in data[0] is left untouched and this cell gives the same result on re-run
ipo_scoop_upcoming_df = data[0].rename(columns={'Symbol proposed': 'Symbol'})

# split expected trade date into date and day of week
# n=1 splits on the first space only, so the result never has more than two
# columns even if the text after the date contains additional spaces
ipo_scoop_upcoming_df[['Offer Date', 'Expected Trade Weekday']] = (
    ipo_scoop_upcoming_df['Expected to Trade'].str.split(' ', n=1, expand=True)
)

# add date type column to differentiate confirmed vs expected
ipo_scoop_upcoming_df['Date Type'] = "Expected"

ipo_scoop_upcoming_df.head(2)

Unnamed: 0,Company,Symbol,Lead Managers,Shares (Millions),Price Low,Price High,Est. $ Volume,Expected to Trade,SCOOP Rating,Rating Change,Offer Date,Expected Trade Weekday,Date Type
0,Vertex,VERX,Goldman Sachs/ Morgan Stanley,21.2,14.0,16.0,$ 317.3 mil,7/29/2020 Wednesday,S/O,S/O,7/29/2020,Wednesday,Expected
1,ACE Convergence Acquisition,ACEV.U,Cantor,20.0,10.0,10.0,$ 200.0 mil,7/30/2020 Thursday,S/O,S/O,7/30/2020,Thursday,Expected


In [23]:
# Upcoming IPOs: keep only the core identifying columns, in this order
primary_cols = ["Symbol", "Company", "Offer Date", "Date Type"]
ipo_scoop_upcoming_df = ipo_scoop_upcoming_df.loc[:, primary_cols]
ipo_scoop_upcoming_df

Unnamed: 0,Symbol,Company,Offer Date,Date Type
0,VERX,Vertex,7/29/2020,Expected
1,ACEV.U,ACE Convergence Acquisition,7/30/2020,Expected
2,ALVR,AlloVir,7/30/2020,Expected
3,GOED,1847 Goedeker,7/31/2020,Expected
4,LI,Li Auto,7/31/2020,Expected
5,VSTA,Vasta Platform Limited,7/31/2020,Expected
6,VITL,"Vital Farms, Inc.",7/31/2020,Expected


In [24]:
# IPO Scoop Recent IPOs
# Reads every HTML table found on the "last 100 IPOs" page into a list of dataframes.
url = 'https://www.iposcoop.com/last-100-ipos'
data = pd.read_html(url)

# The first table on the page is used as the recent-IPO listing.
# Note: this is the same object as data[0] (no copy), so the column added below
# also appears on data[0].
ipo_scoop_recent_df = data[0]

# add date type column to differentiate confirmed vs expected
ipo_scoop_recent_df['Date Type'] = "Confirmed"

# Display the frame as the cell output.
ipo_scoop_recent_df

Unnamed: 0,Company,Symbol,Industry,Offer Date,Shares (millions),Offer Price,1st Day Close,Current Price,Return,SCOOP Rating,Date Type
0,Annexon,ANNX,Health Care,7/24/2020,14.8,$17.00,$17.76,$17.76,4.47%,S/O,Confirmed
1,Inozyme Pharma,INZY,Health Care,7/24/2020,7.0,$16.00,$17.54,$17.54,9.63%,S/O,Confirmed
2,iTeos Therapeutics,ITOS,Health Care,7/24/2020,10.6,$19.00,$19.05,$19.05,0.26%,S/O,Confirmed
3,Nurix Therapeutics,NRIX,Health Care,7/24/2020,11.0,$19.00,$19.01,$19.01,0.05%,S/O,Confirmed
4,Montrose Environmental Group,MEG,Consumer Services,7/23/2020,10.0,$15.00,$22.00,$22.39,49.27%,S/O,Confirmed
...,...,...,...,...,...,...,...,...,...,...,...
95,YayYo,YAYO,Technology,11/13/2019,2.6,$4.00,$3.55,$0.30,-92.50%,S/O,Confirmed
96,89bio,ETNB,Health Care,11/11/2019,5.3,$16.00,$20.80,$31.67,97.94%,S/O,Confirmed
97,36Kr Holdings,KRKR,Consumer Goods,11/8/2019,1.4,$14.50,$13.06,$3.47,-76.07%,S/O,Confirmed
98,CNS Pharmaceuticals,CNSP,Health Care,11/8/2019,2.1,$4.00,$4.57,$1.96,-51.00%,S/O,Confirmed


In [38]:
# Recent IPOs: keep only the core identifying columns, in this order
primary_cols = ["Symbol", "Company", "Offer Date", "Date Type"]
ipo_scoop_recent_df = ipo_scoop_recent_df.loc[:, primary_cols]
ipo_scoop_recent_df.head(2)

Unnamed: 0,Symbol,Company,Offer Date,Date Type
0,ANNX,Annexon,7/24/2020,Confirmed
1,INZY,Inozyme Pharma,7/24/2020,Confirmed


In [32]:
# Stack the recent (confirmed) and upcoming (expected) listings into one frame,
# renumbering the rows from 0 so the two source indexes do not collide
ipo_frames = [ipo_scoop_recent_df, ipo_scoop_upcoming_df]
ipo_df = pd.concat(ipo_frames, ignore_index=True)
ipo_df.dtypes

Symbol        object
Company       object
Offer Date    object
Date Type     object
dtype: object

In [36]:
# Parse the month/day/year offer-date strings into datetime values
offer_dates = pd.to_datetime(ipo_df['Offer Date'], format="%m/%d/%Y")
ipo_df['Offer Date'] = offer_dates

# Show newest offers first (display only; the sorted frame is not stored)
ipo_df.sort_values('Offer Date', ascending=False)

Unnamed: 0,Symbol,Company,Offer Date,Date Type
106,VITL,"Vital Farms, Inc.",2020-07-31,Expected
105,VSTA,Vasta Platform Limited,2020-07-31,Expected
104,LI,Li Auto,2020-07-31,Expected
103,GOED,1847 Goedeker,2020-07-31,Expected
102,ALVR,AlloVir,2020-07-30,Expected
...,...,...,...,...
95,YAYO,YayYo,2019-11-13,Confirmed
96,ETNB,89bio,2019-11-11,Confirmed
97,KRKR,36Kr Holdings,2019-11-08,Confirmed
98,CNSP,CNS Pharmaceuticals,2019-11-08,Confirmed


__Additional notes__  
Ideally would have more sources for IPOs than just IPO Scoop.

Need to investigate how to find IPO information and how to pull it.  Initially had challenges using pandas to pull tables directly from Nasdaq and NYSE--can investigate other methods for scraping.  Here's some commentary on how to find it:  https://www.investopedia.com/articles/investing/050115/how-track-upcoming-ipos.asp

May also be able to get from yahoo or google finance APIs.

In [None]:
# testing additional pulls that didn't work 
#url = 'https://www.marketwatch.com/tools/ipo-calendar'
#ipo_boutique_url = 'https://www.ipoboutique.com/ipo-calendar.html'
#ipo_nasdaq_url = "https://www.nasdaq.com/market-activity/ipos?tab=upcoming"
#data = pd.read_html(url)
#data[0]

## Stock Detail & Performance
May include things like:
1. Stock price
2. Market cap (may choose to exclude smaller cap new stocks to limit to biggest and more interesting IPOs)
3. Launch date open and close price
4. Stock attribute information (tech vs consumer goods vs any summary statement available?)

In [47]:
# Sample only for now: pick the symbols at two fixed row positions of ipo_df.
# Still to decide when to scrape stock info and which fields to keep.
sample_rows = [20, 25]
sample_symbols = ipo_df['Symbol'].iloc[sample_rows].tolist()
sample_symbols

['LMND', 'EBON']

In [48]:
# sample loop through symbols: fetch the Yahoo Finance quoteSummary JSON for each one

# Query-string parameters shared by every request. requests URL-encodes these,
# so the comma-separated module list is sent as %2C-separated, as before.
quote_params = {
    'formatted': 'true',
    'crumb': '8ldhetOu7RJ',
    'lang': 'en-US',
    'region': 'US',
    'modules': 'defaultKeyStatistics,financialData,calendarEvents',
    'corsDomain': 'finance.yahoo.com',
}

# empty list to hold data
stock_data = []

# loop through symbols and get data for each
for symbol in sample_symbols:
    r = requests.get(
        f'https://query2.finance.yahoo.com/v10/finance/quoteSummary/{symbol}',
        params=quote_params,
        # without a timeout requests waits indefinitely on an unresponsive server
        timeout=10,
    )
    data = r.json()
    stock_data.append(data)

print(stock_data)

[{'quoteSummary': {'result': [{'defaultKeyStatistics': {'maxAge': 1, 'priceHint': {'raw': 2, 'fmt': '2', 'longFmt': '2'}, 'enterpriseValue': {'raw': 1098542208, 'fmt': '1.1B', 'longFmt': '1,098,542,208'}, 'forwardPE': {}, 'profitMargins': {'raw': -1.4957601, 'fmt': '-149.58%'}, 'floatShares': {'raw': 4872266, 'fmt': '4.87M', 'longFmt': '4,872,266'}, 'sharesOutstanding': {'raw': 54896200, 'fmt': '54.9M', 'longFmt': '54,896,200'}, 'sharesShort': {}, 'sharesShortPriorMonth': {}, 'sharesShortPreviousMonthDate': {}, 'dateShortInterest': {}, 'sharesPercentSharesOut': {}, 'heldPercentInsiders': {}, 'heldPercentInstitutions': {}, 'shortRatio': {}, 'shortPercentOfFloat': {}, 'beta': {}, 'morningStarOverallRating': {}, 'morningStarRiskRating': {}, 'category': None, 'bookValue': {}, 'priceToBook': {}, 'annualReportExpenseRatio': {}, 'ytdReturn': {}, 'beta3Year': {}, 'totalAssets': {}, 'yield': {}, 'fundFamily': None, 'fundInceptionDate': {}, 'legalType': None, 'threeYearAverageReturn': {}, 'fiveY

__Additional Notes__  
Using APIs instead of scraping is another option for getting stock info on IPOs.  

Two potential APIs to try:

__Google__  
https://pypi.org/project/googlefinance/
 
__Yahoo__  
https://github.com/ranaroussi/yfinance  
https://towardsdatascience.com/free-stock-data-for-python-using-yahoo-finance-api-9dafd96cad2e  
https://rapidapi.com/apidojo/api/yahoo-finance1  

__Other__  
https://medium.com/@patrick.collins_58673/stock-api-landscape-5c6e054ee631  

__Quandl__ (what was used in school)  
https://blog.quandl.com/api-for-stock-data

In [None]:
# # yahoo finance example

# import yfinance as yf

# stock = yf.Ticker("GOOG")

# # get stock info
# print(stock.info)
# print("------------------------")

# # get financials (returns empty dataframe, need to troubleshoot) -- also didn't have recent stock like Lemonade (LMND) 
# print(stock.financials)

In [None]:
# # Yahoo test through Rapid API. Note limits is 500 per month and 10 requests per minute, 
# # maybe not best route as pro is $10 per month

# import requests

# url = "https://yahoo-finance15.p.rapidapi.com/api/yahoo/qu/quote/LMND"
# x-rapid-key = "XXXXXXXXXXXXXXXXXXXXXXX"

# headers = {
#     'x-rapidapi-host': "yahoo-finance15.p.rapidapi.com",
#     'x-rapidapi-key': x-rapid-key
#     }

# #### un-comment out below to run (commented so don't get too many API calls) ####
# #response = requests.request("GET", url, headers=headers)

# print(response.text)

In [None]:
# Google test, gets "forbidden URL error message"
#from googlefinance import getQuotes
#import json
#json.dumps(getQuotes('AAPL'), indent=2)

# Load

Get data transformed into a format we'd want to load to a database (noSQL or SQL).  

In [None]:
# Sample dataframe with fake numbers; more columns may be added later, but
# these are the primary ones needed
stock_data = dict(
    Ticker=["AAPL", "LMND"],
    Date=["07012020", "07012020"],
    Open_Price=[88, 38],
    Close_Price=[89, 55],
    Market_Cap=[145000, 30000],
)
sample_df = pd.DataFrame(data=stock_data)
sample_df

In [None]:
# Robin: here are values from IPO Scoop

# `ipo_scoop_ipo` is not assigned anywhere in the visible notebook, so this cell
# raised NameError on a fresh kernel. Read the tables here so the cell stands alone.
# NOTE(review): the calendar URL is inferred from the ten column names below, which
# line up with the upcoming-IPO calendar table -- confirm this is the intended source.
ipo_scoop_ipo = pd.read_html('https://www.iposcoop.com/ipo-calendar/')

# First table on the page, with its columns renamed to snake_case field names.
# Assigning .columns raises if the table does not have exactly ten columns.
ipo_df = ipo_scoop_ipo[0]
ipo_df.columns = ["company", "symbol_proposed","lead_managers","shares_mil","price_low","price_high","est_volume","expected_to_trade","scoop_rating","rating_change"]
ipo_df

In [None]:
import os
from urllib.parse import quote_plus

import pymongo

# Mongo DB configuration
# Credentials are taken from the MONGO_USER / MONGO_PWD environment variables when
# set, so real values never need to be typed into the notebook; the original
# placeholder strings remain as fallbacks.
mg_usr = os.environ.get('MONGO_USER', 'username')
mg_pwd = os.environ.get('MONGO_PWD', 'password')

# Percent-escape the credentials: reserved characters such as '@', ':' or '/'
# in a username or password would otherwise corrupt the connection URI.
mongo_uri = (
    f"mongodb+srv://{quote_plus(mg_usr)}:{quote_plus(mg_pwd)}"
    "@cluster0-xcn4s.mongodb.net/test?retryWrites=true&w=majority"
)

client = pymongo.MongoClient(mongo_uri)
db = client['upcoming_ipos']
collection = db['ipos']

In [None]:
# Turn the dataframe into a list with one dict per row (column name -> value)
data_dict = ipo_df.to_dict(orient="records")

In [None]:
# Add records
# insert_many raises when handed an empty list, so skip the call when there are
# no records to insert; the result (or None) is shown as the cell output.
insert_result = collection.insert_many(data_dict) if data_dict else None
insert_result

# Analysis

Information that may be interesting to share.  Examples include:
1. Timing of when it launches, how long it's been, etc.
2. Price performance
    - Launch date open and close price (how they did on first day)
    - How did it do when it hit the 1 month, 3 month, 6 month, 1 year milestones
3. Industry performance
    - Did it outperform the S&P 
    - Did it outperform its sector (Ex: tech, consumer goods)
4. Top performers
    - Which IPOs did best in last 1 month, 3 month, 6 month, 1 year milestone

In [None]:
# Sample analysis: open-to-close change for a single ticker on a single day
is_aapl = sample_df["Ticker"] == "AAPL"
is_target_day = sample_df["Date"] == "07012020"
appl = sample_df.loc[is_aapl & is_target_day]

# fractional change over the day: close / open - 1
appl_day_change = appl["Close_Price"].div(appl["Open_Price"]).sub(1)
print(appl_day_change)

In [None]:
# Calculate based on MongoDB record
# Fetch every document in the collection and replace each `_id` with its string form.
# The earlier bare `except:` is removed: its handler called an undefined `log`
# (raising NameError) and appended None placeholders that would break the
# dictionary lookups performed on `response` entries.
documents = collection.find({})
response = []
for document in documents:
    document['_id'] = str(document['_id'])
    response.append(document)

# Example field reference for first record (None when the collection is empty)
price_high = response[0]["price_high"] if response else None
price_high

In [None]:
# Print one summary line per record: trade date, company, symbol, price range, hashtag

for record in response:
    print(f"{record['expected_to_trade']}: {record['company']} [{record['symbol_proposed']}]. Price (Low-High): ${record['price_low']}-{record['price_high']}. #new_ipo_{record['symbol_proposed']}")

In [None]:
# Establish Twitter connection

import os

import tweepy

# Credentials are read from environment variables when set, so real keys never
# need to be saved in the notebook; the original placeholder strings remain as
# fallbacks.
CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY", "consumer_key")
CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET", "consumer_secret")
ACCESS_KEY = os.environ.get("TWITTER_ACCESS_KEY", "access_key")
ACCESS_SECRET = os.environ.get("TWITTER_ACCESS_SECRET", "access_secret")

# Build the OAuth handler from the consumer pair, then attach the access token pair
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)

# API client used for posting
api = tweepy.API(auth)

In [None]:
# Post one status update per record, using the same summary text format
for record in response:
    new_tweet = f"{record['expected_to_trade']}: {record['company']} [{record['symbol_proposed']}]. Price (Low-High): ${record['price_low']}-{record['price_high']}. #new_ipo_{record['symbol_proposed']}"
    api.update_status(new_tweet)