# IPO Investment via Text Mining
- Convert IPO S-1 filings into buy/sell signal with hold period [D1, W1, M1]

In [158]:
#core
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import glob
import nasdaq
from bs4 import BeautifulSoup
from pathlib import Path

#NLP
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.corpus import reuters
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

from gensim.summarization import summarize
from gensim.summarization import keywords

#ML
from sklearn import preprocessing
from sklearn import model_selection

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

In [159]:
# params
ipo_performance_csv = 'IPO Performance.csv'

# Load NASDAQ IPO Performace

In [160]:
#load and sort by date priced
df = pd.read_csv(ipo_performance_csv, index_col='Symbol')
df['Date Priced'] = pd.to_datetime(df['Date Priced'], format='%Y-%m-%d')
df.sort_values(by='Date Priced', inplace=True)

### Preprocessing

In [161]:
#transform company name 
df.insert(0, 'Name Length', df['Company Name'].str.len())
df.insert(0, 'Name Words', df['Company Name'].str.split(' ').map(lambda x: len(x)))
df.drop(['Company Name'], axis=1, inplace=True)

#add quarter
df.insert(0, 'Q', df['Date Priced'].map(lambda x: pd.Period(x,'Q').quarter))

#add month
df.insert(0, 'Month', df['Date Priced'].dt.month)

df.drop(['Date Priced'], axis=1, inplace=True)

In [162]:
#encode market
le = preprocessing.LabelEncoder()
df['Market'] = le.fit_transform(df['Market'])

In [163]:
#to one hot encoding
df = pd.concat([pd.get_dummies(df['Market']).add_prefix('MKT'), df], axis=1)
df = pd.concat([pd.get_dummies(df['Month']).add_prefix('M'), df], axis=1)
df = pd.concat([pd.get_dummies(df['Q']).add_prefix('Q'), df], axis=1)
df.drop(['Market', 'Month', 'Q'], axis=1, inplace=True)

In [164]:
#standardize
def standardize(df):
    return (df-df.mean())/df.std()

cols_to_standardize = ['Name Words', 'Name Length', 'Offer Amount', 'Price', 'Shares']
df[cols_to_standardize] = standardize(df[cols_to_standardize])

In [165]:
df.head(3)

Unnamed: 0_level_0,Q1,Q2,Q3,Q4,M1,M2,M3,M4,M5,M6,...,MKT12,Name Words,Name Length,Offer Amount,Price,Shares,1D,1W,1M,3M
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
WQNI,1,0,0,0,0,1,0,0,0,0,...,0,-1.183553,-1.6113,-0.284726,-0.340497,-0.386568,-0.129856,-0.296122,0.088818,-0.667432
BBGI,1,0,0,0,0,1,0,0,0,0,...,0,0.950208,0.701675,-0.206002,-0.038392,-0.261148,-0.058333,-0.133333,-0.175,-0.25
UTSI,1,0,0,0,0,0,1,0,0,0,...,0,-0.116672,0.31618,-0.123477,0.263712,-0.164788,0.512195,0.448171,1.042683,-0.134146


# Baseline

- No NLP or fancy models
- Raw IPO listing data

In [166]:
def run_ml_flow(df):
    '''Runs Machine Learning flow, returns evaluation DataFrame'''
    
    targets = ['1D', '1W', '1M', '3M']
    evaluation = pd.DataFrame(columns=['AUC', 'f1', 'log loss'])

    for target in targets:

        #split
        X_train, X_test, y_train, y_test = model_selection.train_test_split(df.values[:,:-4], df[target].map(lambda x: 1 if x > 0 else 0).values, test_size=0.2, shuffle=False)
        #print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

        #classifiers
        clfs = {
            'RF' : RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1),
            'LR' : LogisticRegression(random_state=1),
            #'Vote' : VotingClassifier(estimators=[('lr', LogisticRegression(random_state=1)), ('rf', RandomForestClassifier(n_estimators=50, max_depth=5, random_state=1))], voting='soft')
        }

        #fit
        for k, clf in clfs.items():
            clf.fit(X_train, y_train)

        #evaluate
        for k, clf in clfs.items():
            #print(k)
            predictions = clf.predict(X_test)
            probas = clf.predict_proba(X_test)

            auc = roc_auc_score(y_test, predictions)
            f1 = f1_score(y_test, predictions)
            ll = log_loss(y_test, probas)
            #print('AUC:', auc)
            #print('f1:', f1)
            #print('log loss:', ll)
            #print(confusion_matrix(y_test, predictions))
            #print('\n')

        #save
        evaluation.loc[target] = [auc, f1, ll]
    
    return evaluation.T

In [167]:
#as expected the results are poor
run_ml_flow(df)

Unnamed: 0,1D,1W,1M,3M
AUC,0.514154,0.468991,0.482075,0.483895
f1,0.417722,0.541284,0.635294,0.567568
log loss,0.697682,0.713322,0.719962,0.723912


# Integrating IPO Raw Data

- First attempt to add raw IPO filings data
- The art of Feature Enginnering

In [168]:
def get_sentiment_df(text):
    '''returns Sentiment Analysis'''
    sid = SentimentIntensityAnalyzer()
    ps_list = []
    sentences = sent_tokenize(text)
    
    for sentence in sentences:
        ps = sid.polarity_scores(sentence)
        ps['sent'] = sentence
        ps_list.append(ps)
    
    df = pd.DataFrame(ps_list)

    return df

In [169]:
def add_sentiment_features(df_sentiment, symbol):
    '''Adds Sentiment Snalysis features, returns DataFrame'''
    
    #filter frames
    df_tmp = pd.DataFrame()
    pos = df_sentiment[df_sentiment['pos'] > df_sentiment['pos'].quantile(0.95)]
    neg = df_sentiment[df_sentiment['neg'] > df_sentiment['neg'].quantile(0.95)]

    #sentences features
    df_tmp.at[symbol, 'Mean Sent Len'] = df_sentiment['sent'].str.len().mean()
    df_tmp.at[symbol, 'Sent Count'] = df_sentiment.shape[0]

    #pos sentiment
    df_tmp.at[symbol, 'Pos Mean Sent Len'] = pos['sent'].str.len().mean()
    df_tmp.at[symbol, 'Pos Sent Count'] = pos.shape[0]
    df_tmp = pd.concat([pd.DataFrame(pos.mean(), columns=[symbol]).T.add_prefix('Pos Sent Signal '), df_tmp], axis=1)

    #pos sentiment
    df_tmp.at[symbol, 'Neg Mean Sent Len'] = neg['sent'].str.len().mean()
    df_tmp.at[symbol, 'Neg Sent Count'] = neg.shape[0]
    df_tmp = pd.concat([pd.DataFrame(neg.mean(), columns=[symbol]).T.add_prefix('Neg Sent Signal '), df_tmp], axis=1)
    
    return df_tmp

In [None]:
#add sentiment features
df_sentiment_features = pd.DataFrame()
counter = 0    

for x in df.index:
    try:
        counter += 1
        print('\n( ' + str(counter) + ' / ' + str(df.shape[0]) + ' ) ' + x)

        #check if raw data is available
        file_name = "./Data/" + x + ".htm"
        if Path(file_name).is_file():
            #load raw IPO filing
            with open(file_name, "r", encoding="utf-8") as file:
                html = file.read()
                soup = BeautifulSoup(html,"html5lib")
                text = soup.get_text(strip=True)

                #get sentiment
                df_sentiment = get_sentiment_df(text)
                df_sentiment_features = pd.concat([df_sentiment_features, add_sentiment_features(df_sentiment, x)], axis=0)
        else:
            print('no S-1 for ', x)
    except Exception as e:
        print(x, e)


( 1 / 951 ) WQNI
no S-1 for  WQNI

( 2 / 951 ) BBGI

( 3 / 951 ) UTSI
no S-1 for  UTSI

( 4 / 951 ) SLAB
no S-1 for  SLAB

( 5 / 951 ) WBSN
no S-1 for  WBSN

( 6 / 951 ) ALTH
no S-1 for  ALTH

( 7 / 951 ) MET
no S-1 for  MET

( 8 / 951 ) LPSN
no S-1 for  LPSN

( 9 / 951 ) HSTM
no S-1 for  HSTM

( 10 / 951 ) PXLW
no S-1 for  PXLW

( 11 / 951 ) CYH
no S-1 for  CYH

( 12 / 951 ) QBAK
no S-1 for  QBAK

( 13 / 951 ) CRL
no S-1 for  CRL

( 14 / 951 ) ACLS
no S-1 for  ACLS

( 15 / 951 ) SOHU
no S-1 for  SOHU

( 16 / 951 ) SRTI
no S-1 for  SRTI

( 17 / 951 ) PTIE
no S-1 for  PTIE

( 18 / 951 ) SPRT
no S-1 for  SPRT

( 19 / 951 ) SMTX

( 20 / 951 ) ARNA
no S-1 for  ARNA

( 21 / 951 ) ILMN
no S-1 for  ILMN

( 22 / 951 ) CAMT
no S-1 for  CAMT

( 23 / 951 ) EVC

( 24 / 951 ) DGEN
no S-1 for  DGEN

( 25 / 951 ) ERMS
no S-1 for  ERMS

( 26 / 951 ) LTRX
no S-1 for  LTRX

( 27 / 951 ) MDCO
no S-1 for  MDCO

( 28 / 951 ) EQIX
no S-1 for  EQIX

( 29 / 951 ) DRRX
no S-1 for  DRRX

( 30 / 951 ) MON

( 31

In [None]:
df1 = pd.concat([standardize(df_sentiment_features), df], axis=1).dropna()

In [None]:
#now run ML flow with fancy features
run_ml_flow(df1)

### Load Raw IPO

In [267]:
ipo = {}

for x in glob.glob("./Data/*.htm")[:5]:
    with open(x, "r", encoding="utf-8") as file:
        html = file.read()
        soup = BeautifulSoup(html,"html5lib")
        text = soup.get_text(strip=True)
        ipo[x.split('\\')[1].split('.')[0]] = text

In [268]:
ipo.keys()

dict_keys(['AAC', 'AACC', 'AACQU', 'AAHC', 'AAOI'])

# Summarization

In [9]:
import requests

#text = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text
text = requests.get('https://www.sec.gov/Archives/edgar/data/1639920/000119312518063434/d494294df1.htm').text

print('Summary:')
print(summarize(text, ratio=0.01))

print('\nKeywords:')
print(keywords(text, ratio=0.01))

Summary:
In addition, we have issued ten beneficiary certificates per ordinary share held of record (excluding warrants, options, and RSUs, as applicable) to entities beneficially owned by our founders, Daniel Ek and Martin
Based on information available to us, the low and high sales price per ordinary share for such private transactions during the year ended December&nbsp;31, 2017 was $37.50 and $125.00, respectively, and during the period from January&nbsp;1, 2018
</P> <P STYLE="margin-top:6pt; margin-bottom:0pt; text-indent:4%; font-size:10pt; font-family:Times New Roman">Based on information provided by the NYSE, the opening public price of our ordinary shares on the
Based on such orders, the designated market maker will determine an opening price for our ordinary shares in consultation with a financial advisor pursuant to
securities, long term investment, Convertible Notes (as defined herein), and derivative financial instruments, which have been measured at fair value.
<P STYLE="

roman
including
include
included
includes
nbsp
font
users
user
certain
tax
taxes
taxing
taxed
shares
share
shared
sharing
company
companies
market
marketing
markets
marketers
marketability
musical
music
new
news
additional
addition
additions
provided
provide
providers
provides
providing
provider
spotify
advertising
advertisers
advertisements
advertisement
advertiser
data
service
services
serviced
generate
general
generated
generally
generating
generates
contents
content
based
base
bases
required
requires
requirements
require
requiring
requirement
rates
rate
rating
group
groups
grouped
financial
business
businesses


In [125]:
print(keywords(ipo['DBX'], ratio=0.01))

TypeError: decoding to str: need a bytes-like object, list found