# Sentiment Analysis
- First attempt to add raw IPO filings data
- The art of Feature Engineering
- Add Sentiment Analysis features
 - Tokenize sentences
 - Use the mean scores of the positive and negative sentences above the 95th percentile

In [6]:
#core
%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd

import glob
import nasdaq
import ml
from bs4 import BeautifulSoup
from pathlib import Path

#NLP
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.corpus import reuters
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

from gensim.summarization import summarize
from gensim.summarization import keywords

# Load Baseline

In [7]:
#load the baseline dataset, indexed by the 'Symbol' column
#(no sorting is applied here; rows keep whatever order the CSV has)
df = pd.read_csv('1 baseline.csv', index_col='Symbol')

# Integrating IPO Raw Data

In [8]:
def get_sentiment_df(text):
    '''Score every sentence of ``text`` with the VADER SentimentIntensityAnalyzer.

    text: plain-text document; it is split into sentences with sent_tokenize.

    Returns a DataFrame with one row per sentence and the columns
    'neg', 'neu', 'pos', 'compound' (the polarity scores) and 'sent'
    (the sentence itself). A text without any sentence yields an empty
    DataFrame that still carries these columns.
    '''
    sid = SentimentIntensityAnalyzer()

    #one record per sentence: the polarity scores plus the sentence text
    records = []
    for sentence in sent_tokenize(text):
        scores = sid.polarity_scores(sentence)
        scores['sent'] = sentence
        records.append(scores)

    #explicit columns keep the schema stable even when there are no records
    return pd.DataFrame(records, columns=['neg', 'neu', 'pos', 'compound', 'sent'])

In [9]:
def add_sentiment_features(df_sentiment, symbol, quantile=0.95):
    '''Adds Sentiment Analysis features, returns a single-row DataFrame.

    df_sentiment: per-sentence frame with the numeric columns 'compound',
                  'neg', 'neu', 'pos' and the text column 'sent'.
    symbol:       label used as the index of the returned row.
    quantile:     cut-off in [0, 1]; a sentence counts as strongly positive
                  (negative) when its 'pos' ('neg') score is strictly above
                  this quantile of the column. Defaults to 0.95.

    Returns a DataFrame indexed by ``symbol`` with float columns, in this
    order: mean scores of the negative subset ('Neg Sent Signal ...'), mean
    scores of the positive subset ('Pos Sent Signal ...'), overall sentence
    length/count, then length/count of the positive and negative subsets.
    Means over an empty subset are NaN.
    '''
    score_cols = ['compound', 'neg', 'neu', 'pos']

    #filter frames: keep only sentences strictly above the cut-off
    pos = df_sentiment[df_sentiment['pos'] > df_sentiment['pos'].quantile(quantile)]
    neg = df_sentiment[df_sentiment['neg'] > df_sentiment['neg'].quantile(quantile)]

    #build the whole row as one record instead of growing a frame cell by cell
    features = {}

    #mean sentiment scores of the extreme sentences (neg block first, then pos)
    for label, subset in (('Neg', neg), ('Pos', pos)):
        for col, value in subset[score_cols].mean().items():
            features[label + ' Sent Signal ' + col] = value

    #sentences features
    features['Mean Sent Len'] = df_sentiment['sent'].str.len().mean()
    features['Sent Count'] = float(df_sentiment.shape[0])

    #pos sentiment
    features['Pos Mean Sent Len'] = pos['sent'].str.len().mean()
    features['Pos Sent Count'] = float(pos.shape[0])

    #neg sentiment
    features['Neg Mean Sent Len'] = neg['sent'].str.len().mean()
    features['Neg Sent Count'] = float(neg.shape[0])

    return pd.DataFrame(features, index=[symbol])

In [289]:
#add sentiment features
#collect one single-row frame per symbol and concatenate once after the loop
#(calling pd.concat inside the loop re-copies all previous rows every iteration)
feature_frames = []
total = df.shape[0]

for counter, x in enumerate(df.index, start=1):
    try:
        print('\n( ' + str(counter) + ' / ' + str(total) + ' ) ' + x)

        #check if raw data is available
        file_name = "./Data/" + x + ".htm"
        if Path(file_name).is_file():
            #load raw IPO filing; the handle is only needed for the read itself
            with open(file_name, "r", encoding="utf-8") as file:
                html = file.read()
            soup = BeautifulSoup(html,"html5lib")
            text = soup.get_text(strip=True)

            #get sentiment
            df_sentiment = get_sentiment_df(text)
            feature_frames.append(add_sentiment_features(df_sentiment, x))
        else:
            print('no S-1 for ', x)
    except Exception as e:
        #best effort: report the failing symbol and carry on with the next one
        print(x, e)

#pd.concat rejects an empty list, so fall back to an empty frame when nothing was parsed
df_sentiment_features = pd.concat(feature_frames, axis=0) if feature_frames else pd.DataFrame()


( 1 / 951 ) WQNI
no S-1 for  WQNI

( 2 / 951 ) BBGI

( 3 / 951 ) UTSI
no S-1 for  UTSI

( 4 / 951 ) SLAB
no S-1 for  SLAB

( 5 / 951 ) WBSN
no S-1 for  WBSN

( 6 / 951 ) ALTH
no S-1 for  ALTH

( 7 / 951 ) MET
no S-1 for  MET

( 8 / 951 ) LPSN
no S-1 for  LPSN

( 9 / 951 ) HSTM
no S-1 for  HSTM

( 10 / 951 ) PXLW
no S-1 for  PXLW

( 11 / 951 ) CYH
no S-1 for  CYH

( 12 / 951 ) QBAK
no S-1 for  QBAK

( 13 / 951 ) CRL
no S-1 for  CRL

( 14 / 951 ) ACLS
no S-1 for  ACLS

( 15 / 951 ) SOHU
no S-1 for  SOHU

( 16 / 951 ) SRTI
no S-1 for  SRTI

( 17 / 951 ) PTIE
no S-1 for  PTIE

( 18 / 951 ) SPRT
no S-1 for  SPRT

( 19 / 951 ) SMTX

( 20 / 951 ) ARNA
no S-1 for  ARNA

( 21 / 951 ) ILMN
no S-1 for  ILMN

( 22 / 951 ) CAMT
no S-1 for  CAMT

( 23 / 951 ) EVC

( 24 / 951 ) DGEN
no S-1 for  DGEN

( 25 / 951 ) ERMS
no S-1 for  ERMS

( 26 / 951 ) LTRX
no S-1 for  LTRX

( 27 / 951 ) MDCO
no S-1 for  MDCO

( 28 / 951 ) EQIX
no S-1 for  EQIX

( 29 / 951 ) DRRX
no S-1 for  DRRX

( 30 / 951 ) MON

( 31


( 334 / 951 ) WSR
no S-1 for  WSR

( 335 / 951 ) COR

( 336 / 951 ) CCIH
no S-1 for  CCIH

( 337 / 951 ) STND

( 338 / 951 ) DQ
no S-1 for  DQ

( 339 / 951 ) TOWR

( 340 / 951 ) BBRG

( 341 / 951 ) VRA

( 342 / 951 ) GLTR

( 343 / 951 ) PACB

( 344 / 951 ) PSLV
no S-1 for  PSLV

( 345 / 951 ) SODA
no S-1 for  SODA

( 346 / 951 ) CMRE
no S-1 for  CMRE

( 347 / 951 ) PRMW

( 348 / 951 ) NOAH
no S-1 for  NOAH

( 349 / 951 ) LPLA

( 350 / 951 ) GM

( 351 / 951 ) WITE

( 352 / 951 ) TRGP

( 353 / 951 ) GCAP

( 354 / 951 ) RNET

( 355 / 951 ) WD

( 356 / 951 ) OSN
no S-1 for  OSN

( 357 / 951 ) AAT

( 358 / 951 ) BKU

( 359 / 951 ) INXN
no S-1 for  INXN

( 360 / 951 ) BCDS
no S-1 for  BCDS

( 361 / 951 ) NAGS

( 362 / 951 ) NPTN
no S-1 for  NPTN

( 363 / 951 ) PCRX

( 364 / 951 ) ECYT

( 365 / 951 ) BGMD

( 366 / 951 ) GEVO

( 367 / 951 ) INN

( 368 / 951 ) ACRX

( 369 / 951 ) CRUD

( 370 / 951 ) HCA
no S-1 for  HCA

( 371 / 951 ) MX

( 372 / 951 ) CSOD

( 373 / 951 ) SREV

( 374 / 951 ) AP


( 685 / 951 ) FGEN

( 686 / 951 ) LMRK

( 687 / 951 ) SKIS

( 688 / 951 ) HSGX

( 689 / 951 ) LC
no S-1 for  LC

( 690 / 951 ) NEWR

( 691 / 951 ) WK

( 692 / 951 ) ONDK

( 693 / 951 ) BLCM

( 694 / 951 ) ZSAN

( 695 / 951 ) ASND
no S-1 for  ASND

( 696 / 951 ) TCON

( 697 / 951 ) AVGR

( 698 / 951 ) SHAK

( 699 / 951 ) ONCE

( 700 / 951 ) DEA

( 701 / 951 ) INOV

( 702 / 951 ) BLPH

( 703 / 951 ) AJX

( 704 / 951 ) TONS

( 705 / 951 ) SMMT
no S-1 for  SMMT

( 706 / 951 ) NCOM
no S-1 for  NCOM

( 707 / 951 ) STDY

( 708 / 951 ) TANH
no S-1 for  TANH

( 709 / 951 ) JCAP

( 710 / 951 ) KRNT
no S-1 for  KRNT

( 711 / 951 ) ADRO

( 712 / 951 ) VIRT

( 713 / 951 ) EVA

( 714 / 951 ) BPMC

( 715 / 951 ) OPGN

( 716 / 951 ) HTGM

( 717 / 951 ) ADAP
no S-1 for  ADAP

( 718 / 951 ) TEGP

( 719 / 951 ) COLL

( 720 / 951 ) BOJA

( 721 / 951 ) RKDA

( 722 / 951 ) WING

( 723 / 951 ) CYAD
no S-1 for  CYAD

( 724 / 951 ) GNRT

( 725 / 951 ) GKOS

( 726 / 951 ) MCRN

( 727 / 951 ) LNTH

( 728 / 951 

In [292]:
#join the standardized sentiment features with the baseline column-wise
#(rows are aligned on the index), then keep only rows without missing values
df1 = (
    pd.concat([standardize(df_sentiment_features), df], axis=1)
    .dropna()
)

In [294]:
#persist the merged feature set to disk
#NOTE(review): index=False leaves the index out of the file; the baseline is loaded
#with index_col='Symbol', so the symbols are presumably not written - confirm this is intended
df1.to_csv('1 sentiment analysis.csv', index=False)

In [300]:
#now run ML flow with sentiment features
#NOTE(review): run_ml_flow is not defined in the visible cells (only `import ml` is) -
#confirm where it comes from so the notebook survives Restart & Run All
run_ml_flow(df1)

Unnamed: 0,1D,1W,1M,3M
AUC,0.531046,0.526413,0.496314,0.50841
f1,0.606061,0.610778,0.638298,0.677083
log loss,0.707982,0.740833,0.692042,0.667049


### Load Raw IPO

In [267]:
#plain text of a small sample of raw filings, keyed by the file name without extension
ipo = {}

#sorted() makes the five-file sample deterministic; glob order depends on the OS
for x in sorted(glob.glob("./Data/*.htm"))[:5]:
    with open(x, "r", encoding="utf-8") as file:
        html = file.read()
    soup = BeautifulSoup(html,"html5lib")
    text = soup.get_text(strip=True)
    #Path.stem works with either path separator, unlike splitting on a backslash
    ipo[Path(x).stem] = text

In [268]:
#show which filings were loaded into the sample
ipo.keys()

dict_keys(['AAC', 'AACC', 'AACQU', 'AAHC', 'AAOI'])

# Summarization

In [301]:
import requests
import urllib.request

#alternative sample input:
#response = urllib.request.urlopen('http://rare-technologies.com/the_matrix_synopsis.txt')
#NOTE(review): sec.gov may reject automated requests that do not declare a User-Agent - confirm
#the context manager closes the HTTP response as soon as the body has been read
with urllib.request.urlopen('https://www.sec.gov/Archives/edgar/data/1467623/000119312518055809/d451946ds1.htm') as response:
    html = response.read()
soup = BeautifulSoup(html,"html5lib")
text = soup.get_text(strip=True)

#NOTE(review): summarize/keywords come from gensim.summarization, which newer gensim
#releases no longer ship - confirm the pinned gensim version
print('Summary:')
print(summarize(text, word_count=1000))

print('\nKeywords:')
print(keywords(text, words=100))

Summary:
These risks include, but are not limited to, the following:•Our business depends on our ability to retain and upgrade paying users, and any decline in renewals or upgrades could adversely affect our future results of operations.•Our future growth could be harmed if we fail to attract new users or convert registered users to paying users.•Our revenue growth rate has declined in recent periods and may continue to slow in the future.•We have a history of net losses, we anticipate increasing expenses in the future, and we may not be able to achieve or maintain profitability.•Our business could be damaged, and we could be subject to liability if there is any unauthorized access to our data or our users’ content, including through privacy and data security breaches.•Our business could be harmed by any significant disruption of service on our platform or loss of content.•We generate revenue from sales of subscriptions to our platform, and any decline in demand for our platform or for