In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import logging

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

logging.basicConfig(level=logging.INFO)

In [1]:
!pip install sentencepiece ipython-autotime gpustat

In [None]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
import tensorflow_hub as hub
import tokenization
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'.
INFO:absl:Downloaded https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2, Total size: 421.50MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'.


In [None]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%cd gdrive/MyDrive/News+LPReports/News-Classifiers

/content/gdrive/MyDrive/News+LPReports/News-Classifiers


# Load data

In [None]:
news = pd.read_excel('news_tagged_130521.xlsx', sheet_name="News Articles")
news.head(2)

Unnamed: 0,ID,article_number,time_period,company_name,article,revenue_yj,1. revenue,product_yj,2. pdt_dev,market_yj,3. mkt_dev,partnership_yj,4. partnership,mgmt_yj,5. mgt change,non-mgmt_yj,6. non-mgt change,clinical_yj,7. clinical,8. lawsuit,fundraising_yj,9. fundraising,acquisition_yj,10. acquisition,11. Competitors' fundraising,entity (WIP)
0,1,0.0,1.0,17Live,"Pioneering live streaming app MeMe Live, which...",,0.0,,0.0,,1.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,,0.0,,1.0
1,2,0.0,1.0,17Live,With the Covid-19 pandemic triggering some pro...,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,,0.0,,1.0


In [None]:
df = news.iloc[:, 0:5]
df.head()

Unnamed: 0,ID,article_number,time_period,company_name,article
0,1,0.0,1.0,17Live,"Pioneering live streaming app MeMe Live, which..."
1,2,0.0,1.0,17Live,With the Covid-19 pandemic triggering some pro...
2,3,0.0,1.0,17Live,"In the recent past, the live streaming platfor..."
3,4,0.0,1.0,17Live,It brings people a joyful experience by creati...
4,5,0.0,1.0,17Live,In order to ensure the sustainability of the m...


In [None]:
df.shape

(41484, 5)

# BERT functions

In [None]:
# create encodings for BERT
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [None]:
# build BERT model with neural network classifer
def build_model(bert_layer, max_len=512):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(2, activation='softmax')(net)
    
    # activation change to sigmoid for multilabel
    # loss is bce loss
    # metrics accuracy

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(lr=1e-5), loss='categorical_crossentropy', metrics=['categorical_accuracy'])
    
    return model

In [None]:
# encode predictions into labels
# first element is for 0
# second element is for 1
def encode(row):
  i = row[0]
  if round(i) == 0:
    val = 1
  else:
    val = 0
  return val

# Run predictions

In [None]:
%load_ext autotime
# display time for each cell execution

time: 70.2 µs (started: 2021-06-01 00:53:27 +00:00)


In [None]:
max_len = 150 # of the sentence
input = bert_encode(df.article.values, tokenizer, max_len=max_len)

model = build_model(bert_layer, max_len=max_len)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 150)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 150)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [None]:
signals = ['revenue','product','market','partnership','mgmt','clinical','fundraising']
paths = ['models/news_rev.h5', 'models/news_pdt.h5', 'models/news_mkt.h5', 'models/news_partnership.h5', 
          'models/news_mgmt_combine.h5', 'models/news_clinical.h5', 'models/news_fundraising.h5']

time: 1.48 ms (started: 2021-05-31 08:43:05 +00:00)


In [None]:
for idx, signal in enumerate(signals):
  begin_time = datetime.datetime.now() # track run-time

  # get the path from the appropriate index in corresponding list
  path = paths[idx]

  # load and predict with each set of weights
  model.load_weights(path)
  pred = model.predict(input)

  pred_list = pred.tolist()
  # save predictions back to dataframe under the relevant column
  df[signal] = pred_list
  # encode into 1 or 0
  df[signal] = df[signal].apply(lambda x: encode(x))
  print(str(signal) + ' predictions have been completed!')
  print(datetime.datetime.now() - begin_time)

revenue predictions have been completed!
0:08:23.240384
product predictions have been completed!
0:08:29.511647
market predictions have been completed!
0:08:27.121358
partnership predictions have been completed!
0:07:50.429723
mgmt predictions have been completed!
0:07:50.973915
clinical predictions have been completed!
0:08:26.580262
fundraising predictions have been completed!
0:07:51.033583
time: 57min 18s (started: 2021-05-31 08:47:44 +00:00)


In [None]:
df.head()

Unnamed: 0,ID,article_number,time_period,company_name,article,revenue,product,market,partnership,mgmt,clinical,fundraising
0,1,0.0,1.0,17Live,"Pioneering live streaming app MeMe Live, which...",0,0,1,0,0,0,0
1,2,0.0,1.0,17Live,With the Covid-19 pandemic triggering some pro...,0,0,1,0,0,0,0
2,3,0.0,1.0,17Live,"In the recent past, the live streaming platfor...",0,0,0,0,0,0,0
3,4,0.0,1.0,17Live,It brings people a joyful experience by creati...,0,0,0,0,0,0,0
4,5,0.0,1.0,17Live,In order to ensure the sustainability of the m...,0,0,1,1,0,0,0


time: 32.5 ms (started: 2021-05-31 09:45:04 +00:00)


In [None]:
df.to_csv('full_news_predictions.csv')

time: 337 ms (started: 2021-05-31 09:45:04 +00:00)


In [None]:
# # check GPU RAM usage
# !gpustat --watch

model.load_weights('models/news_rev.h5')
test_pred = model.predict(input)
list2 = test_pred.tolist()
df['growth_pred'] = list2
df['growth_pred'] = df['growth_pred'].apply(lambda x: encode(x))


time: 7min 39s (started: 2021-06-01 00:54:06 +00:00)


In [None]:
df.head()

Unnamed: 0,ID,article_number,time_period,company_name,article,growth_pred
0,1,0.0,1.0,17Live,"Pioneering live streaming app MeMe Live, which...",0
1,2,0.0,1.0,17Live,With the Covid-19 pandemic triggering some pro...,0
2,3,0.0,1.0,17Live,"In the recent past, the live streaming platfor...",0
3,4,0.0,1.0,17Live,It brings people a joyful experience by creati...,0
4,5,0.0,1.0,17Live,In order to ensure the sustainability of the m...,0


time: 17.5 ms (started: 2021-06-01 01:02:05 +00:00)


# Min-max normalization

In [None]:
df = pd.read_csv('100621_signal_predictions.csv', index_col=0)
df.head(3)

Unnamed: 0,Search Query,Date,article,revenue,product,market,partnership,mgmt,clinical,fundraising
0,Binance,2021-05-27 22:50:00+00:00,It is already impossible for a single entity t...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Binance,2021-05-27 22:50:00+00:00,"“I don’t think anyone can shut it down now, gi...",0.0,0.0,0.47074,0.0,0.0,0.0,0.0
2,Binance,2021-05-27 22:50:00+00:00,“You can’t erase that.” Fighting off bitcoin a...,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
signals = ['revenue','product','market','partnership','mgmt','clinical','fundraising']
for signal in signals:
  df[signal] = (df[signal] - df[signal].min()) / (df[signal].max() - df[signal].min()) 
df.head()

Unnamed: 0,Search Query,Date,article,revenue,product,market,partnership,mgmt,clinical,fundraising
0,Binance,2021-05-27 22:50:00+00:00,It is already impossible for a single entity t...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Binance,2021-05-27 22:50:00+00:00,"“I don’t think anyone can shut it down now, gi...",0.0,0.0,0.942239,0.0,0.0,0.0,0.0
2,Binance,2021-05-27 22:50:00+00:00,“You can’t erase that.” Fighting off bitcoin a...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Binance,2021-05-27 22:50:00+00:00,Cryptocurrencies are not here to kill traditio...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Binance,2021-05-27 22:50:00+00:00,“I don’t view them as competing with regulator...,0.0,0.0,0.843162,0.0,0.0,0.0,0.0


In [None]:
front_df = df.loc[:, 'Search Query': 'article']
back_df = df.loc[:, 'revenue':'fundraising']

In [None]:
m = np.zeros_like(back_df.values)
m[np.arange(len(df)), back_df.values.argmax(1)] = 1

df1 = pd.DataFrame(m, columns = back_df.columns).astype(int)
df1.head()

Unnamed: 0,revenue,product,market,partnership,mgmt,clinical,fundraising
0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0


In [None]:
final_df = pd.concat([front_df, df1], axis=1)
final_df.head()

Unnamed: 0,Search Query,Date,article,revenue,product,market,partnership,mgmt,clinical,fundraising
0,Binance,2021-05-27 22:50:00+00:00,It is already impossible for a single entity t...,1,0,0,0,0,0,0
1,Binance,2021-05-27 22:50:00+00:00,"“I don’t think anyone can shut it down now, gi...",0,0,1,0,0,0,0
2,Binance,2021-05-27 22:50:00+00:00,“You can’t erase that.” Fighting off bitcoin a...,1,0,0,0,0,0,0
3,Binance,2021-05-27 22:50:00+00:00,Cryptocurrencies are not here to kill traditio...,1,0,0,0,0,0,0
4,Binance,2021-05-27 22:50:00+00:00,“I don’t view them as competing with regulator...,0,0,1,0,0,0,0
