In [159]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [160]:
PATH_TO_DATA = '/content/drive/MyDrive/dataset'

In [161]:
!pip install bert-for-tf2 
!pip install sentencepiece



In [162]:
try: 
  %tensorflow_version 2.x 
except Exception: 
  pass 
import tensorflow as tf 

import tensorflow_hub as hub 

from tensorflow.keras import layers 
import bert 
import pandas as pd
import re

In [163]:
data = pd.read_csv(PATH_TO_DATA + "/bloomberg_marked_df (2).csv") 
data.isnull().values.any() 
data.shape

(1912, 3)

In [164]:
def preprocess_text(sen): 
  # Removing html tags 
  sentence = remove_tags(sen) 

  # Remove punctuations and numbers 
  sentence = re.sub('[^a-zA-Z]', ' ', sentence) 

  # Single character removal 
  sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence) 

  # Removing multiple spaces 
  sentence = re.sub(r'\s+', ' ', sentence) 

  return sentence 

def remove_tags(text):
  TAG_RE = re.compile(r'<[^>]+>,.') 
  return TAG_RE.sub('', text) 

In [165]:
news = [] 
sentences = list(data['news']) 
for sen in sentences: 
  news.append(preprocess_text(sen)) 

In [166]:
data.head()

Unnamed: 0,mark,date,news
0,1,2021-07-22,India’s Environment Ministry has almost halved...
1,1,2021-07-22,Singapore’s home price growth slowed in the se...
2,1,2021-07-22,"What keeps breaking the buck, is engaged in fi..."
3,1,2021-07-22,Malaysia has confirmed one case of an individu...
4,1,2021-07-22,"Xponential Fitness Inc., the franchise owner o..."


In [167]:
BertTokenizer = bert.bert_tokenization.FullTokenizer 
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", 
trainable=False) 
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy() 
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy() 
tokenizer = BertTokenizer(vocabulary_file, to_lower_case) 

In [168]:
def tokenize_news(text_news): 
  return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_news)) 

In [169]:
tokenized_news = [tokenize_news(new) for new in news] 

In [170]:
news_tokenized = data.assign(tokenized = tokenized_news)

In [171]:
print(news_tokenized)

      mark  ...                                          tokenized
0        1  ...  [2634, 4044, 3757, 2038, 2471, 11085, 7178, 19...
1        1  ...  [5264, 2188, 3976, 3930, 9784, 1999, 1996, 211...
2        1  ...  [2054, 7906, 4911, 1996, 10131, 2003, 5117, 19...
3        1  ...  [6027, 2038, 4484, 2028, 2553, 1997, 2019, 326...
4        1  ...  [26726, 5643, 19909, 10516, 4297, 1996, 6329, ...
...    ...  ...                                                ...
1907     0  ...  [1996, 3484, 1997, 7273, 2015, 2024, 2025, 851...
1908     0  ...  [3696, 2039, 2005, 1996, 2047, 4610, 3679, 171...
1909     0  ...  [14455, 12030, 3062, 1996, 2087, 1999, 2698, 2...
1910     0  ...  [3696, 2039, 2005, 1996, 2047, 4610, 3679, 171...
1911     0  ...  [3582, 2149, 19888, 2080, 2005, 2256, 2440, 63...

[1912 rows x 4 columns]


In [172]:
news_tokenized.to_csv(PATH_TO_DATA +'/bloomberg_tokenized.csv')