In [None]:
### Import libraries ###
import warnings
# Silence every Python warning for the whole session (deprecation notices included).
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import torch
# Uncomment the next line if `transformers` is not installed in this runtime.
#! pip install transformers
import transformers
from transformers import BertTokenizer, BertModel
import pickle

Upload data from local machine

In [None]:
# Upload the training CSV from the local machine (Colab only) and load it into `df`.
import io

from google.colab import files

EXPECTED_UPLOAD_NAME = 'Cleaned_Train.csv'

# Opens the browser file picker; the result maps each uploaded file name to its content.
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Prefer the expected file name, but accept a single upload under any other name
# (e.g. a renamed copy) instead of failing with a bare KeyError.
if EXPECTED_UPLOAD_NAME in uploaded:
  csv_name = EXPECTED_UPLOAD_NAME
elif len(uploaded) == 1:
  csv_name = next(iter(uploaded))
else:
  raise ValueError(
      'Expected an upload named "{expected}" or exactly one uploaded file, got: {names}'.format(
          expected=EXPECTED_UPLOAD_NAME, names=sorted(uploaded)))

# Read in data as df
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))

Saving Cleaned_Train.csv to Cleaned_Train (1).csv
User uploaded file "Cleaned_Train.csv" with length 852671 bytes


Alternatively, we can read the data directly from Google Drive.

In [None]:
# Alternative to the upload cell: read the training data straight from Google Drive.
# Drive has to be mounted before the path below exists; the rest of the notebook only
# mounts it in later cells, so mount here to keep this cell working on a fresh runtime.
from google.colab import drive
drive.mount('/content/drive')

train_path='/content/drive/MyDrive/MSCA Machine Learning/Data/cleaned_train.csv'
df=pd.read_csv(train_path)

In [None]:
# Cast the text column to str so every entry handed to the tokenizer is a string.
# NOTE: astype('str') turns missing values into the literal string 'nan'.
df['text']=df['text'].astype('str')
# Extract the text input as a plain Python list
text=df['text'].tolist()
# Small sample (first 10 texts) consumed by the step-by-step demo cells further down.
# It has to be defined here: those cells reference `test`, and leaving this line
# commented out makes them fail with a NameError on a fresh kernel.
test=text[:10]

In [None]:
#initialkize tokenizer from base BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          output_hidden_states = True)
#initialize BERT model instance
model = BertModel.from_pretrained('bert-base-uncased', 
                                  return_dict=True)#return all hidden layers

In [None]:
#use to eval to ensure the model is only feeding forward
# Put the model in evaluation mode so it only feeds forward (dropout layers are disabled);
# as the last expression of the cell, the returned module is also displayed.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

Below is a step-by-step demo of the embedding creation on a relatively small sample of the training input (the first 10 sentences).

In [None]:
#batch tokenize test input
# Tokenize the demo sample as one padded, truncated batch of PyTorch tensors
test_batch = tokenizer(test, padding=True, truncation=True, return_tensors="pt")

# Forward pass only: gradient tracking is switched off inside this context
with torch.no_grad():
    outputs_batch = model(**test_batch)

In [None]:
# Access the per-layer hidden states from the model output
hidden_states_batch=outputs_batch.hidden_states
if hidden_states_batch is None:
    # Fail with an actionable message instead of an obscure indexing error
    raise RuntimeError("Model output has no hidden states; load the model with output_hidden_states=True")
# Use only the second-to-last hidden layer
tokens_batch=hidden_states_batch[-2]
# Size corresponds to batch_size, number_of_tokens, and feature_number
tokens_batch.size()
# Sentence embedding = mean of the token features over the real tokens only.
# The batch was padded to a common length, so padding positions are zeroed out via the
# attention mask and the sum is divided by the number of real tokens per sentence.
token_mask=test_batch['attention_mask'].unsqueeze(-1).to(tokens_batch.dtype)
token_counts=token_mask.sum(dim=1, keepdim=True).clamp(min=1)  # guard against division by zero
embeddings_batch=(tokens_batch*token_mask).sum(dim=1, keepdim=True)/token_counts
# Test embedding size (still carries the singleton token dimension)
print ('test batch embedding size is:', embeddings_batch.size())
# Squeeze the tensor to eliminate the "1"
embeddings_batch=torch.squeeze(embeddings_batch, dim=1)
# Test embedding size after squeezing
print ('test batch embedding size is:', embeddings_batch.size())

test batch embedding size is: torch.Size([10, 1, 768])
test batch embedding size is: torch.Size([10, 768])


## The cells below will likely crash the GPU runtime on Google Colab, so I suggest running them on the MSCA GPU instead

In [None]:
#batch tokenize test input
# Tokenize the complete training text as one padded, truncated batch of PyTorch tensors
full_batch = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# Forward pass only: gradient tracking is switched off inside this context
with torch.no_grad():
    output_full = model(**full_batch)

In [None]:
# Access the per-layer hidden states from the model output
hidden_states_full=output_full.hidden_states
if hidden_states_full is None:
    # Fail with an actionable message instead of an obscure indexing error
    raise RuntimeError("Model output has no hidden states; load the model with output_hidden_states=True")
# Use only the second-to-last hidden layer
tokens_batch_full=hidden_states_full[-2]
# Sentence embedding = mean of the token features over the real tokens only.
# The batch was padded to a common length, so padding positions are zeroed out via the
# attention mask and the sum is divided by the number of real tokens per sentence.
token_mask_full=full_batch['attention_mask'].unsqueeze(-1).to(tokens_batch_full.dtype)
token_counts_full=token_mask_full.sum(dim=1, keepdim=True).clamp(min=1)  # guard against division by zero
embeddings_full=(tokens_batch_full*token_mask_full).sum(dim=1, keepdim=True)/token_counts_full
# Full embedding size (still carries the singleton token dimension)
print ('full batch embedding size is:', embeddings_full.size())
# Squeeze the tensor to eliminate the "1"
embeddings_full=torch.squeeze(embeddings_full, dim=1)
# Full embedding size after squeezing
print ('full batch embedding size is:', embeddings_full.size())

## Bert-as-a-service 

This is what we ended up using to actually produce the embeddings: essentially an automated and more efficient way to carry out the steps above.

In [None]:
!pip install bert-serving-client
!pip install -U bert-serving-server[http]
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip
%tensorflow_version 1.x
!nohup bert-serving-start -model_dir=./uncased_L-12_H-768_A-12 > out.file 2>&1 &

Collecting bert-serving-client
  Downloading https://files.pythonhosted.org/packages/1f/09/aae1405378a848b2e87769ad89a43d6d71978c4e15534ca48e82e723a72f/bert_serving_client-1.10.0-py2.py3-none-any.whl
Installing collected packages: bert-serving-client
Successfully installed bert-serving-client-1.10.0
Collecting bert-serving-server[http]
[?25l  Downloading https://files.pythonhosted.org/packages/b0/bd/cab677bbd0c5fb08b72e468371d2bca6ed9507785739b4656b0b5470d90b/bert_serving_server-1.10.0-py3-none-any.whl (61kB)
[K     |████████████████████████████████| 71kB 7.8MB/s 
Collecting GPUtil>=1.3.0
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Collecting flask-compress; extra == "http"
  Downloading https://files.pythonhosted.org/packages/b2/7a/9c4641f975fb9daaf945dc39da6a52fd5693ab3bbc2d53780eab3b5106f4/Flask_Compress-1.8.0-py3-none-any.whl
Collecting flask-json; extra == "http"
  Downloading https:

In [None]:
#create BertClient
# Create the client that talks to the bert-serving server started in the previous cell
from bert_serving.client import BertClient
# NOTE(review): presumably this waits until the server is reachable; check out.file if it hangs
bc = BertClient()

In [None]:
#create embeddings on train data
# Create embeddings for the training texts through the bert-serving client
emb=bc.encode(text)

In [None]:
#saving the embeddings in pickle
# Persist the raw embeddings to a pickle file in the working directory
import pickle
with open('training_embeddings.pkl', 'wb') as pkl_out:
    pickle.dump(emb, pkl_out)

In [None]:
#uploading the embeddings to the 'data' google drive folder
# Upload the pickled embeddings to the 'data' Google Drive folder, then mount Drive
from google.colab import auth
from google.colab import drive
from oauth2client.client import GoogleCredentials
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# Step 1: authenticate this Colab user and build the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# Kept under its own name so it is not mixed up with google.colab's `drive` module
gdrive_client = GoogleDrive(gauth)

# Step 2: create the Drive file entry inside the target folder and upload the local pickle.
folder_id = '1WBIJA0XTNaCrSxqH8Jc7I29FaNiwpH1u'
file = gdrive_client.CreateFile({'parents': [{u'id': folder_id}]})
file.SetContentFile('training_embeddings.pkl')
file.Upload()

# Step 3: mount Drive so the uploaded embeddings can be read under /content/drive.
drive.mount("/content/drive")

In [None]:
# Load the embeddings-with-target DataFrame back from the mounted Drive.
# NOTE(review): this pickle appears to be the one written and uploaded by a cell further
# down in this notebook; on a fresh setup that cell has to run first.
# NOTE: unpickling can execute arbitrary code, so only load files this project produced.
import pickle
path="/content/drive/MyDrive/MSCA Machine Learning/Data/training_embeddings_with_target.pkl"
# The context manager closes the file handle once the object is loaded
with open(path,'rb') as infile:
    df_emb=pickle.load(infile)

In [None]:
# Preview the first rows of the embeddings DataFrame (embedding columns plus `target`)
df_emb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,target
0,0.174006,0.51411,0.282695,-0.865944,-0.26295,0.222027,0.995218,0.638477,-0.662021,-0.256679,0.340827,-0.14874,-0.507322,0.185809,-0.69507,0.263941,-0.073811,0.211314,-0.320379,0.282107,0.190081,0.204111,0.01533,-0.08421,0.191036,0.191454,-0.310451,0.44101,-0.497746,-0.30581,0.202677,1.308998,0.080589,-0.039485,-0.503274,0.056791,0.226311,0.082208,-0.4029,1.006173,...,-0.235238,-0.388879,0.073767,0.008587,-0.539857,-0.047739,-0.010281,-0.419728,0.12905,0.420352,-0.248818,0.352863,0.034175,-0.115802,-0.190596,-0.017529,-0.133006,0.238757,-0.648669,0.179064,0.249245,0.387309,-0.379289,0.36922,-0.478311,-0.315277,0.202589,0.094307,0.506694,-0.038249,-0.348296,0.509817,0.030843,0.014934,0.092533,0.325911,-0.224219,0.322711,-0.324294,1
1,-0.083791,-0.074988,-0.035012,-0.029141,0.765842,-0.028787,0.210944,0.560736,-0.50872,-0.078661,0.015503,-0.240076,0.0303,0.505167,-0.771939,0.769955,-0.18915,0.045849,0.408051,-0.184479,-0.426469,-0.456381,0.211968,0.546523,0.081309,0.353223,0.061038,0.267759,0.023705,0.467865,-0.119265,0.135602,0.667777,-0.682075,0.025304,-0.40465,0.276614,0.250135,-0.095028,-0.023242,...,-0.760515,0.066294,0.237353,-0.00066,-1.157665,-0.138523,-0.487236,0.171328,0.237592,-0.005984,0.211551,-0.029149,0.314136,0.258066,-0.056516,-0.055411,0.407631,0.270228,-0.139126,-0.197396,-0.090131,0.154644,-0.219606,-0.07014,-0.303984,0.400976,-0.304344,-0.3431,-0.504064,-0.297995,0.050977,-0.087849,-0.461415,0.217056,-0.090924,0.25241,-0.205355,-0.545921,0.009871,1
2,-0.026818,0.038434,0.296934,-0.365698,0.76176,-0.738163,0.362888,0.261955,-0.415269,-0.113452,-0.358875,-0.566275,-0.225447,0.116963,-0.295379,0.501628,-0.085105,0.161736,-0.193539,0.16183,0.297279,-0.246689,-0.638786,-0.089641,0.471771,-0.113421,0.154899,-0.536939,-0.5278,-0.207359,0.385253,0.422704,-0.03432,-0.348092,0.204914,0.362816,-0.023397,0.142115,-0.391459,0.440117,...,-0.578197,0.18063,0.137681,-0.43909,-1.29447,0.122669,-0.545837,0.472996,0.241338,0.45699,0.111797,0.166603,-0.260541,0.13119,-0.260267,-0.198128,0.329539,0.036097,0.547372,-0.036892,0.041762,0.284138,0.193216,0.37669,0.173393,-0.182035,-0.321835,-0.256236,-0.34241,-0.127833,-0.430574,0.259555,-0.156196,0.047626,-0.047696,-0.631763,-0.277038,-0.518288,0.202233,1
3,0.070891,-0.082333,0.146175,-0.132793,0.334057,-0.225144,0.091203,0.108324,-0.622823,0.169655,0.05212,-0.058048,-0.199471,0.474092,-0.658516,0.695095,0.04622,-0.092253,-0.142309,0.136454,-0.071958,-0.386463,0.134039,0.261956,0.298472,-0.62178,0.282361,-0.507688,-0.128737,0.230739,0.347821,0.342002,-0.075256,-0.104258,-0.141361,0.372969,0.284278,-0.276344,-0.104445,0.508539,...,-0.468448,-0.128427,0.294985,-0.148065,-1.003905,0.224237,-0.224838,0.181639,0.619375,0.085027,0.217503,-0.080607,0.312273,-0.033001,-0.005642,0.226486,0.517302,-0.203843,0.072023,0.375318,0.06754,-0.793145,0.242947,0.023721,0.040188,-0.408251,-0.039337,-0.518061,-0.200081,-0.042007,0.207769,0.154214,-0.141324,0.320018,-0.010292,-0.14802,-0.443816,0.00793,-0.250166,1
4,0.447092,-0.136055,0.358419,0.133981,0.344892,-0.485013,0.121361,0.636551,-0.005755,0.001269,0.169808,-1.044109,0.071822,0.576235,-0.638417,0.647767,0.02066,0.508155,-0.441675,0.130068,0.013416,-0.327795,-0.040595,-0.006035,0.503706,-0.073351,-0.112548,0.344733,-0.455994,0.451291,0.164639,0.480351,0.712888,-0.305288,-0.130851,-0.360322,0.081524,-0.176666,-0.193298,0.305924,...,-0.33373,-0.205804,0.424565,-0.236491,-0.100342,-0.29416,-0.064662,0.278276,-0.008216,0.030658,0.237454,0.141825,-0.022997,-0.43389,-0.002547,-0.533061,0.284478,0.135287,0.067483,-0.002375,0.051481,-0.301331,-0.304217,0.316481,0.685986,0.048112,-0.261427,-0.135315,-0.55424,-0.071735,-0.228345,-0.523606,-0.023836,0.137001,0.076752,-0.54212,-0.700136,-0.108771,-0.287356,1


In [None]:
# Export the embeddings (with target) to a CSV file in the working directory.
# The DataFrame index is written as well, because index=False is not passed.
df_emb.to_csv('training_embeddings_with_target.csv')

In [None]:
#uploading the embeddings to the 'data' google drive folder
# Push the CSV export to the 'data' Google Drive folder
from google.colab import auth
from oauth2client.client import GoogleCredentials
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# Step 1: authenticate this Colab user and build the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Step 2: create the Drive file entry inside the target folder and upload the local CSV.
folder_id = '1WBIJA0XTNaCrSxqH8Jc7I29FaNiwpH1u'
file_metadata = {'parents': [{u'id': folder_id}]}
file = drive.CreateFile(file_metadata)
file.SetContentFile('training_embeddings_with_target.csv')
file.Upload()

In [None]:
#create embedding with prediction class
# Wrap the raw embeddings in a DataFrame and append the prediction class as the last column
df_emb = pd.DataFrame(emb).assign(target=df['target'])
# Preview the embeddings together with their target
df_emb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,target
0,0.174006,0.51411,0.282695,-0.865944,-0.26295,0.222027,0.995218,0.638477,-0.662021,-0.256679,0.340827,-0.14874,-0.507322,0.185809,-0.69507,0.263941,-0.073811,0.211314,-0.320379,0.282107,0.190081,0.204111,0.01533,-0.08421,0.191036,0.191454,-0.310451,0.44101,-0.497746,-0.30581,0.202677,1.308998,0.080589,-0.039485,-0.503274,0.056791,0.226311,0.082208,-0.4029,1.006173,...,-0.235238,-0.388879,0.073767,0.008587,-0.539857,-0.047739,-0.010281,-0.419728,0.12905,0.420352,-0.248818,0.352863,0.034175,-0.115802,-0.190596,-0.017529,-0.133006,0.238757,-0.648669,0.179064,0.249245,0.387309,-0.379289,0.36922,-0.478311,-0.315277,0.202589,0.094307,0.506694,-0.038249,-0.348296,0.509817,0.030843,0.014934,0.092533,0.325911,-0.224219,0.322711,-0.324294,1
1,-0.083791,-0.074988,-0.035012,-0.029141,0.765842,-0.028787,0.210944,0.560736,-0.50872,-0.078661,0.015503,-0.240076,0.0303,0.505167,-0.771939,0.769955,-0.18915,0.045849,0.408051,-0.184479,-0.426469,-0.456381,0.211968,0.546523,0.081309,0.353223,0.061038,0.267759,0.023705,0.467865,-0.119265,0.135602,0.667777,-0.682075,0.025304,-0.40465,0.276614,0.250135,-0.095028,-0.023242,...,-0.760515,0.066294,0.237353,-0.00066,-1.157665,-0.138523,-0.487236,0.171328,0.237592,-0.005984,0.211551,-0.029149,0.314136,0.258066,-0.056516,-0.055411,0.407631,0.270228,-0.139126,-0.197396,-0.090131,0.154644,-0.219606,-0.07014,-0.303984,0.400976,-0.304344,-0.3431,-0.504064,-0.297995,0.050977,-0.087849,-0.461415,0.217056,-0.090924,0.25241,-0.205355,-0.545921,0.009871,1
2,-0.026818,0.038434,0.296934,-0.365698,0.76176,-0.738163,0.362888,0.261955,-0.415269,-0.113452,-0.358875,-0.566275,-0.225447,0.116963,-0.295379,0.501628,-0.085105,0.161736,-0.193539,0.16183,0.297279,-0.246689,-0.638786,-0.089641,0.471771,-0.113421,0.154899,-0.536939,-0.5278,-0.207359,0.385253,0.422704,-0.03432,-0.348092,0.204914,0.362816,-0.023397,0.142115,-0.391459,0.440117,...,-0.578197,0.18063,0.137681,-0.43909,-1.29447,0.122669,-0.545837,0.472996,0.241338,0.45699,0.111797,0.166603,-0.260541,0.13119,-0.260267,-0.198128,0.329539,0.036097,0.547372,-0.036892,0.041762,0.284138,0.193216,0.37669,0.173393,-0.182035,-0.321835,-0.256236,-0.34241,-0.127833,-0.430574,0.259555,-0.156196,0.047626,-0.047696,-0.631763,-0.277038,-0.518288,0.202233,1
3,0.070891,-0.082333,0.146175,-0.132793,0.334057,-0.225144,0.091203,0.108324,-0.622823,0.169655,0.05212,-0.058048,-0.199471,0.474092,-0.658516,0.695095,0.04622,-0.092253,-0.142309,0.136454,-0.071958,-0.386463,0.134039,0.261956,0.298472,-0.62178,0.282361,-0.507688,-0.128737,0.230739,0.347821,0.342002,-0.075256,-0.104258,-0.141361,0.372969,0.284278,-0.276344,-0.104445,0.508539,...,-0.468448,-0.128427,0.294985,-0.148065,-1.003905,0.224237,-0.224838,0.181639,0.619375,0.085027,0.217503,-0.080607,0.312273,-0.033001,-0.005642,0.226486,0.517302,-0.203843,0.072023,0.375318,0.06754,-0.793145,0.242947,0.023721,0.040188,-0.408251,-0.039337,-0.518061,-0.200081,-0.042007,0.207769,0.154214,-0.141324,0.320018,-0.010292,-0.14802,-0.443816,0.00793,-0.250166,1
4,0.447092,-0.136055,0.358419,0.133981,0.344892,-0.485013,0.121361,0.636551,-0.005755,0.001269,0.169808,-1.044109,0.071822,0.576235,-0.638417,0.647767,0.02066,0.508155,-0.441675,0.130068,0.013416,-0.327795,-0.040595,-0.006035,0.503706,-0.073351,-0.112548,0.344733,-0.455994,0.451291,0.164639,0.480351,0.712888,-0.305288,-0.130851,-0.360322,0.081524,-0.176666,-0.193298,0.305924,...,-0.33373,-0.205804,0.424565,-0.236491,-0.100342,-0.29416,-0.064662,0.278276,-0.008216,0.030658,0.237454,0.141825,-0.022997,-0.43389,-0.002547,-0.533061,0.284478,0.135287,0.067483,-0.002375,0.051481,-0.301331,-0.304217,0.316481,0.685986,0.048112,-0.261427,-0.135315,-0.55424,-0.071735,-0.228345,-0.523606,-0.023836,0.137001,0.076752,-0.54212,-0.700136,-0.108771,-0.287356,1


In [None]:
#pickle full embeddings with feature
# Serialize the embeddings-with-target DataFrame to a pickle in the working directory
with open('training_embeddings_with_target.pkl', 'wb') as pkl_out:
    pickle.dump(df_emb, pkl_out)

# Push the pickle to the 'data' Google Drive folder
from google.colab import auth
from oauth2client.client import GoogleCredentials
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# Authenticate this Colab user and build the PyDrive client
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create the Drive file entry inside the data folder and upload the local pickle
folder_id = '1WBIJA0XTNaCrSxqH8Jc7I29FaNiwpH1u'
file_metadata = {'parents': [{u'id': folder_id}]}
file = drive.CreateFile(file_metadata)
file.SetContentFile('training_embeddings_with_target.pkl')
file.Upload()

In [None]:
#access the uploaded embeddings
# Mount Drive and load the raw embeddings back from the shared data folder.
from google.colab import drive
drive.mount("/content/drive")
path="/content/drive/MyDrive/MSCA Machine Learning/Data/training_embeddings.pkl"
# NOTE: unpickling can execute arbitrary code, so only load files this project produced.
# The context manager closes the file handle once the object is loaded.
with open(path,'rb') as infile:
    emb=pickle.load(infile)

Below is a naive Bayes baseline using TF-IDF features, for reference only.

In [None]:
# Features are the raw text column; labels are the target column
X, y = df['text'], df['target']
print(f'intput variable shape is {X.shape}')
print(f'intput target shape is {y.shape}')

intput variable shape is (7613,)
intput target shape is (7613,)


In [None]:
# Hold out a test split; no split size is passed, so scikit-learn's default applies,
# and random_state=3 fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

In [None]:
# TF-IDF features. Only TfidfVectorizer is used in this cell; the other imports are unused here.
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
# English stop words are dropped during vectorization
vect = TfidfVectorizer(stop_words='english')
# Fit on the training split only, then apply the same fitted transform to the test split
X_train_m = vect.fit_transform(X_train)
X_test_m = vect.transform(X_test)

In [None]:
# Multinomial naive Bayes baseline trained on the TF-IDF features
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB().fit(X_train_m, y_train)
# Predicted classes for the held-out split
y_pred = nb.predict(X_test_m)

In [None]:
from sklearn.metrics import classification_report, accuracy_score
# Overall accuracy of the class predictions, reported as a percentage
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy * 100:.1f}%")
# Per-class precision / recall / F1 breakdown
print(classification_report(y_test, y_pred))

Test Accuracy: 79.4%
              precision    recall  f1-score   support

           0       0.76      0.91      0.83      1049
           1       0.86      0.65      0.74       855

    accuracy                           0.79      1904
   macro avg       0.81      0.78      0.78      1904
weighted avg       0.80      0.79      0.79      1904

