# Sentiment Analysis with CLS Token from BERT with SST Dataset

We will train with SST dataset and test with 2000+2000 SST dataset

In [1]:
#import necessary library & settings
import torch
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import warnings
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
#HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00 ", "#FF006D", "#ADFF02", "#8F00FF"]
#sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
warnings.filterwarnings('ignore')

cuda:0


# Let's understand the classes distribution for SST dataset

### We select the first 8000 English and Malay dataset for experiment

In [2]:
import pandas as pd
# Read train data
column_names = ["sentence","label"]
df = pd.DataFrame(columns = column_names)
df_en = pd.read_csv('C:/Users/Yuheng/sst_train.txt', sep='\t', header=None, names=['truth', 'text'],encoding='latin-1')
df_en = df_en[:8000]
df_en['truth'] = df_en['truth'].str.replace('__label__', '')
df_en['truth'] = df_en['truth'].astype(int).astype('category')
df_en['sentence']=df_en['text']
df_en['label']=df_en['truth']

df_malay = pd.read_csv('Desktop/sst-5_malay.csv', header=0, names=['sentence', 'label'],encoding='latin-1')
df_malay = df_malay[:8000]

#### We want only 3 classes, so we will combine related classes

In [3]:
def to_sentiment(rating):
  rating = int(rating)
  if rating == 1:
    return 0
  elif rating == 2:
    return 0
  elif rating == 3:
    return 1
  elif rating == 4:
    return 2
  else:
    return 2
df_en['label'] = df_en.label.apply(to_sentiment)
class_names = ['Negative','Neutral','Positive']
df_malay['label']=df_malay.label.apply(to_sentiment)


## Now, we load mBERT and put it into GPU

In [4]:
PRE_TRAINED_MODEL_NAME = 'bert-base-multilingual-cased'

In [5]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [6]:
class_names = ['Negative','Neutral', 'Positive']
#model = SentimentClassifier(len(class_names))
model = bert_model.to(device)

## First round of dataset splitting to get testing dataset

In [7]:
train_features_en, test_features_en, train_labels_en, test_labels_en = train_test_split(df_en['sentence'], df_en['label'], random_state=42)

In [8]:
train_features_malay, test_features_malay, train_labels_malay, test_labels_malay = train_test_split(df_malay['sentence'], df_malay['label'], random_state=42)

#### We obtain the CLS token from mBERT for every sentence in the dataset as our embeddings and saved into "encoded"

In [9]:
#extract CLS token for every sentence
encoded_en=[]
for i in range (0,len(df_en['sentence'])):
    encoded_review = tokenizer.encode_plus(
      df_en['sentence'][i],
      max_length=512,
      truncation=True,
      add_special_tokens=True,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)
    feature = last_hidden_states
    features = (feature[0][:,0,:].cpu()).numpy().flatten() #extract the last CLS token from BERT-layer and flatten into 1d array
    encoded_en.append(features)
    

In [10]:
#extract CLS token for every sentence
encoded_malay=[]
for i in range (0,len(df_malay['sentence'])):
    encoded_review = tokenizer.encode_plus(
      df_malay['sentence'][i],
      max_length=512,
      truncation=True,
      add_special_tokens=True,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)
    feature = last_hidden_states
    features = (feature[0][:,0,:].cpu()).numpy().flatten() #extract the last CLS token from BERT-layer and flatten into 1d array
    encoded_malay.append(features)

#### Don't forget to pass the sentiment to labels

In [11]:
labels_en=df_en['label']
labels_malay=df_malay['label']

# Classifier Training
#### Second round splitting for classifier training

In [13]:
train_features_new_en, test_features_new_en, train_labels_new_en, test_labels_new_en = train_test_split(encoded_en, labels_en, random_state=42)

In [14]:
train_features_new_malay, test_features_new_malay, train_labels_new_malay, test_labels_new_malay = train_test_split(encoded_malay, labels_malay, random_state=42)

# Combine testing data (2000+2000)

In [15]:
column_names = ["column","label"]
df_test_new = pd.DataFrame(test_labels_en,columns = column_names)
df_test_new['sentence']=test_features_en
#df_test_new['label']=test_labels_en
df_test_new=df_test_new.drop(columns=["column"])
df_test_new['lan']=0

In [16]:
column_names = ["column","label"]
df_test_new_malay = pd.DataFrame(test_labels_malay,columns = column_names)
df_test_new_malay['sentence']=test_features_malay
#df_test_new['label']=test_labels_en
df_test_new_malay=df_test_new_malay.drop(columns=["column"])
df_test_new_malay['lan']=1

In [17]:
df_test_new=df_test_new.append(df_test_new_malay,ignore_index = True)
df_test_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   label     4000 non-null   int64 
 1   sentence  4000 non-null   object
 2   lan       4000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 93.9+ KB


In [18]:
df_test_new.tail()

Unnamed: 0,label,sentence,lan
3995,0,Blue Crush begitu berpanjangan dan membosankan...,1
3996,2,Kelegaan dari filem besbol yang terlalu keras ...,1
3997,1,Orang miskin Stuart memerlukan sejumlah sinis ...,1
3998,0,"Tidak ada jumlah pembakaran, peledakan, tusuka...",1
3999,2,Berjaya mencapai apa yang dapat dilakukan oleh...,1


#### Let's train our English classifier

In [19]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import time
start_time = time.time()
svc_clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svc_clf.fit(train_features_new_en,train_labels_new_en)
svc_score=svc_clf.score(test_features_new_en, test_labels_new_en)
print(svc_score)
print("--- %s seconds ---" % (time.time() - start_time))

0.6075
--- 19.230565786361694 seconds ---


In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
import time
start_time = time.time()
pca_en = PCA(n_components=200)
x_en_pca = pca_en.fit_transform(encoded_en)
train_features_en_pca, test_features_en_pca, train_labels_en_pca, test_labels_en_pca = train_test_split(x_en_pca, labels_en, random_state=42)
svc_en_clf_pca = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svc_en_clf_pca.fit(train_features_en_pca,train_labels_en_pca)
svc_en_pca_score=svc_en_clf_pca.score(test_features_en_pca, test_labels_en_pca)
print(svc_en_pca_score)
print("--- %s seconds ---" % (time.time() - start_time))

0.594
--- 4.811445236206055 seconds ---


#### Let's train our Malay classifier

In [21]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import time
start_time = time.time()
svc_malay_clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svc_malay_clf.fit(train_features_new_malay,train_labels_new_malay)
svc_malay_score=svc_malay_clf.score(test_features_new_malay, test_labels_new_malay)
print(svc_malay_score)
print("--- %s seconds ---" % (time.time() - start_time))

0.5685
--- 20.03687310218811 seconds ---


In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
import time
start_time = time.time()
pca_malay = PCA(n_components=200)
x_malay_pca = pca_malay.fit_transform(encoded_malay)
train_features_malay_pca, test_features_malay_pca, train_labels_malay_pca, test_labels_malay_pca = train_test_split(x_malay_pca, labels_malay, random_state=42)
svc_malay_clf_pca = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svc_malay_clf_pca.fit(train_features_malay_pca,train_labels_malay_pca)
svc_malay_pca_score=svc_malay_clf_pca.score(test_features_malay_pca, test_labels_malay_pca)
print(svc_malay_pca_score)
print("--- %s seconds ---" % (time.time() - start_time))

0.5675
--- 4.90656042098999 seconds ---


In [24]:
df_test_new.tail()

Unnamed: 0,label,sentence,lan
3995,0,Blue Crush begitu berpanjangan dan membosankan...,1
3996,2,Kelegaan dari filem besbol yang terlalu keras ...,1
3997,1,Orang miskin Stuart memerlukan sejumlah sinis ...,1
3998,0,"Tidak ada jumlah pembakaran, peledakan, tusuka...",1
3999,2,Berjaya mencapai apa yang dapat dilakukan oleh...,1


In [25]:
import requests
def langdetect(text):
    src='auto'##english, can chg to \"auto\" for language detection
    dest='ms' #malay
    url = "https://clients5.google.com/translate_a/t?client=dict-chrome-ex&sl="+src+"&tl="+dest+"&q=" + text
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
    try:
        request_result = requests.get(url, headers=headers).json()     
        #print(request_result)
        #print('-------')
        #print('[In Malay]: ' + request_result['alternative_translations'][0]['alternative'][0]['word_postproc'])
        #result=request_result['alternative_translations'][0]['alternative'][0]['word_postproc'] #get translated result
        result=request_result['src'] ## get the detected language
        #print('[Language Dectected]: ' + request_result['src'])
    except:
        pass
    return result

#### Using Google Translate API to detect language with SST Mix dataset with 768 features

In [26]:
#extract CLS token for every sentence
match=0
false=0
false_lan=0
match_lan=0
i=0
start_time = time.time()
for i in range (0,len(df_test_new['sentence'])):
    encoded_review = tokenizer.encode_plus(
      df_test_new['sentence'][i],
      max_length=512,
      truncation=True,
      add_special_tokens=True,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)
    feature = last_hidden_states
    features = (feature[0][:,0,:].cpu()).numpy() #extract the last CLS token from BERT-layer
    #prediction_language=int(svc_lan_clf.predict(features))
    #print('Label:',prediction)
    #print(f'Sentiment  : {class_names[prediction]}')
    lan=langdetect(df_test_new['sentence'][i])
    #print(lan)
    if(lan=='en'):
        prediction_language=0
    elif(lan=='ms'):
        prediction_language=1
    else:
        prediction_language=1
    if (df_test_new['lan'][i]==prediction_language):
        match_lan+=1
        if(prediction_language==0):#en
            prediction=int(svc_clf.predict(features))
            if (df_test_new['label'][i]==prediction): 
                match+=1
            elif(df_test_new['label'][i]!=prediction):
                false+=1
        elif(prediction_language==1): #malay
            prediction=int(svc_malay_clf.predict(features))
            if (df_test_new['label'][i]==prediction): 
                match+=1
            elif(df_test_new['label'][i]!=prediction):
                false+=1
    elif(df_test_new['lan'][i]!=prediction_language):
        false_lan+=1
            

accuracy_sentiment=((match)/len(df_test_new['sentence']))*100
print('Accuracy of Sentiment:',"{:.2f}".format(round(accuracy_sentiment, 2)),'%')


accuracy_language=(match_lan/len(df_test_new['sentence']))*100
print('Accuracy of Language:',"{:.2f}".format(round(accuracy_language, 2)),'%')
print("--- %s seconds ---" % (time.time() - start_time))

Accuracy of Sentiment: 58.75 %
Accuracy of Language: 99.78 %
--- 591.8641953468323 seconds ---


#### Using Google Translate API to detect language with SST Mix dataset with PCA 200D

In [27]:
#extract CLS token for every sentence
match=0
match_en=0
match_malay=0
false=0
false_en=0
false_malay=0
false_lan=0
match_lan=0

i=0
start_time = time.time()
for i in range (0,len(df_test_new['sentence'])):
    encoded_review = tokenizer.encode_plus(
      df_test_new['sentence'][i],
      max_length=512,
      truncation=True,
      add_special_tokens=True,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)
    feature = last_hidden_states
    features = (feature[0][:,0,:].cpu()).numpy() #extract the last CLS token from BERT-layer
    #prediction_language=int(svc_lan_clf.predict(features))
    #print('Label:',prediction)
    #print(f'Sentiment  : {class_names[prediction]}')
    lan=langdetect(df_test_new['sentence'][i])
    #print(lan)
    if(lan=='en'):
        prediction_language=0
    elif(lan=='ms'):
        prediction_language=1
    else:
        prediction_language=1
    if (df_test_new['lan'][i]==prediction_language):
        match_lan+=1
        if(prediction_language==0):#en
            prediction=int(svc_en_clf_pca.predict(pca_en.transform(features)))
            if (df_test_new['label'][i]==prediction): 
                match+=1
                match_en+=1
            elif(df_test_new['label'][i]!=prediction):
                false+=1
                false_en+=1
        elif(prediction_language==1): #malay
            prediction=int(svc_malay_clf_pca.predict(pca_malay.transform(features)))
            if (df_test_new['label'][i]==prediction): 
                match+=1
                match_malay+=1
            elif(df_test_new['label'][i]!=prediction):
                false+=1
                false_malay+=1
    elif(df_test_new['lan'][i]!=prediction_language):
        false_lan+=1
            
print('English misclassification:', false_en)
print('Malay misclassification:', false_malay)
accuracy_sentiment=((match)/len(df_test_new['sentence']))*100
print('Accuracy of Sentiment:',"{:.2f}".format(round(accuracy_sentiment, 2)),'%')


accuracy_language=(match_lan/len(df_test_new['sentence']))*100
print('Accuracy of Language:',"{:.2f}".format(round(accuracy_language, 2)),'%')
print("--- %s seconds ---" % (time.time() - start_time))

English misclassification: 806
Malay misclassification: 857
Accuracy of Sentiment: 58.20 %
Accuracy of Language: 99.78 %
--- 402.8547532558441 seconds ---
