# Using Lang Model for Classification

In [1]:
## Based on
#  - https://github.com/jalammar/jalammar.github.io/blob/master/notebooks/nlp/03_Sentence_Classification_with_BERT.ipynb
#  - Explained with graphics at: https://github.com/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb

In [2]:
## If not already installed
# !pip install transformers

In [3]:
# Imports for data loading
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

### Loading data

In [4]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [5]:
# See a sample
df.head

<bound method NDFrame.head of                                                       0  1
0     a stirring , funny and finally transporting re...  1
1     apparently reassembled from the cutting room f...  0
2     they presume their audience wo n't sit still f...  0
3     this is a visually stunning rumination on love...  1
4     jonathan parker 's bartleby should have been t...  1
...                                                 ... ..
6915  painful , horrifying and oppressively tragic ,...  1
6916  take care is nicely performed by a quintet of ...  0
6917  the script covers huge , heavy topics in a bla...  0
6918  a seriously bad film with seriously warped log...  0
6919  a deliciously nonsensical comedy about a city ...  1

[6920 rows x 2 columns]>

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6920 entries, 0 to 6919
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       6920 non-null   object
 1   1       6920 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 108.2+ KB


In [7]:
# Looking at a sample
batch_1 = df[:2000]

In [8]:
batch_1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

### Lang model - pretrained

In [9]:
# For lang model
import transformers as ppb

In [10]:
# Loading BERT
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')


# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

### Represent content based on model

In [11]:
# Tokenize text
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
tokenized.head

<bound method NDFrame.head of 0       [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1       [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2       [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
3       [101, 2023, 2003, 1037, 17453, 14726, 19379, 1...
4       [101, 5655, 6262, 1005, 1055, 12075, 2571, 376...
                              ...                        
1995    [101, 2205, 20857, 1998, 11865, 16643, 2135, 5...
1996    [101, 2009, 2515, 1050, 1005, 1056, 2147, 2004...
1997    [101, 2023, 2028, 8704, 2005, 1996, 11848, 199...
1998    [101, 1999, 1996, 2171, 1997, 2019, 9382, 1898...
1999    [101, 1996, 3185, 2003, 25757, 2011, 1037, 244...
Name: 0, Length: 2000, dtype: object>

In [12]:
# Import for data processing .. some more
import numpy as np

In [13]:
# Padded input
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [14]:
# What is the shape? Just to check
np.array(padded).shape

(2000, 59)

In [15]:
print (padded[0])

[  101  1037 18385  1010  6057  1998  2633 18276  2128 16603  1997  5053
  1998  1996  6841  1998  5687  5469  3152   102     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0]


In [16]:
# Mask of where padding has happened
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(2000, 59)

In [17]:
print (attention_mask[0])

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


### Setup for learning with Torch

In [18]:
# Import for learning
import torch

In [20]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [21]:
# BERT does sentence classification and adds a token called [CLS] (for classification) at 
# the beginning of every sentence.
features = last_hidden_states[0][:,0,:].numpy()
print (features)

[[-0.21593434 -0.14028913  0.00831103 ... -0.13694862  0.5867002
   0.20112705]
 [-0.17262721 -0.14476167  0.00223418 ... -0.17442568  0.21386462
   0.37197474]
 [-0.0506333   0.07203949 -0.02959686 ... -0.0714895   0.718524
   0.26225498]
 ...
 [-0.27829766 -0.24803604  0.135858   ... -0.19039157  0.13099585
   0.34978378]
 [-0.03667738  0.10638569 -0.01111003 ... -0.11206663  0.41619474
   0.5033801 ]
 [ 0.12402616  0.01425182  0.01038423 ... -0.11606551  0.53459156
   0.2749535 ]]


In [22]:
# The labels of original data
labels = batch_1[1]
print (labels)

0       1
1       0
2       0
3       1
4       1
       ..
1995    0
1996    0
1997    0
1998    0
1999    0
Name: 1, Length: 2000, dtype: int64


### For classification of output

In [23]:
# Imports for final classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [24]:
# Train, test data splot
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [25]:
# Logistic regression
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

LogisticRegression()

In [26]:

lr_clf.score(test_features, test_labels)

0.856

In [27]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 5.263252631578947}
best scrores:  0.8093333333333333


In [28]:
# Logistic regression with best parameter
lr_clf = LogisticRegression(C=5.263252631578947)
lr_clf.fit(train_features, train_labels)
lr_clf.score(test_features, test_labels)

0.85

In [29]:
# Comparison with a dummy baseline
# - https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html
# - Default is a predictor respecting input distribution of labels
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.500 (+/- 0.02)
