<a href="https://colab.research.google.com/github/ZimingY/bert_for_sentiment/blob/master/bert_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# take a sentence, classify as 1 (positive) or 0 (negative)

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 2.8MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 13.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 20.9MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.38-cp36-none-any.whl size=884629 sha256=2d5e91681d8f380a

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from keras.preprocessing import sequence
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


** load the data set **

In [0]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', \
                 delimiter = '\t', header = None)

In [5]:
df.head(2)

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0


In [0]:
egs = df[-10:]
df = df[:2000]


In [7]:
df[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

## load the transformer

In [0]:
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)


In [0]:
# add the [CLS] [SEP] and encode each word
tokenized = df[0].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

In [10]:
print(tokenized.shape)
tokenized.head(3)

(2000,)


0    [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1    [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2    [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
Name: 0, dtype: object

In [0]:
# padding, so that BERT can process all examples at once as one batch
max_len = tokenized.apply(lambda x: len(x)).max()
padded = sequence.pad_sequences(tokenized.values, maxlen = max_len,padding = 'post')

In [12]:
padded

array([[  101,  1037, 18385, ...,     0,     0,     0],
       [  101,  4593,  2128, ...,     0,     0,     0],
       [  101,  2027,  3653, ...,     0,     0,     0],
       ...,
       [  101,  2023,  2028, ...,     0,     0,     0],
       [  101,  1999,  1996, ...,     0,     0,     0],
       [  101,  1996,  3185, ...,     0,     0,     0]], dtype=int32)

In [13]:
np.array(padded).shape

(2000, 59)

In [14]:
# mask ignore the padding we added
attention_mask = np.where(padded != 0,1,0)
attention_mask.shape

(2000, 59)

In [0]:
input_ids = torch.tensor(padded).to(torch.int64)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [16]:
last_hidden_states[0].shape

torch.Size([2000, 59, 768])

In [0]:
# only take [CLS] sentence embedding
features = last_hidden_states[0][:,0,:].numpy()

In [0]:
labels = df[1]

In [0]:
# training testing split


In [0]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [22]:
print(train_features.shape)
print(test_features.shape)
print(train_labels.shape)

(1500, 768)
(500, 768)
(1500,)


In [27]:
# logistic regression model
parameters = {'C': np.linspace(0.0001, 100, 100)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print(grid_search.best_params_)
print(grid_search.best_score_)



{'C': 1.0102}
0.8400000000000001


In [30]:
lr_clf = LogisticRegression(C = 1.0102)
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=1.0102, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
lr_clf.score(test_features, test_labels)

0.84

In [55]:
for i in egs[0]:
  print(i)
  print('\n')

` it 's painful to watch witherspoon 's talents wasting away inside unnecessary films like legally blonde and sweet home abomination , i mean , alabama '


it 's absolutely amazing how first time director kevin donovan managed to find something new to add to the canon of chan


it 's as raw and action packed an experience as a ringside seat at a tough man contest


to me , it sounds like a cruel deception carried out by men of marginal intelligence , with reactionary ideas about women and a total lack of empathy


you wo n't have any trouble getting kids to eat up these veggies


too bland and fustily tasteful to be truly prurient


it does n't work as either


this one aims for the toilet and scores a direct hit


in the name of an allegedly inspiring and easily marketable flick , the emperor 's club turns a blind eye to the very history it pretends to teach


the movie is undone by a filmmaking methodology that 's just experimental enough to alienate the mainstream audience while rin

In [56]:
print(egs[1])

1990    0
1991    1
1992    1
1993    0
1994    1
1995    0
1996    0
1997    0
1998    0
1999    0
Name: 1, dtype: int64


In [57]:
tokenized_egs = egs[0].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
padded_egs = sequence.pad_sequences(tokenized_egs.values, maxlen = max_len,padding = 'post')
egs_ids = torch.tensor(padded_egs).to(torch.int64)
egs_mask = np.where(padded_egs != 0,1,0)
egs_mask = torch.tensor(egs_mask)
with torch.no_grad():
    egs_hidden_states = model(egs_ids, attention_mask=egs_mask)
egs_features = egs_hidden_states[0][:,0,:].numpy()
lr_clf.predict(egs_features)

array([0, 1, 1, 0, 0, 0, 0, 1, 0, 0])