<a href="https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/distilbert_masked.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/f9/51824e40f0a23a49eab4fcaa45c1c797cbf9761adedd0b558dab7c958b34/transformers-2.1.1-py3-none-any.whl (311kB)
[K     |█                               | 10kB 19.7MB/s eta 0:00:01[K     |██                              | 20kB 2.1MB/s eta 0:00:01[K     |███▏                            | 30kB 3.2MB/s eta 0:00:01[K     |████▏                           | 40kB 2.1MB/s eta 0:00:01[K     |█████▎                          | 51kB 2.5MB/s eta 0:00:01[K     |██████▎                         | 61kB 3.0MB/s eta 0:00:01[K     |███████▍                        | 71kB 3.5MB/s eta 0:00:01[K     |████████▍                       | 81kB 4.0MB/s eta 0:00:01[K     |█████████▌                      | 92kB 4.4MB/s eta 0:00:01[K     |██████████▌                     | 102kB 3.4MB/s eta 0:00:01[K     |███████████▋                    | 112kB 3.4MB/s eta 0:00:01[K     |████████████▋                   | 122kB 3.4M

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [0]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [0]:
batch_1 = df[:2000]

In [6]:
batch_1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

In [7]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')


# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

100%|██████████| 231508/231508 [00:00<00:00, 2645593.78B/s]
100%|██████████| 492/492 [00:00<00:00, 163453.27B/s]
100%|██████████| 267967963/267967963 [00:03<00:00, 71733827.75B/s]


In [0]:
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [9]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)
max_len

59

In [0]:
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [11]:
np.array(padded).shape

(2000, 59)

In [25]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(2000, 59)

In [0]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [0]:
features = last_hidden_states[0][:,0,:].numpy()

In [0]:
labels = batch_1[1]

In [0]:
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 5.263252631578947}
best scrores:  0.8286666666666667


In [65]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)
lr_clf.score(test_features, test_labels)

0.818

In [70]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.528 (+/- 0.02)
