<a href="https://colab.research.google.com/github/alohia/pytorch_playground/blob/master/03_transfer_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U transformers sklearn torchtext

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/70/1a/364556102943cacde1ee00fdcae3b1615b39e52649eddbf54953e5b144c9/transformers-2.2.1-py3-none-any.whl (364kB)
[K     |████████████████████████████████| 368kB 6.4MB/s 
[?25hRequirement already up-to-date: sklearn in /usr/local/lib/python3.6/dist-packages (0.0)
Collecting torchtext
[?25l  Downloading https://files.pythonhosted.org/packages/43/94/929d6bd236a4fb5c435982a7eb9730b78dcd8659acf328fd2ef9de85f483/torchtext-0.4.0-py3-none-any.whl (53kB)
[K     |████████████████████████████████| 61kB 8.3MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/1f/8e/ed5364a06a9ba720fddd9820155cc57300d28f5f43a6fd7b7e817177e642/sacremoses-0.0.35.tar.gz (859kB)
[K     |████████████████████████████████| 860kB 52.4MB/s 
Collecting regex
[?25l  Downloading https://files.pythonhosted.org/packages/e3/8e/cbf2295643d7265e7883326fb4654e643bfc93b3a8a8274d8010a39d8804/regex-2019.11.1-c

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import transformers as ppb
import torch
from tqdm import tqdm_notebook as tn
import pickle
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import warnings
warnings.filterwarnings("ignore")

In [0]:
df = pd.read_csv('https://www.dropbox.com/s/1kz3wkni6tyns8a/yelp_train.csv?dl=1')

In [4]:
df.shape

(50000, 2)

In [0]:
sample = df[:2000]

In [6]:
sample.head()

Unnamed: 0,y,text
0,2,With such a highly esteemed name for a restaur...
1,4,I came here on American Thanksgiving. Had a sm...
2,3,I am going to agree with the review of Juddi L...
3,3,"Meh. These are nice but not spectacular, howe..."
4,2,One thing my wife and I try to avoid on date n...


In [7]:
sample['y'].value_counts()

5    412
2    408
4    397
1    395
3    388
Name: y, dtype: int64

In [8]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

100%|██████████| 231508/231508 [00:00<00:00, 923375.23B/s]
100%|██████████| 492/492 [00:00<00:00, 359011.41B/s]
100%|██████████| 267967963/267967963 [00:10<00:00, 25907371.27B/s]


In [0]:
tokenized = sample['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512)))

In [0]:
max_len = max(tokenized.apply(len))

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [11]:
padded.shape

(2000, 512)

In [12]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(2000, 512)

In [0]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

In [0]:
padded = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

In [0]:
train_ds = TensorDataset(padded, attention_mask, torch.tensor(sample['y']))
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

In [16]:
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')
model = model.to(device)
ftrs = []
labs = []
for i, m, y in tn(train_dl):
    input_ids = i.to(device)
    attention_mask = m.to(device)
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)[0]
        ftrs.append(last_hidden_states[:,0,:].cpu().numpy())
        labs.append(y.numpy())

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




In [0]:
features = np.concatenate(ftrs)
np.save('bert_features.npy', features)

In [0]:
labels = np.concatenate(labs)
np.save('bert_labels.npy', labels)

In [0]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, random_state=42, stratify=labels)

In [20]:
parameters = {'C': np.linspace(0.001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters, n_jobs=-1)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)

best parameters:  {'C': 5.264105263157894}


In [21]:
lr_clf = LogisticRegression(C=grid_search.best_params_['C'])
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=5.264105263157894, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
lr_clf.score(train_features, train_labels)

0.888

In [23]:
lr_clf.score(test_features, test_labels)

0.494

In [0]:
from sklearn.svm import SVC

In [25]:
parameters = {'C': np.linspace(0.001, 100, 20)}
grid_search = GridSearchCV(SVC(), parameters, n_jobs=-1)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)

best parameters:  {'C': 21.053421052631577}


In [26]:
svc_clf = SVC(C=grid_search.best_params_['C'])
svc_clf.fit(train_features, train_labels)

SVC(C=21.053421052631577, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [27]:
svc_clf.score(train_features, train_labels)

0.6413333333333333

In [28]:
svc_clf.score(test_features, test_labels)

0.516

## Use Glove embeddings

In [0]:
from torchtext.vocab import GloVe

In [30]:
glove = GloVe(name = '6B', dim = 300)

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
100%|█████████▉| 399883/400000 [00:48<00:00, 8097.06it/s]

In [0]:
np.save('embed.npy', glove.vectors.numpy())

In [0]:
pickle.dump(glove.stoi, open('stoi.pkl', 'wb'))

In [33]:
embeddings = np.load('embed.npy')
word_to_indx = pickle.load(open('stoi.pkl', 'rb'))

100%|█████████▉| 399883/400000 [01:00<00:00, 8097.06it/s]

In [0]:
vocab_size, emb_sz = embeddings.shape

In [35]:
vocab_size, emb_sz

(400000, 300)

In [0]:
svm_df = sample.copy()
svm_df['text'] = svm_df.text.str.lower()
svm_df['embs'] = svm_df.text.apply(lambda x:np.array([embeddings[word_to_indx.get(word, 0)] for word in x.split()]).mean(0))

In [0]:
features = np.stack(svm_df['embs'].values)
np.save('svm_features.npy', features)

In [0]:
labels = np.stack(svm_df['y'].values)
np.save('svm_labels.npy', labels)

In [39]:
features.shape, labels.shape

((2000, 300), (2000,))

In [0]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, random_state=42, stratify=labels)

In [0]:
from sklearn.svm import SVC

In [42]:
parameters = {'C': np.linspace(0.001, 100, 20)}
grid_search = GridSearchCV(SVC(), parameters, n_jobs=-1)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)

best parameters:  {'C': 100.0}


In [43]:
svc_clf = SVC(C=grid_search.best_params_['C'])
svc_clf.fit(train_features, train_labels)

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [44]:
svc_clf.score(train_features, train_labels)

0.5293333333333333

In [45]:
svc_clf.score(test_features, test_labels)

0.424