<a href="https://colab.research.google.com/github/alohia/pytorch_playground/blob/master/03_transfer_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install -U transformers sklearn torchtext

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import transformers as ppb
import torch
from tqdm import tqdm_notebook as tn
import pickle
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import warnings
warnings.filterwarnings("ignore")

In [0]:
df = pd.read_csv('https://www.dropbox.com/s/1kz3wkni6tyns8a/yelp_train.csv?dl=1')

In [4]:
df.shape

(50000, 2)

In [0]:
sample = df[:10000]

In [6]:
sample.head()

Unnamed: 0,y,text
0,2,With such a highly esteemed name for a restaur...
1,4,I came here on American Thanksgiving. Had a sm...
2,3,I am going to agree with the review of Juddi L...
3,3,"Meh. These are nice but not spectacular, howe..."
4,2,One thing my wife and I try to avoid on date n...


In [7]:
sample['y'].value_counts()

5    216
2    202
3    195
1    195
4    192
Name: y, dtype: int64

In [0]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [0]:
tokenized = sample['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512)))

In [0]:
max_len = max(tokenized.apply(len))

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [11]:
padded.shape

(1000, 512)

In [12]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1000, 512)

In [0]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

In [0]:
padded = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

In [0]:
train_ds = TensorDataset(padded, attention_mask, torch.tensor(sample['y']))
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

In [16]:
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')
model = model.to(device)
ftrs = []
labs = []
for i, m, y in tn(train_dl):
    input_ids = i.to(device)
    attention_mask = m.to(device)
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)[0]
        ftrs.append(last_hidden_states[:,0,:].cpu().numpy())
        labs.append(y.numpy())

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [0]:
features = np.concatenate(ftrs)
np.save('bert_features.npy', features)

In [0]:
labels = np.concatenate(labs)
np.save('bert_labels.npy', labels)

In [0]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, random_state=42, stratify=labels)

In [20]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters, n_jobs=-1, cv=5)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)



best parameters:  {'C': 5.263252631578947}


In [21]:
lr_clf = LogisticRegression(C=grid_search.best_params_['C'])
lr_clf.fit(train_features, train_labels)



LogisticRegression(C=5.263252631578947, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
lr_clf.score(train_features, train_labels)

0.9666666666666667

In [23]:
lr_clf.score(test_features, test_labels)

0.472

In [0]:
from sklearn.svm import SVC

In [25]:
parameters = {'C': np.linspace(0.001, 100, 20)}
grid_search = GridSearchCV(SVC(), parameters, n_jobs=-1, cv=5)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)



best parameters:  {'C': 31.579631578947367}


In [26]:
svc_clf = SVC(C=grid_search.best_params_['C'])
svc_clf.fit(train_features, train_labels)



SVC(C=31.579631578947367, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [27]:
svc_clf.score(train_features, train_labels)

0.6946666666666667

In [28]:
svc_clf.score(test_features, test_labels)

0.5

## Use Glove embeddings

In [0]:
from torchtext.vocab import GloVe

In [0]:
glove = GloVe(name = '6B', dim = 300)

In [0]:
np.save('embed.npy', glove.vectors.numpy())

In [0]:
pickle.dump(glove.stoi, open('stoi.pkl', 'wb'))

In [0]:
embeddings = np.load('embed.npy')
word_to_indx = pickle.load(open('stoi.pkl', 'rb'))

In [0]:
vocab_size, emb_sz = embeddings.shape

In [35]:
vocab_size, emb_sz

(400000, 300)

In [0]:
svm_df = sample.copy()
svm_df['text'] = svm_df.text.str.lower()
svm_df['embs'] = svm_df.text.apply(lambda x:np.array([embeddings[word_to_indx.get(word, 0)] for word in x.split()]).mean(0))

In [0]:
features = np.stack(svm_df['embs'].values)
np.save('svm_features.npy', features)

In [0]:
labels = np.stack(svm_df['y'].values)
np.save('svm_labels.npy', labels)

In [39]:
features.shape, labels.shape

((1000, 300), (1000,))

In [0]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, random_state=42, stratify=labels)

In [0]:
from sklearn.svm import SVC

In [42]:
parameters = {'C': np.linspace(0.001, 100, 20)}
grid_search = GridSearchCV(SVC(), parameters, n_jobs=-1, cv=5)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)



best parameters:  {'C': 89.4737894736842}


In [47]:
svc_clf = SVC(C=grid_search.best_params_['C'])
svc_clf.fit(train_features, train_labels)

SVC(C=89.4737894736842, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [44]:
svc_clf.score(train_features, train_labels)

0.5

In [45]:
svc_clf.score(test_features, test_labels)

0.36