In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import torch
!pip install datasets
from datasets import load_dataset
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

In [None]:
from transformers import AutoModel, DistilBertTokenizer

In [None]:
# Load the 'yelp_review_full' dataset (downloaded on first use).
yelp = load_dataset('yelp_review_full')
# %%
# Display the resulting DatasetDict (train and test splits with 'label' and 'text' features).
yelp

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/299M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [None]:
# Keep only the first 1,000 rows of the train split (presumably to keep the experiment small).
train_ds=yelp['train'].select(range(1000))

In [None]:
# Checkpoint name used for both the model and the tokenizer.
model_name = 'distilbert-base-uncased'
# Device string the feature-extraction model is moved to.
device = 'cpu'


In [None]:
# Load the pretrained checkpoint and place it on the configured device.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)
# Tokenizer belonging to the same checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Tokenize a sample sentence, requesting PyTorch tensors, to inspect the encoding.
text='hello i am super man or superman ?'
encoded_txt=tokenizer(text,return_tensors='pt')
# Display the encoding ('input_ids' and 'attention_mask').
encoded_txt

{'input_ids': tensor([[  101,  7592,  1045,  2572,  3565,  2158,  2030, 10646,  1029,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Map the ids of the (single) encoded sentence back to their token strings.
tokens=tokenizer.convert_ids_to_tokens(encoded_txt['input_ids'][0])
tokens

['[CLS]', 'hello', 'i', 'am', 'super', 'man', 'or', 'superman', '?', '[SEP]']

In [None]:
# Join the token list back into one string.
tokenizer.convert_tokens_to_string(tokens)

'[CLS] hello i am super man or superman ? [SEP]'

In [None]:
# Size of the tokenizer's vocabulary.
tokenizer.vocab_size

30522

In [None]:
# Maximum sequence length reported by the tokenizer for this checkpoint.
max_context_length = tokenizer.model_max_length
max_context_length

512

In [None]:
def tokenize_text(batch):
  """Tokenize the ``'text'`` entry of ``batch`` with the notebook's tokenizer.

  Sequences are padded with ``padding='max_length'`` and truncated when too
  long; the encoding is returned as PyTorch tensors.
  """
  texts = batch['text']
  encoded = tokenizer(
      texts,
      padding='max_length',
      truncation=True,
      return_tensors='pt',
  )
  return encoded

In [None]:
# Tokenize the training subset in batches of 128 rows.
yelp_encoding = train_ds.map(tokenize_text,batched=True,batch_size=128)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Have the listed columns returned in 'torch' format when the dataset is accessed.
yelp_encoding.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Display the dataset summary, now including the tokenizer columns.
yelp_encoding

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

In [None]:
# Names of the inputs the tokenizer produces; used below to filter batch columns.
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [None]:
def get_last_hidden_state(batch):
    """Return the hidden state of the first token for every sequence in ``batch``.

    Only the entries of ``batch`` whose keys are listed in
    ``tokenizer.model_input_names`` are forwarded to the model. They are moved
    to the device the model lives on, so the function keeps working when the
    model is placed on a GPU.

    Args:
        batch: mapping of column name to tensor, as produced by a torch-formatted
            dataset in a batched ``map`` call.

    Returns:
        dict with a single key ``'hidden_state'`` holding the first-token
        hidden state of each sequence, on the CPU.
    """
    target_device = model.device
    inputs = {
        name: tensor.to(target_device)
        for name, tensor in batch.items()
        if name in tokenizer.model_input_names
    }
    # No gradients are needed for feature extraction.
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # [:, 0] refers to CLS token for complete sentence representation.
    # Bring the result back to the CPU so it can be stored in the dataset.
    return {'hidden_state': last_hidden_state[:, 0].cpu()}

In [None]:
# Run the model over the tokenized dataset in batches of 128 rows.
yelp_hidden_states = yelp_encoding.map(get_last_hidden_state, batched=True, batch_size=128)  # will have additional column 'hidden_state'

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Display the dataset summary, now including the 'hidden_state' column.
yelp_hidden_states

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask', 'hidden_state'],
    num_rows: 1000
})

In [None]:
import joblib
# Persist the dataset with the extracted hidden states to a file in the working directory.
joblib.dump(yelp_hidden_states, 'yelp_hidden_states.joblib')

['yelp_hidden_states.joblib']

In [None]:
# Reload the cached hidden states from the same relative path they were dumped to,
# rather than an absolute '/content/...' path that only exists on Colab.
# NOTE: joblib files are pickles — only load files this notebook wrote itself.
yelp_hidden_states = joblib.load('yelp_hidden_states.joblib')

In [None]:
# Positional split: rows before `cutoff` are used for training, the rest for testing.
cutoff = 800
hidden_col = yelp_hidden_states['hidden_state']
label_col = yelp_hidden_states['label']
X_train, X_test = np.array(hidden_col[:cutoff]), np.array(hidden_col[cutoff: ])
y_train, y_test = np.array(label_col[:cutoff]), np.array(label_col[cutoff: ])
# Sanity-check the resulting array shapes.
print(f"X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}")
print(f"X_test.shape: {X_test.shape}, y_test.shape: {y_test.shape}")

X_train.shape: (800, 768), y_train.shape: (800,)
X_test.shape: (200, 768), y_test.shape: (200,)


In [None]:
# Baseline classifier using the 'most_frequent' strategy.
dummy_model = DummyClassifier(strategy='most_frequent')
dummy_model.fit(X_train, y_train)
# Score on the held-out rows; the value is shown as the cell output.
dummy_model.score(X_test, y_test)

0.165

In [None]:
# Linear-kernel SVM trained on the extracted hidden states.
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train, y_train)
# Score on the held-out rows; the value is shown as the cell output.
svm_model.score(X_test, y_test)

0.395

In [None]:
# Logistic regression on the extracted hidden states.
# With the default iteration limit the solver stopped with
# "TOTAL NO. of ITERATIONS REACHED LIMIT", so the limit is raised to let it converge.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)
# Score on the held-out rows; the value is shown as the cell output.
lr_model.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.425

In [None]:
# Random forest trained on the extracted hidden states (seeded for repeatability).
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
# Score on the held-out rows; the value is shown as the cell output.
rf_model.score(X_test, y_test)

0.33

In [None]:
######## Fine-tuning the model

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import joblib
import torch
from torch.nn.functional import cross_entropy
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.dummy import DummyClassifier

from transformers import AutoModelForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments

In [None]:
from datasets import DatasetDict

In [None]:
# Checkpoint name used for both the classification model and the tokenizer.
model_name = 'distilbert-base-uncased'
# Use the GPU when one is available instead of assuming it; fall back to the CPU otherwise.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Number of output classes for the classification head
# (presumably the five Yelp star ratings — confirm against the dataset card).
num_labels = 5
# Load the checkpoint with a sequence-classification head sized for `num_labels`.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Rows 0-799 are used for training, rows 800-999 for evaluation.
train_ds = yelp_hidden_states.select(range(800))
eval_ds = yelp_hidden_states.select(range(800, 1000))
# Print the 'input_ids' shape of one row from each split and of row 800 of the source dataset.
for sample in (train_ds[0], eval_ds[0], yelp_hidden_states[800]):
    print(sample['input_ids'].shape)

torch.Size([512])
torch.Size([512])
torch.Size([512])


In [None]:
# Bundle the two splits under the keys 'train' and 'test'.
yelp_ds_dict = DatasetDict({'train': train_ds, 'test':eval_ds})

In [None]:
# Batch size handed to the training configuration.
batch_size = 8

In [None]:
# NOTE(review): no visible cell imports or uses bitsandbytes — confirm this install is still needed.
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1


In [None]:
# Show the currently configured device string.
device

'cuda'

In [None]:
training_args = TrainingArguments(
    # --- where artifacts go ---
    output_dir='./results',                   # checkpoints and other outputs
    logging_dir='./logs',                     # log files
    save_strategy='epoch',
    push_to_hub=False,
    # --- optimisation schedule ---
    num_train_epochs=20,                      # passes over the training split
    learning_rate=2e-5,
    warmup_steps=500,                         # warmup steps of the learning-rate scheduler
    weight_decay=0.01,
    # --- batching ---
    per_device_train_batch_size=batch_size,   # training batch size per device
    per_device_eval_batch_size=batch_size,    # evaluation batch size per device
    # --- console output ---
    disable_tqdm=False,
    log_level='error',
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
# Wire the model, the training configuration and both dataset splits into a Trainer,
# then start fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=yelp_ds_dict['train'],
    eval_dataset=yelp_ds_dict['test'],
)
trainer.train()

NameError: name 'training_args' is not defined

In [None]:
# Evaluate the trained model with the Trainer; the result is shown as the cell output.
trainer.evaluate()

In [None]:
# Run the model over the test split; `preds` exposes `.predictions` and `.metrics` (used below).
preds = trainer.predict(yelp_ds_dict['test'])

In [None]:
# Metrics reported by the prediction run.
preds.metrics

In [None]:
# Index of the highest-scoring class for each test row.
np.argmax(preds.predictions, axis=1)

In [None]:
# Ground-truth labels of the test split and the highest-scoring predicted class per row.
true_classes = yelp_ds_dict['test']['label']
preds_classes = preds.predictions.argmax(axis=1)
# Confusion matrix of true vs. predicted classes, drawn as an annotated heatmap.
conf_mat = confusion_matrix(y_true=true_classes, y_pred=preds_classes)
sns.heatmap(data=conf_mat, annot=True)
# %% accuracy
accuracy_score(y_true=true_classes, y_pred=preds_classes)

In [None]:
# Baseline classifier using the 'most_frequent' strategy, fitted on the training labels.
# NOTE(review): the labels are passed as both X and y; presumably X is only a placeholder
# because this strategy does not look at the features — confirm.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(yelp_ds_dict['train']['label'], yelp_ds_dict['train']['label'])

In [None]:
# Score of the baseline on the test labels (labels are again passed as both X and y).
dummy_clf.score(yelp_ds_dict['test']['label'], yelp_ds_dict['test']['label'])

NameError: name 'dummy_clf' is not defined

In [None]:
# Move the model to the CPU for a manual forward pass over the whole test split.
model_cpu = model.to('cpu')
# Make sure dropout is switched off: the model may still be in training mode
# if this cell is run directly after training.
model_cpu.eval()
#%% Inference
with torch.no_grad():
    outputs = model_cpu(
        input_ids=yelp_ds_dict['test']['input_ids'],
        attention_mask=yelp_ds_dict['test']['attention_mask'],
    )
#%% Loss calculation
# Predicted class = index of the largest logit per row.
pred_labels = torch.argmax(outputs.logits, dim=1)
# reduction='none' keeps one loss value per review instead of averaging them.
loss = cross_entropy(outputs.logits, yelp_ds_dict['test']['label'], reduction='none')

NameError: name 'yelp_ds_dict' is not defined

In [None]:
# One row per test review: text, true label, predicted label and loss,
# ordered from the highest loss to the lowest.
df_individual_reviews = (
    pd.DataFrame(
        {
            'text': yelp_ds_dict['test']['text'],
            'label': yelp_ds_dict['test']['label'],
            'pred_label': pred_labels,
            'loss': loss,
        }
    )
    .sort_values('loss', ascending=False)
    .reset_index(drop=True)
)
# %%
df_individual_reviews

NameError: name 'yelp_ds_dict' is not defined

In [None]:
# Plot the loss against the true label.
sns.lineplot(data=df_individual_reviews, x='label', y='loss')