In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read files from it.
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from transformers import pipeline, DistilBertTokenizerFast, DistilBertTokenizer

In [3]:
# Keep the Drive folder in one place so both reads stay in sync if it moves.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks'

# Load the two splits; later cells read their 'label' and 'clean_text' columns.
ds_train = pd.read_csv(f'{DATA_DIR}/ds_train.csv')
ds_test = pd.read_csv(f'{DATA_DIR}/ds_test.csv')

In [4]:
# Pull the 'label' column out of each split for use as ground truth.
y_train, y_test = ds_train['label'], ds_test['label']

In [13]:
# Load the tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [14]:
# Demo input for the tokenizer
sentence = 'Tokenize this sentence using DistilBERT.'

# Encode the sentence; return_tensors='pt' asks for PyTorch tensors
tokens = tokenizer(sentence, return_tensors='pt')

# Flatten both encoded fields into plain Python lists
input_ids, attention_mask = (
    tokens[field].squeeze().tolist()
    for field in ('input_ids', 'attention_mask')
)

# Show the sentence next to its encoding
print(
    f'Original sentence:\n{sentence}\n',
    f'Input IDs:\n{input_ids}\n',
    f'Attention mask:\n{attention_mask}',
    sep='\n',
)

Original sentence:
Tokenize this sentence using DistilBERT.

Input IDs:
[101, 19204, 4697, 2023, 6251, 2478, 4487, 16643, 23373, 1012, 102]

Attention mask:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [15]:
# Decode each ID on its own to see which token it stands for.
for input_id in input_ids:
    decoded_piece = tokenizer.decode([input_id])
    print(f'{input_id}: {decoded_piece}')

101: [CLS]
19204: token
4697: ##ize
2023: this
6251: sentence
2478: using
4487: di
16643: ##sti
23373: ##lbert
1012: .
102: [SEP]


In [5]:
# Name the checkpoint once so the tokenizer and the pipeline cannot drift apart.
SENTIMENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the matching tokenizer and hand it to the pipeline explicitly, instead of
# loading a second copy that nothing uses.
tokenizer = DistilBertTokenizerFast.from_pretrained(SENTIMENT_MODEL)
classifier = pipeline(
    task="sentiment-analysis",
    model=SENTIMENT_MODEL,
    tokenizer=tokenizer,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [7]:
def predictions(text_list, classifier, max_length=512):
    """Run ``classifier`` over ``text_list`` and return 0/1 labels.

    Parameters
    ----------
    text_list : list
        Inputs passed to ``classifier`` in a single call.
    classifier : callable
        Called as ``classifier(text_list, padding=True, truncation=True,
        max_length=max_length)``; must return an iterable of mappings that
        each have a ``'label'`` key.
    max_length : int, default 512
        Forwarded unchanged to ``classifier``.

    Returns
    -------
    list of int
        ``1`` where the returned label equals ``'POSITIVE'``, otherwise ``0``,
        in the order the classifier returned its results.
    """
    raw_outputs = classifier(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    pred_labels = []
    for output in raw_outputs:
        # Anything other than the exact string 'POSITIVE' maps to 0.
        if output['label'] == 'POSITIVE':
            pred_labels.append(1)
        else:
            pred_labels.append(0)
    return pred_labels

In [18]:
# Classify every row's 'clean_text' in both splits; train runs before test.
train_p, test_p = (
    predictions(frame['clean_text'].tolist(), classifier)
    for frame in (ds_train, ds_test)
)

In [20]:
# Peek at the first 15 predicted labels of each split, one split per line.
print(train_p[:15], test_p[:15], sep='\n')

[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [23]:
def ev_model(y_true, y_pred):
    """Compute, print, and return classification metrics for one split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels for the split being evaluated.
    y_pred : array-like
        Predicted labels, aligned element-for-element with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, confusion_matrix,
        classification_report)`` in that order, each being the value returned
        by the scikit-learn function of the same name.
    """
    # Score against the labels that were passed in. Reading the global
    # ``y_test`` here would score every split against the test labels.
    accuracy_s = accuracy_score(y_true, y_pred)
    precision_s = precision_score(y_true, y_pred)
    recall_s = recall_score(y_true, y_pred)
    f1_s = f1_score(y_true, y_pred)
    confusion_m = confusion_matrix(y_true, y_pred)
    c_report = classification_report(y_true, y_pred)

    print(f'Accuracy: {accuracy_s}')
    print(f'Precision: {precision_s}')
    print(f'Recall: {recall_s}')
    print(f'F1 Score: {f1_s}')
    print('Confusion Matrix:')
    print(confusion_m)
    print('Classification Report:')
    print(c_report)

    return accuracy_s, precision_s, recall_s, f1_s, confusion_m, c_report

In [25]:
# Evaluate the training split: pass the training labels and training predictions.
# The call is the cell's last expression, so its returned tuple is also displayed.
print("Train Dataset Evaluation:")
ev_model(y_train, train_p)

Train Dataset Evaluation:
Accuracy: 0.796
Precision: 0.9183627317955676
Recall: 0.64976
F1 Score: 0.7610569715142429
Confusion Matrix:
[[11778   722]
 [ 4378  8122]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.94      0.82     12500
           1       0.92      0.65      0.76     12500

    accuracy                           0.80     25000
   macro avg       0.82      0.80      0.79     25000
weighted avg       0.82      0.80      0.79     25000



(0.796,
 0.9183627317955676,
 0.64976,
 0.7610569715142429,
 array([[11778,   722],
        [ 4378,  8122]]),
 '              precision    recall  f1-score   support\n\n           0       0.73      0.94      0.82     12500\n           1       0.92      0.65      0.76     12500\n\n    accuracy                           0.80     25000\n   macro avg       0.82      0.80      0.79     25000\nweighted avg       0.82      0.80      0.79     25000\n')

In [26]:
# Evaluate the test split: pass the test labels and test predictions.
# The call is the cell's last expression, so its returned tuple is also displayed.
print("Test Dataset Evaluation:")
ev_model(y_test, test_p)

Test Dataset Evaluation:
Accuracy: 0.79916
Precision: 0.9206884913938576
Recall: 0.65472
F1 Score: 0.7652531675160129
Confusion Matrix:
[[11795   705]
 [ 4316  8184]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.94      0.82     12500
           1       0.92      0.65      0.77     12500

    accuracy                           0.80     25000
   macro avg       0.83      0.80      0.79     25000
weighted avg       0.83      0.80      0.79     25000



(0.79916,
 0.9206884913938576,
 0.65472,
 0.7652531675160129,
 array([[11795,   705],
        [ 4316,  8184]]),
 '              precision    recall  f1-score   support\n\n           0       0.73      0.94      0.82     12500\n           1       0.92      0.65      0.77     12500\n\n    accuracy                           0.80     25000\n   macro avg       0.83      0.80      0.79     25000\nweighted avg       0.83      0.80      0.79     25000\n')