In [1]:
from transformers import pipeline
import pandas as pd
import numpy as np
import gensim
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, SimpleRNN, Dense,Flatten,Dropout,GRU
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report
import gensim.downloader as api
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt




from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification,DistilBertTokenizerFast
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint

from transformers import pipeline
from tqdm import tqdm  
import tensorflow as tf
import random
# One shared seed for the Python, NumPy and TensorFlow random generators.
# `seed` is also passed as `random_state` to the train/test splits below.
seed = 42
for _set_seed in (random.seed, np.random.seed, tf.random.set_seed):
    _set_seed(seed)
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the preprocessed review dataset; the first CSV column becomes the index.
df = pd.read_csv(
    filepath_or_buffer="../dataset_imdb_preprocessed.csv",
    index_col=0,
)
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,sentiment,lemmatized_review
0,1,one reviewer ha mentioned watching 1 oz episod...
1,1,wonderful little production filming technique ...
2,1,thought wa wonderful way spend time hot summer...
3,0,basically family little boy jake think zombie ...
4,1,petter mattei love time money visually stunnin...
...,...,...
49995,1,thought movie right good job wa creative origi...
49996,0,bad plot bad dialogue bad acting idiotic direc...
49997,0,catholic taught parochial elementary school nu...
49998,0,going disagree previous comment side maltin on...


# DistilBERT (Hugging Face) without fine-tuning

In [3]:
# Baseline: score an off-the-shelf sentiment checkpoint on the test split,
# with no training on this dataset.
df = pd.read_csv("../dataset_imdb_preprocessed.csv", index_col=0)

# Hold out 20% for testing, then 15% of the remainder for validation.
# Both splits are seeded so they are reproducible.
reviews, sentiments = df['lemmatized_review'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    reviews, sentiments, test_size=0.2, random_state=seed
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=seed
)

# The same checkpoint name supplies both the model and its tokenizer.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
classifier = pipeline("sentiment-analysis", model=checkpoint, tokenizer=checkpoint)

# The pipeline expects plain Python strings.
X_test_list = X_test.astype(str).tolist()

# One pipeline call per review; each call returns a one-element list, so keep
# its single {"label": ..., "score": ...} dict. truncation=True cuts reviews
# that exceed the model's maximum input length.
predictions = [
    classifier(review, truncation=True, padding=True)[0]
    for review in tqdm(X_test_list, desc="Processing", unit="review")
]

# Map the pipeline's string labels onto the dataset's 0/1 sentiment encoding.
y_pred = [int(result["label"] == "POSITIVE") for result in predictions]

# Binary classification metrics on the held-out test split.
f1, precision, recall, accuracy = (
    score_fn(y_test, y_pred)
    for score_fn in (f1_score, precision_score, recall_score, accuracy_score)
)

print(f"F1 Score στο test set: {f1:.4f}")
print(f"Precision στο test set: {precision:.4f}")
print(f"Recall στο test set: {recall:.4f}")
print(f"Accuracy στο test set: {accuracy:.4f}")

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Processing: 100%|██████████| 10000/10000 [59:39<00:00,  2.79review/s]    

F1 Score στο test set: 0.7625
Precision στο test set: 0.9109
Recall στο test set: 0.6557
Accuracy στο test set: 0.7942





# DistilBERT (Hugging Face) with fine-tuning

In [3]:
# Fine-tune DistilBERT for binary sentiment classification. The pretrained
# encoder is frozen, so only the classification head (pre_classifier and
# classifier) is trained.
df = pd.read_csv("../dataset_imdb_preprocessed.csv", index_col=0)

# Hold out 20% for testing, then 15% of the remainder for validation.
# Both splits are seeded so they are reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['lemmatized_review'], df['sentiment'], test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=seed)

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Freeze the pretrained encoder by name. Slicing `model.layers[:-3]` only
# worked because the encoder happens to be the first of four layers; addressing
# it explicitly does not depend on the internal layer ordering.
model.distilbert.trainable = False


def _encode_reviews(reviews):
    """Tokenize a pandas Series of reviews into padded, truncated encodings.

    Values are cast to ``str`` first so a missing review (NaN) cannot crash the
    tokenizer. Sequences are truncated to 512 tokens and padded to the longest
    sequence in the given split.
    """
    return tokenizer(reviews.astype(str).tolist(), truncation=True, padding=True, max_length=512)


train_encodings = _encode_reviews(X_train)
val_encodings = _encode_reviews(X_val)

train_labels = np.array(y_train.tolist())
val_labels = np.array(y_val.tolist())

# Each dataset element is (dict of tokenizer outputs, integer label).
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))

# A shuffle buffer smaller than the dataset only shuffles locally; use a buffer
# that covers the whole training set so every epoch sees a full reshuffle.
train_dataset = train_dataset.shuffle(len(train_labels)).batch(8)
val_dataset = val_dataset.batch(64)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# The model outputs raw logits, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
model.summary()

# Stop after 2 epochs without validation-loss improvement and roll back to the
# best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5,
    callbacks=[early_stopping]
)

# Evaluate on the held-out test split.
test_encodings = _encode_reviews(X_test)
test_labels = np.array(y_test.tolist())

test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels)).batch(64)

predictions = model.predict(test_dataset)
# Pick the class with the highest logit for each review.
predicted_labels = np.argmax(predictions.logits, axis=-1)

accuracy = accuracy_score(test_labels, predicted_labels)
recall = recall_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels)
f1 = f1_score(test_labels, predicted_labels)

print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_projector', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,955,010
Trainable params: 592,130
Non-trainable params: 66,362,880
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.8183
Recal