# Importul librariilor

In [None]:
!pip3 install torch torchvision

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import TFAutoModel

# Utils

In [None]:
# Mapping function applied to every dataset element
def map_func(input_ids, masks, labels):
    """Repack one (input_ids, masks, labels) element as (features, labels).

    The features are a dict keyed by 'input_ids' and 'attention_mask';
    the labels are passed through unchanged.
    """
    features = {'input_ids': input_ids, 'attention_mask': masks}
    return features, labels

# Citirea datelor

In [None]:
# Load the input CSV straight from GitHub into a DataFrame
# (presumably `?raw=True` makes GitHub serve the raw file rather than the HTML page).
url = 'https://github.com/berinde/curs-analiza-datelor-complexe/blob/main/data/input/1.input_data.csv?raw=True'
reviews = pd.read_csv(url)
# Print (rows, columns), then preview the first two rows as the cell output
print(reviews.shape)
reviews.head(2)

# Prepararea datelor

In [None]:
# Convert the 'rating' column from float to int; the integer values are used
# later as column indices when building the one-hot label matrix.
reviews['rating'] = reviews['rating'].astype('int')

## Tokenizare

In [None]:
SEQ_LEN = 50  # every sequence is truncated or padded to exactly 50 tokens

# Pretrained tokenizer of the 'bert-base-cased' checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

def tokenize(sentence):
    """Tokenize one sentence into fixed-length BERT inputs.

    The sentence is truncated or padded to SEQ_LEN tokens and the model's
    special tokens are added.

    Args:
        sentence: the text to encode.

    Returns:
        A tuple (input_ids, attention_mask) of TensorFlow tensors, as
        produced by the tokenizer with return_tensors='tf'.
    """
    # Calling the tokenizer directly is the supported replacement for the
    # deprecated encode_plus(); it accepts the same keyword arguments.
    tokens = tokenizer(sentence, max_length=SEQ_LEN,
                       truncation=True, padding='max_length',
                       add_special_tokens=True, return_attention_mask=True,
                       return_token_type_ids=False, return_tensors='tf')
    return tokens['input_ids'], tokens['attention_mask']

# Pre-allocate the two model-input arrays, one row per review.
# Token ids and attention-mask values are integers, so they are stored as
# int32 (the dtype declared by the model's input layers) rather than NumPy's
# default float64, which doubles the memory and relies on an implicit cast.
Xids = np.zeros((len(reviews), SEQ_LEN), dtype=np.int32)
Xmask = np.zeros((len(reviews), SEQ_LEN), dtype=np.int32)

# Fill row i with the token ids / attention mask of review i
for i, sentence in enumerate(reviews['text']):
    Xids[i, :], Xmask[i, :] = tokenize(sentence)
    if i % 10000 == 0:
        print(i)  # progress indicator, every 10000 reviews

In [None]:
# One-hot encode the ratings: one row per review, one column per rating
# value from 0 up to the highest rating present.
arr = reviews['rating'].values
labels = np.eye(arr.max() + 1)[arr]  # row k of the identity matrix is the one-hot vector for rating k

## Salvarea datelor tokenizate

In [None]:
# Persist the tokenized inputs and the one-hot labels as .npy files
for filename, array in (('xids.npy', Xids),
                        ('xmask.npy', Xmask),
                        ('labels.npy', labels)):
    with open(filename, 'wb') as f:
        np.save(f, array)

## Mapare

In [None]:
BATCH_SIZE = 32  # number of samples per batch

# Build a tf.data dataset whose elements are (input_ids, mask, label) rows
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

# Repack every element as ({'input_ids': ..., 'attention_mask': ...}, label)
dataset = dataset.map(map_func)

# Shuffle, then batch.
# reshuffle_each_iteration=False is essential: by default the dataset is
# reshuffled every time it is iterated, so the take()/skip() split done
# afterwards would be drawn from a different ordering on each pass and
# validation batches would contain samples already seen in training.
# With a fixed order the two parts are disjoint (the trade-off is that the
# batch order is the same on every epoch).
dataset = dataset.shuffle(9500, reshuffle_each_iteration=False).batch(BATCH_SIZE)

## Train test split

In [None]:
SPLIT = 0.8  # 80-20 split
# Number of batches. The dataset's cardinality is known, so len() gives the
# count directly without running the whole pipeline once just to count.
DS_LEN = len(dataset)
TRAIN_BATCHES = round(DS_LEN * SPLIT)

train = dataset.take(TRAIN_BATCHES)  # first 80% of the batches
val = dataset.skip(TRAIN_BATCHES)  # skip those and keep the remaining 20%

del dataset  # optional: drop the reference to the full dataset to free memory

# Model

In [None]:
# Load the pretrained BERT model (TensorFlow weights)
bert = TFAutoModel.from_pretrained('bert-base-cased')

# Two integer inputs, named after the keys produced by map_func. Their length
# is tied to SEQ_LEN so it always matches the tokenizer's padding length.
input_ids = tf.keras.layers.Input(shape=(SEQ_LEN,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(SEQ_LEN,), name='attention_mask', dtype='int32')

# Element 0 of the BERT output is the per-token sequence of hidden states
embeddings = bert(input_ids, attention_mask=mask)[0]

# Classification head on top of the BERT embeddings
X = tf.keras.layers.LSTM(64)(embeddings)
X = tf.keras.layers.BatchNormalization()(X)
X = tf.keras.layers.Dense(64, activation='relu')(X)
X = tf.keras.layers.Dropout(0.1)(X)
# 6 output units: must equal the width of the one-hot label matrix
y = tf.keras.layers.Dense(6, activation='softmax', name='outputs')(X)

# Define the input and output layers of the model
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# Freeze the BERT layer - otherwise we would be training 100M+ parameters.
# Referencing `bert` directly avoids depending on its position in model.layers.
bert.trainable = False

## Training

In [None]:
# Print the model's layers, output shapes and parameter counts
model.summary()

In [None]:
# Training configuration: accuracy metric, one-hot (categorical)
# cross-entropy loss, and the Adam optimizer with a 0.01 learning rate.
acc = tf.keras.metrics.CategoricalAccuracy(name='accuracy')
loss = tf.keras.losses.CategoricalCrossentropy()  # categorical = one-hot
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

model.compile(loss=loss, optimizer=optimizer, metrics=[acc])

# Train for a single epoch, validating on the held-out batches
history = model.fit(train, epochs=1, validation_data=val)

In [None]:
# Evaluate on the validation batches; the result unpacks into the loss value
# followed by the metric configured at compile time.
# NOTE: this rebinds `loss`, which previously held the loss-function object.
loss, accuracy = model.evaluate(val)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

# Predict

In [None]:
# Hand-written sentences used to try out the trained model
examples = [
    'this is such an amazing meal!',
    'The meal was great!',
    'The dinner was meh.',
    'The lunch was okish.',
    'The meal was terrible...',
]

# Expected rating of each example, in the same order
label = [5, 4, 1, 2, 1]

# One-hot encode the expected ratings: one column per value from 0 up to the
# highest rating listed. `label` is rebound from the list to the matrix.
arr = np.asarray(label)
label = np.eye(arr.max() + 1)[arr]

In [None]:
# Tokenize the example sentences into integer input arrays, one row each.
# int32 matches the dtype declared by the model's input layers (token ids
# and mask values are integers, not floats).
# NOTE: this rebinds Xids / Xmask, replacing the arrays built for training.
Xids = np.zeros((len(examples), SEQ_LEN), dtype=np.int32)
Xmask = np.zeros((len(examples), SEQ_LEN), dtype=np.int32)

for i, sentence in enumerate(examples):
    Xids[i, :], Xmask[i, :] = tokenize(sentence)

In [None]:
# Build the tf.data dataset for the example sentences
dataset_pred = tf.data.Dataset.from_tensor_slices((Xids, Xmask, label))

# Repack every element as ({'input_ids': ..., 'attention_mask': ...}, label)
dataset_pred = dataset_pred.map(map_func)

# Batch only - deliberately no shuffle: inference does not benefit from it,
# and keeping the original order means row i of the predictions corresponds
# to examples[i].
dataset_pred = dataset_pred.batch(BATCH_SIZE)

In [None]:
# Run the model on the example dataset; each output row is the softmax
# distribution over the 6 output classes.
model.predict(dataset_pred)

In [None]:
# Evaluate on the hand-labelled examples; the result unpacks into the loss
# value followed by the metric configured at compile time.
# NOTE: this rebinds `loss` and `accuracy` from the earlier evaluation.
loss, accuracy = model.evaluate(dataset_pred)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

## Simpler example

https://www.tensorflow.org/text/tutorials/classify_text_with_bert

## Transformer from scratch

https://www.kaggle.com/code/renaudmathieu/transformer-from-scratch