In [None]:
!pip install transformers datasets

In [None]:
!pip install transformers[sentencepiece]

In [None]:
import io
import re
import string

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import Dataset
from google.colab import files
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from psutil import virtual_memory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam, AdamW
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TFAutoModel
from transformers import TFAutoModelForSequenceClassification

In [None]:
# Open Colab's upload dialog. The result is indexed by file name further down
# and its value is wrapped in io.BytesIO, i.e. it holds the raw file bytes.
uploaded = files.upload()

Saving homomex_training.csv to homomex_training.csv


In [None]:
# Parse the uploaded training CSV straight from memory.
# NOTE(review): the label encoder fitted below reports a NaN class. pandas
# turns strings such as "NA" into missing values by default -- confirm whether
# the third label is really missing or needs `keep_default_na=False`.
training_csv_bytes = uploaded['homomex_training.csv']
task1_df = pd.read_csv(io.BytesIO(training_csv_bytes))

In [None]:
# Learn the string-label -> integer-id mapping from the full label column.
le = LabelEncoder().fit(task1_df['label'])

In [None]:
# Show the learned classes; a class's position in this list is its integer id.
le.classes_.tolist()

['NP', 'P', nan]

In [None]:
SEED = 2

# Seed Python's `random`, NumPy and TensorFlow in one call. Without this only
# the train/validation split is deterministic; dataset shuffling, dropout and
# the freshly initialised classification head would differ on every run.
tf.keras.utils.set_random_seed(SEED)

In [None]:
# Hold out 20% of the tweets for validation; the split is reproducible via SEED.
all_texts = list(task1_df['tweets'])
all_label_ids = list(le.transform(task1_df['label']))

train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_label_ids,
    test_size=0.2,
    shuffle=True,
    random_state=SEED,
)

In [None]:
# Sanity check: number of training examples, then number of validation examples.
print(len(train_texts), len(val_texts), sep="\n")

5600
1400


In [None]:
# Hub id of the pretrained checkpoint; reused later when loading the model.
checkpoint = "microsoft/mdeberta-v3-base"

# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# `truncation=True` on its own does nothing for this tokenizer: it has no
# predefined maximum length, so the library falls back to "no truncation"
# (that is the warning this cell used to print). Give it an explicit cap.
# NOTE(review): 512 is assumed to be the checkpoint's position limit -- confirm
# against the model config; a smaller value would also speed up training.
MAX_LENGTH = 512

train_encodings = tokenizer(
    train_texts, truncation=True, padding=True, max_length=MAX_LENGTH
)
val_encodings = tokenizer(
    val_texts, truncation=True, padding=True, max_length=MAX_LENGTH
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Store the integer labels next to the tokenizer outputs so each split can be
# turned into a dataset with a single call.
for split_encodings, split_labels in (
    (train_encodings, train_labels),
    (val_encodings, val_labels),
):
    split_encodings['labels'] = split_labels

In [None]:
# Inspect which fields the encoding holds; these are the column names passed
# to `to_tf_dataset` further down.
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [None]:
# Training hyperparameters: examples per batch and number of passes over the data.
batch_size, num_epochs = 32, 5

In [None]:
# Collator that pads each batch and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


def _to_tf_dataset(encodings, shuffle):
    """Wrap tokenizer encodings (with a 'labels' entry) as a batched tf.data dataset.

    Args:
        encodings: mapping holding 'attention_mask', 'input_ids',
            'token_type_ids' and 'labels'.
        shuffle: whether the resulting dataset shuffles its examples.

    Returns:
        The dataset produced by `Dataset.to_tf_dataset`, batched with
        `batch_size` and collated with `data_collator`.
    """
    return Dataset.from_dict(encodings).to_tf_dataset(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        label_cols=["labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=batch_size,
    )


# Shuffle only the training data; the validation order must stay fixed so the
# predictions line up with `val_encodings['labels']`.
tf_train_dataset = _to_tf_dataset(train_encodings, shuffle=True)
tf_validation_dataset = _to_tf_dataset(val_encodings, shuffle=False)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Total optimizer steps: batches per epoch times number of epochs.
num_train_steps = num_epochs * len(tf_train_dataset)

# Learning rate decays from 5e-5 down to 0 over the whole run.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    decay_steps=num_train_steps,
    end_learning_rate=0.0,
)

# The loss consumes raw logits and integer class ids.
loss = SparseCategoricalCrossentropy(from_logits=True)
opt = AdamW(learning_rate=lr_scheduler)

In [None]:
# Load the pretrained encoder with a new classification head.
# The head size is taken from the fitted label encoder, so it cannot drift out
# of sync with the label ids the loss will see.
# `from_pt=True` converts the checkpoint's PyTorch weights into the TF model.
model = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(le.classes_),
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.2,
    from_pt=True,
)

Downloading pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'deberta.embeddings.word_embeddings._weight', 'deberta.embeddings.position_embeddings._weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForSequenceClassification f

In [None]:
# Wire the optimizer, loss and accuracy metric into the Keras model.
model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])

In [None]:
# Fine-tune for `num_epochs` epochs, evaluating on the validation set after each one.
model.fit(
    x=tf_train_dataset,
    epochs=num_epochs,
    validation_data=tf_validation_dataset,
)

Epoch 1/5


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7c35925a50>

In [None]:
# Run the fine-tuned model over the (unshuffled) validation set and keep the logits.
validation_output = model.predict(tf_validation_dataset)
preds = validation_output["logits"]

# Predicted class id = index of the highest logit in each row.
class_preds = np.argmax(preds, axis=1)



In [None]:
# scikit-learn expects (y_true, y_pred). With the arguments swapped, precision
# and recall trade places and "support" counts predictions instead of gold labels.
print(classification_report(val_encodings['labels'], class_preds))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88       872
           1       0.59      0.53      0.56       174
           2       0.83      0.88      0.85       354

    accuracy                           0.84      1400
   macro avg       0.77      0.76      0.77      1400
weighted avg       0.83      0.84      0.83      1400



In [None]:
# scikit-learn expects (y_true, y_pred): rows are the true classes, columns the
# predicted ones. Passing them the other way round transposes the matrix.
confusion_matrix(val_encodings['labels'], class_preds)

array([[768,  56,  48],
       [ 66,  93,  15],
       [ 36,   8, 310]])

In [None]:
# Authenticate against the Hugging Face Hub through the interactive login widget,
# so no access token has to be written into the notebook.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the fine-tuned TF model. `from_pt` is an option for *loading* PyTorch
# weights and has no meaning when pushing, so it is no longer passed.
model.push_to_hub("cybersyn/mdeberta-homomex-track1")

tf_model.h5:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Upload the tokenizer to the same Hub repository as the model.
tokenizer.push_to_hub("cybersyn/mdeberta-homomex-track1")

CommitInfo(commit_url='https://huggingface.co/cybersyn/mdeberta-homomex-track1/commit/3ce57fd2af119ea0060f0c1761bb86cc53f2ed82', commit_message='Upload tokenizer', commit_description='', oid='3ce57fd2af119ea0060f0c1761bb86cc53f2ed82', pr_url=None, pr_revision=None, pr_num=None)