In [None]:
!pip install transformers datasets

In [None]:
import io
import re
import string

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import Dataset
from google.colab import files
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from psutil import virtual_memory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam, AdamW
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TFAutoModel
from transformers import TFAutoModelForSequenceClassification

In [None]:
# Open Colab's upload dialog. The result is indexed by file name further down
# and its values are wrapped in BytesIO, i.e. it maps file name -> raw bytes.
uploaded = files.upload()

Saving homomex_training.csv to homomex_training.csv


In [None]:
# Parse the uploaded training CSV from its in-memory bytes.
# NOTE(review): with pandas' default NA handling the fitted label classes came
# out as ['NP', 'P', nan], which suggests one label value (presumably the
# literal string "NA") was being converted to NaN — TODO confirm against the
# raw file. `keep_default_na=False` keeps every cell as the text that is
# actually in the CSV, so labels stay strings and tweets are never turned into
# float NaN before tokenization.
task1_df = pd.read_csv(
    io.BytesIO(uploaded['homomex_training.csv']),
    keep_default_na=False,
)

In [None]:
# Learn the mapping from label values in the `label` column to integer class
# ids; `fit` returns the encoder itself, so it can be bound in one step.
le = LabelEncoder().fit(task1_df["label"])

In [None]:
# Show the classes the encoder learned; the position of each entry in this
# list is the integer id that `le.transform` assigns to it.
list(le.classes_)

['NP', 'P', nan]

In [None]:
# Single seed shared by the train/validation split and the framework RNGs.
SEED = 2

# Seed Python's `random`, NumPy and TensorFlow in one call so that classifier
# head initialisation, dropout and dataset shuffling are repeatable between
# runs (previously only the data split was seeded).
tf.keras.utils.set_random_seed(SEED)

In [None]:
# Encode the labels as integer class ids once, so the same list serves both as
# the split target and as the stratification key.
encoded_labels = list(le.transform(task1_df['label']))

# 80/20 train/validation split. Stratifying keeps the class proportions the
# same in both partitions, which matters here because the classes are clearly
# imbalanced and the minority class would otherwise be at the mercy of the
# shuffle.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    list(task1_df['tweets']),
    encoded_labels,
    test_size=.2,
    random_state=SEED,
    shuffle=True,
    stratify=encoded_labels,
)

In [None]:
# Report the size of each partition (train first, then validation), one per line.
print(len(train_texts), len(val_texts), sep="\n")

5600
1400


In [None]:
# Hub identifier of the pretrained checkpoint; the same name is reused below
# when the classification model is loaded, so tokenizer and model always match.
checkpoint = "dccuchile/bert-base-spanish-wwm-cased" 
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

In [None]:
# Tokenize both partitions, truncating over-long inputs to the model's limit.
# Padding is deliberately NOT applied here: the DataCollatorWithPadding used
# when the tf.data pipelines are built pads each batch to the length of its
# own longest example. Padding everything up front to the global maximum made
# that collator a no-op and wasted compute on pad tokens in every batch.
train_encodings = tokenizer(train_texts, truncation=True)
val_encodings = tokenizer(val_texts, truncation=True)

In [None]:
# Store each partition's integer labels alongside its tokenizer output under
# the "labels" key, so features and targets travel together.
for _encodings, _labels in (
    (train_encodings, train_labels),
    (val_encodings, val_labels),
):
    _encodings['labels'] = _labels

In [None]:
# Sanity check: list the fields now held by the training encodings, including
# the "labels" entry attached in the previous step.
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [None]:
# Number of examples per batch in both the training and validation pipelines.
batch_size = 4
# Number of passes over the training set; also determines the length of the
# learning-rate decay schedule.
num_epochs = 4

In [None]:
# Collator that pads the examples of a batch and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


def _as_tf_dataset(encodings, shuffle):
    """Wrap tokenizer output (with a "labels" entry) as a batched tf.data.Dataset.

    Args:
        encodings: mapping holding "attention_mask", "input_ids",
            "token_type_ids" and "labels".
        shuffle: whether the resulting dataset shuffles its examples.

    Returns:
        The dataset produced by `Dataset.to_tf_dataset`, batched with the
        notebook-level `batch_size` and collated with `data_collator`.
    """
    return Dataset.from_dict(encodings).to_tf_dataset(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        label_cols=["labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=batch_size,
    )


# Training data is shuffled; validation keeps its order so that predictions
# line up with `val_encodings['labels']`.
tf_train_dataset = _as_tf_dataset(train_encodings, shuffle=True)
tf_validation_dataset = _as_tf_dataset(val_encodings, shuffle=False)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Total number of optimiser steps: batches per epoch times number of epochs.
num_train_steps = num_epochs * len(tf_train_dataset)

# Learning rate decays from 5e-5 to 0 over the whole training run.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.0,
    decay_steps=num_train_steps,
)

# The model emits raw logits, hence from_logits=True; labels are integer ids.
loss = SparseCategoricalCrossentropy(from_logits=True)
opt = AdamW(learning_rate=lr_scheduler)

In [None]:
# Load the pretrained encoder with a freshly initialised classification head.
# The head size is derived from the label encoder instead of being hard-coded,
# so it always matches the ids produced by `le.transform`.
# `from_pt=True` converts the checkpoint's PyTorch weights to TensorFlow.
model = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(le.classes_),
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1,
    from_pt=True,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Wire the optimiser and loss defined above into the model and track accuracy
# during training and validation.
model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])

In [None]:
# Fine-tune the model, evaluating on the validation set after every epoch.
# The returned History is kept so the per-epoch loss/accuracy curves can be
# inspected later (`history.history`) without re-running the training.
history = model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f9d00eb6fa0>

In [None]:
# Run inference on the (unshuffled) validation set and keep the raw logits.
preds = model.predict(tf_validation_dataset)["logits"]
# The predicted class for each example is the index of its largest logit.
class_preds = preds.argmax(axis=1)



In [None]:
# Per-class precision/recall/F1 on the validation set.
# scikit-learn expects (y_true, y_pred) in that order; passing them reversed
# swaps precision with recall and reports support for the predicted classes
# instead of the ground-truth classes.
print(classification_report(val_encodings['labels'], class_preds))

              precision    recall  f1-score   support

           0       0.92      0.89      0.90       899
           1       0.57      0.65      0.61       138
           2       0.85      0.87      0.86       363

    accuracy                           0.86      1400
   macro avg       0.78      0.80      0.79      1400
weighted avg       0.87      0.86      0.86      1400

