In [12]:
import pandas as pd

all_tweets_df = pd.read_csv('reduced_set')
all_tweets_df = all_tweets_df[:1000]
all_tweets_df['label'] = all_tweets_df['label'].replace({2:1})

In [13]:
from sklearn.model_selection import train_test_split

# Training = 0.7, validation = 0.1, test = 0.2
training_tweets, temp_tweets, training_labels, temp_labels = train_test_split(list(all_tweets_df['text']),list(all_tweets_df['label']), test_size=0.3, random_state=123)
val_tweets, test_tweets, val_labels, test_labels = train_test_split(temp_tweets, temp_labels, test_size=(1/3), random_state=123)

In [14]:
from transformers import ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
# tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
def tokenize_function(dataset):
    model_inputs = tokenizer(dataset['text'], padding="max_length", truncation=True, max_length=100)
    return model_inputs


loading file https://huggingface.co/google/electra-small-discriminator/resolve/main/vocab.txt from cache at /Users/brookegrantham/.cache/huggingface/transformers/ece45ade3e01224cf31fed8e183b306d17b84e8abd415363474cfe72274f7814.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/google/electra-small-discriminator/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/google/electra-small-discriminator/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/google/electra-small-discriminator/resolve/main/tokenizer_config.json from cache at /Users/brookegrantham/.cache/huggingface/transformers/8b3aea9f7242b3d19268df5b1bfed8f66e08671a72ac0809ada08e5ef1adc592.19eda9a6da5fb0e52a45200c95876729561dde16a69b9116953af6edca1d1e92
loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /Users/brookegrantham/.cache/hug

In [15]:
train = {'text': training_tweets, 'label': training_labels}
val = {'text': val_tweets, 'label': val_labels}
test = {'text': test_tweets, 'label': test_labels}

In [16]:
from datasets import Dataset

train_dataset = Dataset.from_dict(train)
val_dataset = Dataset.from_dict(val)
test_dataset = Dataset.from_dict(test)

Issue: There was NaN in the dataset not allowing it to be transfered to a Dataset type, thus checking the data condition in rough cells below.

In [17]:
# integers = [elm for elm in all_tweets_df['condition'] if not (isinstance(elm, int))]
# print(len(integers))

In [18]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [19]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="transformer_checkpoints",
    num_train_epochs=1,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [20]:
from transformers import Trainer, ElectraForSequenceClassification

model = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator', num_labels=2)

for param in model.electra.parameters():
    param.requires_grad = False


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /Users/brookegrantham/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.18.0",
  "type_vocab_size": 2

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=88, training_loss=0.696174144744873, metrics={'train_runtime': 25.9765, 'train_samples_per_second': 26.947, 'train_steps_per_second': 3.388, 'total_flos': 4022216520000.0, 'train_loss': 0.696174144744873, 'epoch': 1.0})

In [21]:
import numpy as np
import torch

def predict_nn(trained_model, test_dataset):

    output = trained_model(attention_mask=torch.tensor(test_dataset["attention_mask"]), input_ids=torch.tensor(test_dataset["input_ids"]))

    pred_labs = np.argmax(output["logits"].detach().numpy(), axis=1)

    gold_labs = test_dataset["label"]

    return gold_labs, pred_labs

gold_labs, pred_labs = predict_nn(model, test_dataset)

In [22]:
from sklearn.metrics import accuracy_score, f1_score
print("ELECTRA on the John Hopkins Twitter dataset:")
print(f'The accuracy score is {accuracy_score(pred_labs, gold_labs)}')
print(f'The f1-score is {f1_score(pred_labs,gold_labs)}')

ELECTRA on the John Hopkins Twitter dataset:
The accuracy score is 0.44
The f1-score is 0.125


In [23]:
pd.DataFrame(({'pred':pred_labs, 'gold':gold_labs, 'text':test_tweets}))

Unnamed: 0,pred,gold,text
0,0,0,Ive lost count of how many times my brother ha...
1,1,1,RT @pj3l408vwLgS3: Pittsburgh! Unbelievable sh...
2,0,0,@cyGZzOhKUEoRJO I'm going to up my exercises t...
3,0,0,@d_GElxXvY4 Please sign this Demi twitition ht...
4,0,1,Planning a 9/11 Tribute for @m91NviCM 530pm Sw...
...,...,...,...
95,0,1,@pNBLe0LqUd @rF_LpOVPx3M4D yes it was pretty a...
96,0,1,Woman Unaware She's Only Person On Acid At Jam...
97,0,1,@rjOa7pJ @b7HDUHsXQsL @lzgL6SpwuYm5ED ohhh who...
98,0,0,Pls keep the donations coming and help us to s...
