In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, BertForSequenceClassification, BertTokenizer
import torch

In [27]:
tokenizer = AutoTokenizer.from_pretrained("mental/mental-bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("mental/mental-bert-base-uncased")

Some weights of the model checkpoint at mental/mental-bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental

In [84]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [37]:
# Code from the hugging face tutorial
def print_encoding(model_inputs, indent=4):
    indent_str = " " * indent
    print("{")
    for k, v in model_inputs.items():
        print(indent_str + k + ":")
        print(indent_str + indent_str + str(v))
    print("}")

In [47]:
inputs = "I am feeling fantastic!"
tokenized_inputs = tokenizer(inputs, return_tensors="pt")
outputs = model(**tokenized_inputs)

labels = ['NEGATIVE', 'POSITIVE']
prediction = torch.argmax(outputs.logits)


print("Input:")
print(inputs)
print()
print("Tokenized Inputs:")
print_encoding(tokenized_inputs)
print()
print("Model Outputs:")
print(outputs)
print()
print(f"The prediction is {labels[prediction]}")

Input:
I am feeling fantastic!

Tokenized Inputs:
{
    input_ids:
        tensor([[  101,  1045,  2572,  3110, 10392,   999,   102]])
    token_type_ids:
        tensor([[0, 0, 0, 0, 0, 0, 0]])
    attention_mask:
        tensor([[1, 1, 1, 1, 1, 1, 1]])
}

Model Outputs:
SequenceClassifierOutput(loss=None, logits=tensor([[-0.0806, -0.0594]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

The prediction is POSITIVE


In [61]:
from datasets import load_dataset, DatasetDict

In [197]:
import csv
with open('total_labeled_nonan.csv', 'w', newline='') as writefile:
    with open('total_labeled_data.csv', newline='') as readfile:
        reader = csv.DictReader(readfile)
        writer = csv.writer(writefile, delimiter=',')
        writer.writerow(['label', 'text'])
        index = 0
        for row in reader:
            if row['post'] != 'nan' and row['post'] != '':
                writer.writerow([index, row['labels'], row['post']])
                index += 1
            

In [222]:
# Load in reddit data
dataset = load_dataset('csv', data_files='total_labeled_nonan.csv')

Using custom data configuration default-0bdb209e43ebd589
Reusing dataset csv (/Users/rhettowen/.cache/huggingface/datasets/csv/default-0bdb209e43ebd589/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# for x in range(len(dataset['train']['text'])):
#     if dataset['train']['text'][x] is None:
#         print(x)

In [200]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 14394
    })
})


In [223]:
# Take 11000 random examples for train and 3000 validation
dataset = DatasetDict(
    train=dataset['train'].shuffle(seed=1111).select(range(11000)),
    val=dataset['train'].shuffle(seed=1111).select(range(11000, 14000)),
)

Loading cached shuffled indices for dataset at /Users/rhettowen/.cache/huggingface/datasets/csv/default-0bdb209e43ebd589/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-6cbb389e7b158010.arrow
Loading cached shuffled indices for dataset at /Users/rhettowen/.cache/huggingface/datasets/csv/default-0bdb209e43ebd589/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-6cbb389e7b158010.arrow


In [None]:
# for x in range(len(dataset['train']['text'])):
#     if dataset['train']['text'][x] is None:
#         print(x)

In [224]:
small_tokenized_dataset = dataset.map(
    lambda example: tokenizer(example['text'], padding=True, truncation=True),
    batched=True,
    batch_size=16
)

small_tokenized_dataset = small_tokenized_dataset.remove_columns(["text"])
small_tokenized_dataset = small_tokenized_dataset.rename_column("label", "labels")
small_tokenized_dataset.set_format("torch")

  0%|          | 0/688 [00:00<?, ?ba/s]

  0%|          | 0/188 [00:00<?, ?ba/s]

In [216]:
# Tokenize the data
# HUGGING FACE ONE, NOT USING

# def preprocess_function(examples):
#     return tokenizer(examples["text"], truncation=True)

# tokenized_dataset = dataset.map(preprocess_function, batched=True)
# tokenized_train = dataset["train"].map(preprocess_function, batched=True)
# tokenized_val = dataset["val"].map(preprocess_function, batched=True)

Loading cached processed dataset at /Users/rhettowen/.cache/huggingface/datasets/csv/default-0bdb209e43ebd589/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-1669089c6295740a.arrow
Loading cached processed dataset at /Users/rhettowen/.cache/huggingface/datasets/csv/default-0bdb209e43ebd589/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-3e4e7f919a8abfef.arrow


In [None]:
# from transformers import DataCollatorWithPadding

# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# data_collator(tokenized_dataset)

In [237]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_tokenized_dataset['train'], batch_size=16)
eval_dataloader = DataLoader(small_tokenized_dataset['val'], batch_size=16)

In [235]:
small_tokenized_dataset['train'][0:2]

{'labels': tensor([1, 1]),
 'input_ids': tensor([[ 101, 2005, 2033,  ...,    0,    0,    0],
         [ 101, 1045, 7078,  ..., 5342, 2009,  102]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1]])}

In [234]:
# small_tokenized_dataset = small_tokenized_dataset.remove_columns(['__index_level_0__'])
# small_tokenized_dataset = small_tokenized_dataset.remove_columns(['token_type_ids'])

In [238]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm


model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

num_epochs = 3
num_training_steps = 3 * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

best_val_loss = float("inf")
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    # training
    model.train()
    for batch_i, batch in enumerate(train_dataloader):
        
        output = model(**batch)
        
        optimizer.zero_grad()
        output.loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)
    
    # validation
    model.eval()
    for batch_i, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            output = model(**batch)
        loss += output.loss
    
    avg_val_loss = loss / len(eval_dataloader)
    print(f"Validation loss: {avg_val_loss}")
    if avg_val_loss < best_val_loss:
        print("Saving checkpoint!")
        best_val_loss = avg_val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': best_val_loss,
            },
            f"checkpoints/epoch_{epoch}.pt"
        )  

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /Users/rhettowen/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base

  0%|          | 0/2064 [00:00<?, ?it/s]

KeyboardInterrupt: 