In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import Dataset


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhangkejia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Load the dataset
data_path = '../datasets/learning-agency-lab-automated-essay-scoring-2/train.csv'
test_path = '../datasets/learning-agency-lab-automated-essay-scoring-2/test.csv'
train_data = pd.read_csv(data_path)
test_data = pd.read_csv(test_path)
train_data.head(5)

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [17]:
unique_labels = train_data['score'].nunique()
print(f" {unique_labels}  unique lable")


 6  unique lable


In [18]:
# text preprocessing
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text) # remove punctuation
    text = text.lower() # lowercase text
    text = text.replace('\n', ' ') # remove new line
    text = text.replace('&nbsp;', ' ') # remove html space
    return text

train_data['full_text'] = train_data['full_text'].apply(clean_text)
test_data['full_text'] = test_data['full_text'].apply(clean_text)

train_data.head(5)

Unnamed: 0,essay_id,full_text,score
0,000d118,many people have car where they live the thing...,3
1,000fe60,i am a scientist at nasa that is discussing th...,3
2,001ab80,people always wish they had the same technolog...,4
3,001bdc0,we all heard about venus the planet without al...,4
4,002ba53,dear state senator this is a letter to argue ...,3


In [19]:
# Tokenization
stop_words = set(stopwords.words('english'))

def tokenize(text):
    tokens = text.split()
    # filter out the stopwords
    tokens = [word for word in tokens if word not in stop_words]
    processed_tokens = ' '.join(tokens)
    return processed_tokens

train_data['full_text'] = train_data['full_text'].apply(tokenize)
test_data['full_text'] = test_data['full_text'].apply(tokenize)
train_data['score'] = train_data['score'] - 1
train_data.head(5)

Unnamed: 0,essay_id,full_text,score
0,000d118,many people car live thing dont know use car a...,2
1,000fe60,scientist nasa discussing face mars explaining...,2
2,001ab80,people always wish technology seen movies best...,3
3,001bdc0,heard venus planet without almost oxygen earth...,3
4,002ba53,dear state senator letter argue favor keeping ...,2


In [20]:
train_df, valid_df = train_test_split(train_data, test_size=0.2, random_state=42)
train_df = train_df.drop(labels='essay_id', axis=1)
valid_df = valid_df.drop(labels='essay_id', axis=1)
train_df.shape, valid_df.shape

((13845, 2), (3462, 2))

In [21]:
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

In [22]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

loading file vocab.txt from cache at /Users/zhangkejia/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /Users/zhangkejia/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/tokenizer_config.json
loading configuration file config.json from cache at /Users/zhangkejia/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "la

In [23]:
for param in model.bert.parameters():
    param.requires_grad = False

for param in model.bert.encoder.layer[-2:].parameters():
    param.requires_grad = True

for param in model.classifier.parameters():
    param.requires_grad = True

In [24]:
def preprocess_function(essay):
    # Tokenize the texts
    encoding = tokenizer(essay['full_text'], truncation=True, padding='max_length', max_length=128)
    # Add labels to the encoding
    encoding['labels'] = essay['score']
    return encoding

train_dataset = train_dataset.map(preprocess_function)
test_dataset = valid_dataset.map(preprocess_function)

  0%|          | 0/13845 [00:00<?, ?ex/s]

  0%|          | 0/3462 [00:00<?, ?ex/s]

In [25]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [27]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: score, __index_level_0__, full_text. If score, __index_level_0__, full_text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 13845
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2598
  Number of trainable parameters = 14180358


Epoch,Training Loss,Validation Loss
1,1.402,No log
2,1.2202,No log
3,1.1789,No log


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: score, __index_level_0__, full_text. If score, __index_level_0__, full_text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 0
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding a

TrainOutput(global_step=2598, training_loss=1.2539230387792668, metrics={'train_runtime': 2959.227, 'train_samples_per_second': 14.036, 'train_steps_per_second': 0.878, 'total_flos': 2732177542049280.0, 'train_loss': 1.2539230387792668, 'epoch': 3.0})

In [28]:
results = trainer.evaluate()
print(results)

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: score, __index_level_0__, full_text. If score, __index_level_0__, full_text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 0
  Batch size = 16


{'eval_runtime': 0.0094, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0, 'epoch': 3.0}


In [29]:
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

Configuration saved in ./saved_model/config.json
Model weights saved in ./saved_model/pytorch_model.bin
tokenizer config file saved in ./saved_model/tokenizer_config.json
Special tokens file saved in ./saved_model/special_tokens_map.json


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

In [33]:
import torch

loaded_model = BertForSequenceClassification.from_pretrained('./saved_model')
loaded_tokenizer = BertTokenizer.from_pretrained('./saved_model')


test_encodings = loaded_tokenizer(test_data['full_text'].tolist(), truncation=True, padding=True, return_tensors="pt")

with torch.no_grad():
    outputs = loaded_model(**test_encodings)
    predictions = torch.argmax(outputs.logits, dim=-1)
    predictions = predictions + 1

print(predictions)

loading configuration file ./saved_model/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1

tensor([3, 3, 4])
