In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import Dataset


comet_ml is installed but `COMET_API_KEY` is not set.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhangkejia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the dataset
data_path = '../datasets/learning-agency-lab-automated-essay-scoring-2/train.csv'
test_path = '../datasets/learning-agency-lab-automated-essay-scoring-2/test.csv'
train_data = pd.read_csv(data_path)
test_data = pd.read_csv(test_path)
train_data.head(5)

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [3]:
unique_labels = train_data['score'].nunique()
print(f" {unique_labels}  unique lable")


 6  unique lable


In [4]:
# text preprocessing
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text) # remove punctuation
    text = text.lower() # lowercase text
    text = text.replace('\n', ' ') # remove new line
    text = text.replace('&nbsp;', ' ') # remove html space
    return text

train_data['full_text'] = train_data['full_text'].apply(clean_text)
test_data['full_text'] = test_data['full_text'].apply(clean_text)

train_data.head(5)

Unnamed: 0,essay_id,full_text,score
0,000d118,many people have car where they live the thing...,3
1,000fe60,i am a scientist at nasa that is discussing th...,3
2,001ab80,people always wish they had the same technolog...,4
3,001bdc0,we all heard about venus the planet without al...,4
4,002ba53,dear state senator this is a letter to argue ...,3


In [5]:
# Tokenization
stop_words = set(stopwords.words('english'))

def tokenize(text):
    tokens = text.split()
    # filter out the stopwords
    tokens = [word for word in tokens if word not in stop_words]
    processed_tokens = ' '.join(tokens)
    return processed_tokens

train_data['full_text'] = train_data['full_text'].apply(tokenize)
test_data['full_text'] = test_data['full_text'].apply(tokenize)
train_data['score'] = train_data['score'] - 1
train_data.head(5)

Unnamed: 0,essay_id,full_text,score
0,000d118,many people car live thing dont know use car a...,2
1,000fe60,scientist nasa discussing face mars explaining...,2
2,001ab80,people always wish technology seen movies best...,3
3,001bdc0,heard venus planet without almost oxygen earth...,3
4,002ba53,dear state senator letter argue favor keeping ...,2


In [6]:
train_df, valid_df = train_test_split(train_data, test_size=0.2, random_state=42)
train_df = train_df.drop(labels='essay_id', axis=1)
valid_df = valid_df.drop(labels='essay_id', axis=1)
train_df.shape, valid_df.shape

((13845, 2), (3462, 2))

In [7]:
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
for param in model.bert.parameters():
    param.requires_grad = False

for param in model.bert.encoder.layer[-2:].parameters():
    param.requires_grad = True

for param in model.classifier.parameters():
    param.requires_grad = True

In [10]:
def preprocess_function(essay):
    # Tokenize the texts
    encoding = tokenizer(essay['full_text'], truncation=True, padding='max_length', max_length=128)
    # Add labels to the encoding
    encoding['labels'] = essay['score']
    return encoding

train_dataset = train_dataset.map(preprocess_function)
test_dataset = valid_dataset.map(preprocess_function)

  0%|          | 0/13845 [00:00<?, ?ex/s]

  0%|          | 0/3462 [00:00<?, ?ex/s]

In [11]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [13]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: score, __index_level_0__, full_text. If score, __index_level_0__, full_text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 13845
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2598
  Number of trainable parameters = 14180358


Epoch,Training Loss,Validation Loss


[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 

In [31]:
results = trainer.evaluate()
print(results)

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, score, full_text. If __index_level_0__, score, full_text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 0
  Batch size = 16


AttributeError: 'NotebookTrainingTracker' object has no attribute 'value'