In [1]:
!pip3 install /kaggle/input/autocorrect/autocorrect-2.6.1.tar
import pandas as pd
import transformers
from transformers import DebertaV2TokenizerFast, DebertaV2ForSequenceClassification
import torch
from torch import optim
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn import MSELoss
import numpy as np
import random 
import timeit
from tqdm import tqdm
import autocorrect

Processing /kaggle/input/autocorrect/autocorrect-2.6.1.tar
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25ldone
[?25h  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622363 sha256=fb6d4fac21aaa2a51050791d87bcddc117cdb9ef456a69827f3b8f7f50079c5e
  Stored in directory: /root/.cache/pip/wheels/db/69/42/0fb0421d2fe70d195a04665edc760cfe5fd341d7bb8d8e0aaa
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1




In [2]:
RANDOM_SEED = 42
MODEL_PATH = "/kaggle/input/debertav3base"
MAX_LENGTH = 512
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
EPOCHS = 2

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = "cuda" if torch.cuda.is_available() else "cpu"
transformers.utils.logging.set_verbosity_error()
spell = autocorrect.Speller(lang="en", fast=True)

In [3]:
train_summary_df = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv")
train_prompt_df = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv")
test_summary_df = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv")
test_prompt_df = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv")
submission_df = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv")

In [4]:
train_summary_df["text"] = train_summary_df["text"].apply(lambda x: spell(x))
train_summary_df.head()

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaoh these people wer...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


In [5]:
train_prompt_df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [6]:
test_summary_df["text"] = test_summary_df["text"].apply(lambda x: spell(x))
test_summary_df.head()

Unnamed: 0,student_id,prompt_id,text
0,000000ffffff,abc123,Example text 1
1,111111eeeeee,def789,Example text 2
2,222222cccccc,abc123,Example text 3
3,333333dddddd,def789,Example text 4


In [7]:
test_prompt_df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,abc123,Summarize...,Example Title 1,Heading\nText...
1,def789,Summarize...,Example Title 2,Heading\nText...


In [8]:
submission_df.head()

Unnamed: 0,student_id,content,wording
0,000000ffffff,0.0,0.0
1,111111eeeeee,0.0,0.0
2,222222cccccc,0.0,0.0
3,333333dddddd,0.0,0.0


In [9]:
tokenizer = DebertaV2TokenizerFast.from_pretrained(MODEL_PATH)
model = DebertaV2ForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=2).to(device)



In [10]:
train_df = train_prompt_df.merge(train_summary_df, on="prompt_id")
train_df["inputs"] = train_df["prompt_question"] + " " + train_df["prompt_title"] + " " + tokenizer.sep_token + train_df["text"]
train_df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording,inputs
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,Summarize at least 3 elements of an ideal trag...
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,Summarize at least 3 elements of an ideal trag...
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,Summarize at least 3 elements of an ideal trag...
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471,Summarize at least 3 elements of an ideal trag...
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,Summarize at least 3 elements of an ideal trag...


In [11]:
train_df["inputs"][0]

'Summarize at least 3 elements of an ideal tragedy, as described by Aristotle. On Tragedy [SEP]1 element of an ideal tragedy is that it should be arranged on a complex plan.  Another element of an ideal tragedy is that it should only have one main issue. The last element of an ideal tragedy is that it should have a double thread plot and an opposite catastrophe for both good and bad.'

In [12]:
test_df = test_prompt_df.merge(test_summary_df, on="prompt_id")
test_df["inputs"] = test_df["prompt_question"] + " " + test_df["prompt_title"] + " " + tokenizer.sep_token + test_df["text"]
test_df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,inputs
0,abc123,Summarize...,Example Title 1,Heading\nText...,000000ffffff,Example text 1,Summarize... Example Title 1 [SEP]Example text 1
1,abc123,Summarize...,Example Title 1,Heading\nText...,222222cccccc,Example text 3,Summarize... Example Title 1 [SEP]Example text 3
2,def789,Summarize...,Example Title 2,Heading\nText...,111111eeeeee,Example text 2,Summarize... Example Title 2 [SEP]Example text 2
3,def789,Summarize...,Example Title 2,Heading\nText...,333333dddddd,Example text 4,Summarize... Example Title 2 [SEP]Example text 4


In [13]:
max_length = train_df["inputs"].apply(lambda x: len(x)).max()
print(max_length)

4151


In [14]:
s = train_df["inputs"].str.len()
s.describe()

count    7165.000000
mean      562.672994
std       318.629232
min       219.000000
25%       355.000000
50%       469.000000
75%       667.000000
max      4151.000000
Name: inputs, dtype: float64

In [15]:
class SummaryTrainDataset(Dataset):
    def __init__(self, inputs, content, wording, tokenizer):
        self.scores = torch.tensor([list(x) for x in zip(content, wording)])
        self.encodings = tokenizer(inputs, padding=True, truncation=True, max_length=MAX_LENGTH)
        
    def __len__(self):
        return len(self.scores)
    
    def __getitem__(self, idx):
        out_dic = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        out_dic["scores"] = self.scores[idx]
        return out_dic
    
class SummarySubmitDataset(Dataset):
    def __init__(self, inputs, ids, tokenizer):
        self.ids = ids
        self.encodings = tokenizer(inputs, padding=True, truncation=True, max_length=MAX_LENGTH)
        
    def __len__(self):
        return len(self.ids)
    
    def __getitem__(self, idx):
        out_dic = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        out_dic["ids"] = self.ids[idx]
        return out_dic

In [16]:
dataset = SummaryTrainDataset(train_df["inputs"].to_list(), train_df["content"].to_list(), train_df["wording"].to_list(), tokenizer)
print("-"*30)
print(len(dataset))
print(dataset[0])
print("-"*30)

test_dataset = SummarySubmitDataset(test_df["inputs"].to_list(), test_df["student_id"].to_list(), tokenizer)
print(len(test_dataset))
print(test_dataset[0])
print("-"*30)

------------------------------
7165
{'input_ids': tensor([     1, 105982,    288,    668,    404,   2019,    265,    299,   1949,
          8948,    261,    283,   1897,    293,  26446,    260,    589,  56195,
             2,    376,   3036,    265,    299,   1949,   8948,    269,    272,
           278,    403,    282,   6128,    277,    266,   1739,    741,    260,
          1811,   3036,    265,    299,   1949,   8948,    269,    272,    278,
           403,    364,    286,    311,    872,    889,    260,    279,    437,
          3036,    265,    299,   1949,   8948,    269,    272,    278,    403,
           286,    266,   1664,   3676,   4278,    263,    299,   3680,  21419,
           270,    462,    397,    263,    966,    260,      2,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0

In [17]:
generator = torch.Generator().manual_seed(RANDOM_SEED)
train_dataset, val_dataset = random_split(dataset, [0.9, 0.1], generator=generator)

In [18]:
train_dataloader = DataLoader(dataset=train_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=True)

val_dataloader = DataLoader(dataset=val_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=True)

test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=False)

In [19]:
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = MSELoss(reduction="mean")

start = timeit.default_timer()
for epoch in tqdm(range(EPOCHS), position=0, leave=True):
    model.train()
    train_running_loss = 0
    for idx, sample in enumerate(tqdm(train_dataloader, position=0, leave=True)):
        input_ids = sample["input_ids"].to(device)
        attention_mask = sample["attention_mask"].to(device)
        targets = sample["scores"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(targets, outputs["logits"])
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_running_loss += loss.item()
    train_loss = train_running_loss / (idx + 1)
    
    model.eval()
    val_running_loss = 0
    with torch.no_grad():
        for idx, sample in enumerate(tqdm(val_dataloader, position=0, leave=True)):
            input_ids = sample["input_ids"].to(device)
            attention_mask = sample["attention_mask"].to(device)
            targets = sample["scores"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            
            loss = criterion(targets, outputs["logits"])
            
            val_running_loss += loss.item()
        val_loss = val_running_loss / (idx + 1)
        
    print("-"*30)
    print(f"Train Loss EPOCH {epoch+1}: {train_loss:.4f}")
    print(f"Valid Loss EPOCH {epoch+1}: {val_loss:.4f}")
    print("-"*30)
stop = timeit.default_timer()
print(f"Training Time: {stop-start:.2f}s")

100%|██████████| 807/807 [08:39<00:00,  1.55it/s]
100%|██████████| 90/90 [00:19<00:00,  4.71it/s]
 50%|█████     | 1/2 [08:58<08:58, 538.79s/it]

------------------------------
Train Loss EPOCH 1: 0.3718
Valid Loss EPOCH 1: 0.2605
------------------------------


100%|██████████| 807/807 [08:38<00:00,  1.56it/s]
100%|██████████| 90/90 [00:19<00:00,  4.71it/s]
100%|██████████| 2/2 [17:55<00:00, 538.00s/it]

------------------------------
Train Loss EPOCH 2: 0.2323
Valid Loss EPOCH 2: 0.2414
------------------------------
Training Time: 1076.00s





In [20]:
torch.cuda.empty_cache()

In [22]:
contents = []
wordings = []
ids = []
model.eval()
with torch.no_grad():
    for idx, sample in enumerate(tqdm(test_dataloader, position=0, leave=True)):
        input_ids = sample["input_ids"].to(device)
        attention_mask = sample["attention_mask"].to(device)
        ids.extend(sample["ids"])
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)["logits"]
        
        contents.extend([float(i) for i in outputs[:,0]])
        wordings.extend([float(i) for i in outputs[:,1]])

100%|██████████| 1/1 [00:00<00:00, 40.31it/s]


In [24]:
submission_df = pd.DataFrame(list(zip(ids, contents, wordings)),
                            columns=["student_id", "content", "wording"])
submission_df.to_csv("submission.csv", index=False)
submission_df.head()

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.558995,-1.160506
1,222222cccccc,-1.591458,-1.214394
2,111111eeeeee,-1.568391,-1.172802
3,333333dddddd,-1.601415,-1.218777
