In [1]:
import pandas as pd
train_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')

To whom it may concern: I have implemented two types of models. The first model is regression with some preprocessing and data engineering, the second one is transformer-based. 

I have never properly deployed a model, so hope that I will learn it during the course :)

In [2]:
%env WANDB_DISABLED=true #for kaggle env

env: WANDB_DISABLED=true #for kaggle env


**DATA INVESTIGATION**

In [3]:
len(train_df)
train_df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [4]:
len(test_df)

7

In [5]:
train_df.isnull().sum()

id                   0
url_legal         2004
license           2004
excerpt              0
target               0
standard_error       0
dtype: int64

I will just drop 'url_legal' and 'license' columns

In [6]:
train_df.drop(['url_legal' , 'license'] , axis=1 , inplace = True)

In [7]:
train_df.head()

Unnamed: 0,id,excerpt,target,standard_error
0,c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,Once upon a time there were Three Bears who li...,0.247197,0.510845


**PREPROCESSING**

In [8]:
import re
import numpy as np

In [9]:
def preprocess(text):
    # removal of extra spaces
    regex_pat = re.compile(r'\s+')
    text = text.str.replace(regex_pat, ' ')
    
    # removal of punctuations and numbers
    punc_remove = text.str.replace("[^a-zA-Z]", " ")
    
    # remove whitespace with a single space
    new_text=punc_remove.str.replace(r'\s+', ' ')
    
    # remove leading and trailing whitespace
    new_text=new_text.str.replace(r'^\s+|\s+?$','')
    
    # replace normal numbers with numbr
    new_text=new_text.str.replace(r'\d+(\.\d+)?','numbr')
    
    # removal of capitalization
    text_lower = new_text.str.lower()
    
    # tokenizing
    tokenized_text = text_lower.apply(lambda x: x.split())
    
    for i in range(len(tokenized_text)):
        tokenized_text[i] = ' '.join(tokenized_text[i])
        texts_p= tokenized_text
    
    return texts_p

In [10]:
text = train_df.excerpt
preprocessed_text = preprocess(text)

train_df['preprocessed_excerpt'] = preprocessed_text

  import sys
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  


In [11]:
train_df.head()

Unnamed: 0,id,excerpt,target,standard_error,preprocessed_excerpt
0,c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009,when the young people returned to the ballroom...
1,85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,all through dinner time mrs fayre was somewhat...
2,b69ac6792,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,as roger had predicted the snow departed as qu...
3,dd1000b26,And outside before the palace a great garden w...,-1.054013,0.450007,and outside before the palace a great garden w...
4,37c1b32fb,Once upon a time there were Three Bears who li...,0.247197,0.510845,once upon a time there were three bears who li...


**DATA ENGINEERING**

In [12]:
train_df.describe()

Unnamed: 0,target,standard_error
count,2834.0,2834.0
mean,-0.959319,0.491435
std,1.033579,0.034818
min,-3.676268,0.0
25%,-1.69032,0.468543
50%,-0.91219,0.484721
75%,-0.20254,0.506268
max,1.71139,0.649671


In [13]:
def feature_engineering(df):
    def total_words(x):
        return len(x.split(" "))

    def total_unique_words(x):
        return len(np.unique(x.split(" ")))

    def total_charactors(x):
        x = x.replace(" ", "")
        return len(x)

    def total_sentence(x):
        x = x.replace("!", "[end]").replace("?", "[end]").replace(".", "[end]")
        return len(x.split("[end]"))

    df_ret = df[["id", "excerpt", "preprocessed_excerpt", "target", "standard_error"]].copy()
    excerpt = df["excerpt"].values
    df_ret["total_words"] = [total_words(x) for x in excerpt]
    df_ret["total_unique_words"] = [total_unique_words(x) for x in excerpt]
    df_ret["total_characters"] = [total_charactors(x) for x in excerpt]
    df_ret["total_sentence"] = [total_sentence(x) for x in excerpt]

    df_ret["div_sentence_characters"] = df_ret["total_sentence"] / df_ret["total_characters"]
    df_ret["div_sentence_words"] = df_ret["total_sentence"] / df_ret["total_words"]
    df_ret["div_characters_words"] = df_ret["total_characters"] / df_ret["total_words"]
    df_ret["div_words_unique_words"] = df_ret["total_words"] / df_ret["total_unique_words"]

    for i, word in enumerate(["!", "?", "(", ")", "'", '"', ";", ".", ","]):
        df_ret[f"count_word_special_{i}"] = [x.count(word) for x in excerpt]

    return df_ret.fillna(0)

In [14]:
fe_train_df = feature_engineering(train_df)

In [15]:
fe_train_df.head()

Unnamed: 0,id,excerpt,preprocessed_excerpt,target,standard_error,total_words,total_unique_words,total_characters,total_sentence,div_sentence_characters,...,div_words_unique_words,count_word_special_0,count_word_special_1,count_word_special_2,count_word_special_3,count_word_special_4,count_word_special_5,count_word_special_6,count_word_special_7,count_word_special_8
0,c12129c31,When the young people returned to the ballroom...,when the young people returned to the ballroom...,-0.340259,0.464009,174,112,819,12,0.014652,...,1.553571,0,0,0,0,0,0,0,11,14
1,85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",all through dinner time mrs fayre was somewhat...,-0.315372,0.480805,164,123,774,18,0.023256,...,1.333333,5,2,0,0,3,12,0,10,24
2,b69ac6792,"As Roger had predicted, the snow departed as q...",as roger had predicted the snow departed as qu...,-0.580118,0.476676,162,124,747,13,0.017403,...,1.306452,1,0,0,0,4,10,2,11,17
3,dd1000b26,And outside before the palace a great garden w...,and outside before the palace a great garden w...,-1.054013,0.450007,163,117,747,6,0.008032,...,1.393162,0,0,0,0,0,0,2,5,23
4,37c1b32fb,Once upon a time there were Three Bears who li...,once upon a time there were three bears who li...,0.247197,0.510845,147,51,577,6,0.010399,...,2.882353,0,0,0,0,0,0,10,5,13


In [16]:
fe_train_df.drop(['excerpt'] , axis=1 , inplace = True)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=30000)
X = cv.fit_transform(fe_train_df['preprocessed_excerpt']).toarray()
y = fe_train_df.target

In [18]:
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [19]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

print(X_train.shape , X_test.shape)

(1983, 26209) (851, 26209)


In [21]:
import xgboost as xgb

model = xgb.XGBRegressor().fit(X_train , y_train)
pred = model.predict(X_test)
  

In [22]:
print(model.score(X_test , y_test))
print(mean_squared_error(y_test , pred))    

0.37454696759544004
0.6922623398877353


Used Resources for XGB model: 
https://www.kaggle.com/code/ammarabbasi1040/commonlit-simple-eda-fe-ml

https://www.kaggle.com/code/kurupical/191-192-202-228-251-253-268-288-278-final/notebook?scriptVersionId=69642056

## TRANSFORMER model

In [23]:
import pandas as pd
train_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')

In [24]:
import transformers
transformers.__version__
# !python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

'4.26.1'

In [25]:
import numpy as np
from transformers import AutoTokenizer
import torch
import tqdm
import gc
import pickle
import random

Check wheither GPU is available or not:

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device.type)

Device:  cuda


Set up seed for replication results: 

In [27]:
def set_random_seed(seed, using_cuda = False): #but default using CPU
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if using_cuda:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.cuda.manual_seed(seed)
seed = 1
set_random_seed(seed)

In [28]:
train_df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [29]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [30]:
from tqdm import tqdm
length = []

for s in tqdm(train_df["excerpt"]):

    words = tokenizer.encode_plus(s)["input_ids"]
    length.append(len(words))

print(max(length))

100%|██████████| 2834/2834 [00:01<00:00, 1500.48it/s]

314





In [31]:
def preprocess_function(data):
    return tokenizer(data, padding='max_length', truncation=True, max_length=314) #I trained with default 512 nodes, just updated it at the end. 

In [32]:
train_df.drop(['id', 'url_legal', 'license', 'standard_error'] , axis=1 , inplace = True)

In [33]:
train_df

Unnamed: 0,excerpt,target
0,When the young people returned to the ballroom...,-0.340259
1,"All through dinner time, Mrs. Fayre was somewh...",-0.315372
2,"As Roger had predicted, the snow departed as q...",-0.580118
3,And outside before the palace a great garden w...,-1.054013
4,Once upon a time there were Three Bears who li...,0.247197
...,...,...
2829,When you think of dinosaurs and where they liv...,1.711390
2830,So what is a solid? Solids are usually hard be...,0.189476
2831,The second state of matter we will discuss is ...,0.255209
2832,Solids are shapes that you can actually touch....,-0.215279


In [34]:
tokenized = train_df["excerpt"].apply(preprocess_function)
train_df['input_ids'] = [t['input_ids'] for t in tokenized]
train_df['attention_mask'] = [t['attention_mask'] for t in tokenized]

In [35]:
train_df.drop(['excerpt'] , axis=1 , inplace = True)

In [36]:
train_df = train_df.rename(columns={'target':'labels'})

In [37]:
train_df.head()

Unnamed: 0,labels,input_ids,attention_mask
0,-0.340259,"[101, 2043, 1996, 2402, 2111, 2513, 2000, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,-0.315372,"[101, 2035, 2083, 4596, 2051, 1010, 3680, 1012...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,-0.580118,"[101, 2004, 5074, 2018, 10173, 1010, 1996, 458...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,-1.054013,"[101, 1998, 2648, 2077, 1996, 4186, 1037, 2307...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,0.247197,"[101, 2320, 2588, 1037, 2051, 2045, 2020, 2093...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [38]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

In [39]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [40]:
from datasets import Dataset
tokenized_dataset = Dataset.from_pandas(train_df)
sample_count = len(tokenized_dataset)
train_count = int(0.9 * sample_count)

train_dataset = tokenized_dataset.select(range(train_count))
eval_dataset = tokenized_dataset.select(range(train_count, sample_count))
# train_dataset, test_dataset = train_test_split(dataset, test_size = 0.2, random_state = 0)

In [41]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


training_args = TrainingArguments(
    output_dir="distillbert-age-predictor",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=25,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    logits, labels = logits.squeeze(), labels.squeeze()
    rmse = np.sqrt(np.mean((labels - logits) ** 2))
    return {'RMSE': rmse}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

***** Running training *****
  Num examples = 2550
  Num Epochs = 25
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4000
  Number of trainable parameters = 66954241
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,0.35583,0.596515
2,No log,0.320979,0.56655
3,No log,0.352338,0.59358
4,0.301800,0.453825,0.673666
5,0.301800,0.396609,0.629769
6,0.301800,0.349531,0.591212
7,0.065700,0.368422,0.606978
8,0.065700,0.344678,0.587093
9,0.065700,0.392507,0.626504
10,0.029900,0.318644,0.564486


***** Running Evaluation *****
  Num examples = 284
  Batch size = 16
Saving model checkpoint to distillbert-age-predictor/checkpoint-160
Configuration saved in distillbert-age-predictor/checkpoint-160/config.json
Model weights saved in distillbert-age-predictor/checkpoint-160/pytorch_model.bin
tokenizer config file saved in distillbert-age-predictor/checkpoint-160/tokenizer_config.json
Special tokens file saved in distillbert-age-predictor/checkpoint-160/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 284
  Batch size = 16
Saving model checkpoint to distillbert-age-predictor/checkpoint-320
Configuration saved in distillbert-age-predictor/checkpoint-320/config.json
Model weights saved in distillbert-age-predictor/checkpoint-320/pytorch_model.bin
tokenizer config file saved in distillbert-age-predictor/checkpoint-320/tokenizer_config.json
Special tokens file saved in distillbert-age-predictor/checkpoint-320/special_tokens_map.json
***** Running Evaluation *****
 

TrainOutput(global_step=4000, training_loss=0.057776894748210905, metrics={'train_runtime': 1170.1501, 'train_samples_per_second': 54.48, 'train_steps_per_second': 3.418, 'total_flos': 5178943093545000.0, 'train_loss': 0.057776894748210905, 'epoch': 25.0})