In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torchtext
import os
import nltk
from nltk.stem import SnowballStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from copy import deepcopy

# This notebook was run on Kaggle, where the dataset is available as CommonLitReadability, so we first need to import the dataset into the notebook.

In [None]:
# Both CSV files live in the same input directory.
DATA_DIR = '../input/commonlitreadabilityprize'
train_df_path = f'{DATA_DIR}/train.csv'
test_df_path = f'{DATA_DIR}/test.csv'

In [None]:
# Columns are selected by position. Later cells read train column 0 as the
# text and column 1 as the score, and test column 0 as the id and column 1 as
# the text.
# NOTE(review): positional selection depends on the CSV column order -- confirm
# against the dataset schema.
train_df = pd.read_csv(train_df_path)
train_df = train_df.iloc[:, [3, 4]]
test_df = pd.read_csv(test_df_path)
test_df = test_df.iloc[:, [0, -1]]

# In this cell we tokenize the text with the torchtext tokenizer and stem each word with the Snowball stemmer. We then split the data into training and validation sets, convert the texts to integer sequences, and pad them.

In [None]:
# Fixed seed so the train/validation split -- and the vocabulary fitted on the
# training part -- is the same on every run.
SPLIT_SEED = 42

# Normalise the texts: split each one into tokens with torchtext's
# 'basic_english' tokenizer, then stem every token with the Snowball stemmer.
# A separate name is used for this tokenizer so that `tokenizer` only ever
# refers to the Keras Tokenizer that later cells rely on.
word_tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
stemmer = SnowballStemmer('english')
sent = [[stemmer.stem(word) for word in word_tokenizer(text)] for text in train_df.iloc[:, 0]]

# Longest text measured in stemmed tokens; used as the padded sequence width.
max_len = max(len(tokens) for tokens in sent)
sent = [' '.join(tokens) for tokens in sent]
score = list(train_df.iloc[:, 1])

xtr, xte, ytr, yte = train_test_split(sent, score, test_size=0.1, random_state=SPLIT_SEED)

# The vocabulary is fitted on the training split only; words that appear only
# in the validation split are mapped to the '<oov>' token.
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(xtr)
xtr = tokenizer.texts_to_sequences(xtr)
xte = tokenizer.texts_to_sequences(xte)
xtr = np.array(pad_sequences(xtr, maxlen=max_len), dtype=np.int64)
xte = np.array(pad_sequences(xte, maxlen=max_len), dtype=np.int64)
ytr = np.array(ytr, dtype=np.float32)
yte = np.array(yte, dtype=np.float32)

In [None]:
class LitDataset(Dataset):
    """Dataset yielding ``(sequence, score, sequence_length)`` triples.

    The three NumPy arrays passed in are converted to tensors once, at
    construction time, and are indexed in parallel afterwards.
    """

    def __init__(self, seqs, scores, seq_lens):
        as_tensors = map(torch.from_numpy, (seqs, scores, seq_lens))
        self.sequences, self.scores, self.sequence_lens = as_tensors

    def __len__(self):
        # The number of samples is taken from the score tensor.
        n_samples = len(self.scores)
        return n_samples

    def __getitem__(self, idx):
        fields = (self.sequences, self.scores, self.sequence_lens)
        return tuple(field[idx] for field in fields)

In [None]:
# Every row of xtr / xte has been padded to the same width, so the length
# recorded for each sample is that padded width. It is read from the arrays
# instead of being hard-coded as 260, so it cannot drift from the real data.
train_seq_lens = np.full(len(xtr), xtr.shape[1], dtype=np.int64)
val_seq_lens = np.full(len(xte), xte.shape[1], dtype=np.int64)

# Datasets and loaders get separate names; `train_data` and `val_data` are the
# DataLoaders consumed by the training cell.
train_dataset = LitDataset(xtr, ytr, train_seq_lens)
val_dataset = LitDataset(xte, yte, val_seq_lens)
train_data = DataLoader(train_dataset, batch_size=8)
val_data = DataLoader(val_dataset)  # DataLoader's default batch size

# We build the model from an embedding layer, two LSTM layers and three linear layers, and then train it with the Adam optimizer and MSE loss.

In [None]:
class ScoreModel(nn.Module):
    """Regress one score per sequence of token ids.

    Architecture: embedding -> 2-layer LSTM (hidden size 256) -> mean over the
    time dimension -> three linear layers (256 -> 128 -> 64 -> 1) with ReLU
    between them.

    Parameters
    ----------
    embedding_dim : int
        Size of each token embedding (also the LSTM input size).
    vocab_size : int, optional
        Number of rows in the embedding table; must be larger than the largest
        token id fed to the model. When omitted it is derived from the
        notebook-level Keras ``tokenizer`` as ``len(tokenizer.word_index) + 1``.
    """

    def __init__(self, embedding_dim, vocab_size = None):
        super().__init__()
        if vocab_size is None:
            # Keras numbers tokens from 1 (0 is the padding value) and, when an
            # oov_token is set, gives it id 1 without counting it in
            # `word_counts`. The largest id is therefore len(word_index), so
            # the table needs len(word_index) + 1 rows. Sizing it from
            # `word_counts` left the rarest word's id out of range.
            vocab_size = len(tokenizer.word_index) + 1
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, 256, num_layers = 2, batch_first = True)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to a ``(batch, 1)`` tensor."""
        x = self.embedding(x)
        x, _ = self.lstm(x)
        # Average the LSTM outputs over the time dimension (batch_first=True,
        # so that is dim 1) to get one fixed-size vector per sequence.
        x = torch.mean(x, dim = 1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

In [None]:
# 256-dimensional token embeddings.
model = ScoreModel(embedding_dim=256)

In [None]:
# Mean-squared error as the regression loss, minimised with Adam.
criteria = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
def _batch_loss(model, criteria, seq, score, device):
    """Move one batch to ``device`` and return the loss on the model's first output column."""
    seq = seq.to(device)
    score = score.to(device)
    return criteria(model(seq)[:, 0], score)


def train_model(model, train_data, optimizer, criteria, epochs, val_data = None, device = 'cpu'):
    """Train ``model`` and, when validation data is given, restore its best weights.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train. It is moved to ``device`` and its output is indexed
        as ``model(seq)[:, 0]``.
    train_data : iterable
        Yields ``(seq, score, lens)`` batches; ``lens`` is not used.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to the model's parameters.
    criteria : callable
        Loss, called as ``criteria(pred, score)``.
    epochs : int
        Number of passes over ``train_data``.
    val_data : iterable, optional
        Batches in the same format as ``train_data``. When given, the weights
        of the epoch with the lowest mean validation loss are loaded back into
        ``model`` once training ends.
    device : str or torch.device
        Device on which the model and the batches are placed.

    Returns
    -------
    None
        ``model`` is updated in place.

    Notes
    -----
    A batch that raises ``IndexError`` or ``RuntimeError`` is skipped, and the
    number of skipped batches is printed per epoch instead of being silently
    ignored. Any other exception, including ``KeyboardInterrupt``, propagates.
    Reported losses are averaged over the batches that actually ran.
    """
    skippable = (IndexError, RuntimeError)
    has_val = val_data is not None
    model = model.to(device)
    best_state = None
    best_val = float('inf')

    for epoch in range(epochs):
        n_skipped = 0
        first_error = None

        model.train()
        train_loss, n_train = 0.0, 0
        for seq, score, _ in tqdm(train_data):
            try:
                loss = _batch_loss(model, criteria, seq, score, device)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            except skippable as err:
                n_skipped += 1
                if first_error is None:
                    first_error = err
                continue
            train_loss += loss.item()
            n_train += 1
        train_loss /= max(n_train, 1)

        if has_val:
            model.eval()
            val_loss, n_val = 0.0, 0
            # No gradients are needed for evaluation.
            with torch.no_grad():
                for seq, score, _ in val_data:
                    try:
                        loss = _batch_loss(model, criteria, seq, score, device)
                    except skippable as err:
                        n_skipped += 1
                        if first_error is None:
                            first_error = err
                        continue
                    val_loss += loss.item()
                    n_val += 1
            val_loss /= max(n_val, 1)
            # Track the running best so that the snapshot really is the best
            # epoch, not merely the last one below a fixed threshold.
            if n_val and val_loss < best_val:
                best_val = val_loss
                best_state = deepcopy(model.state_dict())
            print('epoch:', epoch, 'loss:', train_loss, ';;;; val_loss:', val_loss)
        else:
            print('epoch:', epoch, 'loss: ', train_loss)

        if n_skipped:
            print('epoch:', epoch, 'skipped', n_skipped, 'batch(es); first error:', repr(first_error))

    # Only restore when a snapshot was actually taken.
    if best_state is not None:
        model.load_state_dict(best_state)

In [None]:
# Seven epochs on the CPU, validating after each epoch.
train_model(
    model,
    train_data,
    optimizer,
    criteria,
    epochs=7,
    val_data=val_data,
    device='cpu',
)

  0%|          | 0/319 [00:00<?, ?it/s]

epoch: 0 loss: 1.066289151396871 ;;;; val_loss: 0.7163879023121718


  0%|          | 0/319 [00:00<?, ?it/s]

epoch: 1 loss: 0.6929788043880164 ;;;; val_loss: 0.6708163028998274


  0%|          | 0/319 [00:00<?, ?it/s]

epoch: 2 loss: 0.5442888828047017 ;;;; val_loss: 0.62644469527444


  0%|          | 0/319 [00:00<?, ?it/s]

epoch: 3 loss: 0.4625589349613668 ;;;; val_loss: 0.6123941312091367


  0%|          | 0/319 [00:00<?, ?it/s]

epoch: 4 loss: 0.4015739926700495 ;;;; val_loss: 0.6024644752968101


  0%|          | 0/319 [00:00<?, ?it/s]

epoch: 5 loss: 0.3476442074911161 ;;;; val_loss: 0.5973101808280469


  0%|          | 0/319 [00:00<?, ?it/s]

epoch: 6 loss: 0.295628444399673 ;;;; val_loss: 0.5914344753778544


# Now that the model is trained, we can run it on the test data.

In [None]:
# Apply the same normalisation as for the training texts: 'basic_english'
# tokens -> Snowball stems -> space-joined text.
test_data = list(test_df.iloc[:, 1])
tok = torchtext.data.utils.get_tokenizer('basic_english')
stemmer = SnowballStemmer('english')
sent = [' '.join(stemmer.stem(word) for word in tok(text)) for text in test_data]

# Encode with the tokenizer fitted on the training split and pad to the width
# of the padded training matrix rather than a literal 260, so the test input
# always has the shape the model was trained on. The training `max_len` is
# deliberately not recomputed from the test texts here.
test_data = tokenizer.texts_to_sequences(sent)
test_data = np.array(pad_sequences(test_data, maxlen=xtr.shape[1]), dtype=np.int64)

In [None]:
class LitDatasetTest(Dataset):
    """Label-free dataset wrapping a NumPy array of sequences."""

    def __init__(self, seqs):
        # Converted once, so indexing returns tensors directly.
        self.sequences = torch.from_numpy(seqs)

    def __len__(self):
        n_samples = len(self.sequences)
        return n_samples

    def __getitem__(self, idx):
        sample = self.sequences[idx]
        return sample

In [None]:
# One batch that holds the entire test set.
test_dataset = LitDatasetTest(test_data)
test_data = DataLoader(test_dataset, batch_size=len(test_dataset))

In [None]:
# Inference only: put the model in evaluation mode and disable autograd so
# that no computation graph is built for the forward pass over the test set.
model.eval()
with torch.no_grad():
    data = next(iter(test_data))  # the loader yields the whole test set as one batch
    pred = model(data)

In [None]:
# Drop the trailing singleton column and hand the values over to NumPy.
pred = pred.detach()[:, 0].numpy()

In [None]:
# Submission frame: the first test column as 'id', the predictions as 'target'.
names = test_df.iloc[:, 0]
df = pd.DataFrame(dict(id=names, target=pred))

In [None]:
# Write the submission without the DataFrame index column.
submission_path = './submission.csv'
df.to_csv(submission_path, index=False)