In [1]:
import pandas as pd
import numpy as np
import wandb
import torch
from torch.utils.data import Dataset,DataLoader
from transformers import AutoTokenizer,AutoModel,AutoConfig
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
import torch.nn as nn
import torch.nn.functional as F
import os

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
class EssayModel(pl.LightningModule):
    """Frozen BERT encoder followed by a small MLP that regresses six scores.

    The encoder and tokenizer are loaded from local directories. All encoder
    parameters have ``requires_grad`` disabled, so only the three linear
    layers (768 -> 500 -> 200 -> 6) are trainable. Batches are expected as
    ``(texts, targets)`` pairs; tokenization happens inside the step methods.
    """

    def __init__(self):
        super().__init__()
        # Retained for backward compatibility only; the step methods use the
        # module's actual device (``self.device``) rather than this guess.
        self.dev = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Once-per-epoch marker: set to 1 on the first training step and
        # reset to 0 in ``training_epoch_end``.
        self.flag = 0
        self.loss_fn = nn.MSELoss()
        self.bert = AutoModel.from_pretrained("Models/bert_model_offline/")

        # Freeze the encoder so the optimizer only updates the head.
        for param in self.bert.parameters():
            param.requires_grad = False
        self.tokenizer = AutoTokenizer.from_pretrained("Models/bert_tokenizer_offline/")
        # NOTE: not used by ``forward``; kept so the module layout is unchanged.
        self.pool = nn.AvgPool1d(5, 2)
        self.l1 = nn.Linear(768, 500)
        self.l2 = nn.Linear(500, 200)
        self.l3 = nn.Linear(200, 6)
        self.init_weights(self.l3)

    def init_weights(self, module):
        """Fill the bias of ``module`` with 3.5 when it is an ``nn.Linear``.

        Other module types are left untouched. Weights are not modified.
        """
        if isinstance(module, nn.Linear):
            module.bias.data.fill_(3.5)
            print("Weight initialized")

    def forward(self, inputs):
        """Encode tokenized ``inputs`` and return the head's six outputs per row.

        Args:
            inputs: mapping passed as keyword arguments to the encoder; it must
                contain an ``"attention_mask"`` entry.

        Returns:
            Tensor produced by the final linear layer (last dimension 6).
        """
        encoded = self.bert(**inputs)
        # First element of the encoder output (presumably the per-token hidden
        # states of the last layer -- confirm against the encoder's output type).
        hidden_states = encoded[0]
        attention_mask = inputs["attention_mask"]
        mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        # Masked SUM over dimension 1 (not a mean: there is no division by the
        # mask total). Left as is because the checkpointed head was trained on it.
        pooled = torch.sum(hidden_states * mask, 1)

        x = F.relu(self.l1(pooled))
        x = F.relu(self.l2(x))
        return self.l3(x)

    def _tokenize(self, texts):
        """Tokenize a batch of raw texts and move the encoding to this module's device."""
        tokens = self.tokenizer(list(texts), padding=True, truncation=True, return_tensors="pt")
        return tokens.to(self.device)

    def training_step(self, batch, batch_idx):
        """Return the MSE loss for one ``(texts, targets)`` batch."""
        X, y = batch

        out = self(self._tokenize(X))
        # ``Tensor.to`` is not in-place, so use its return value directly.
        loss = self.loss_fn(out, y.to(out.device))

        if self.flag == 0:
            self.flag = 1

        return loss

    def validation_step(self, batch, batch_idx):
        """Return ``{"metric": ..., "loss": ...}`` for one validation batch."""
        X, y = batch

        out = self(self._tokenize(X))
        loss = self.loss_fn(out, y.to(out.device))

        out = out.detach().cpu().numpy()
        y = y.detach().cpu().numpy()

        # NOTE(review): MCRMSE is not defined in the visible notebook; it must
        # be in scope before validation runs.
        metric = MCRMSE(out, y)

        return {"metric": metric, "loss": loss}

    def training_epoch_end(self, outputs):
        """Log the mean of the per-step training losses and reset ``flag``."""
        mean_loss = np.mean(np.array([t['loss'].detach().cpu().numpy() for t in outputs]))
        self.log("Train epoch loss: ", mean_loss)
        self.flag = 0

    def validation_epoch_end(self, outputs):
        """Log the mean validation metric and mean validation loss."""
        mean_metric = np.mean(np.array([t["metric"] for t in outputs]))
        mean_loss = np.mean(np.array([t["loss"].detach().cpu().numpy() for t in outputs]))
        self.log("Val MCRMSE: ", mean_metric)
        self.log("Val epoch loss: ", mean_loss)

    def configure_optimizers(self):
        """Return an Adam optimizer (lr=0.001) over this module's parameters."""
        return torch.optim.Adam(self.parameters(), lr=0.001)
    

In [7]:
# ``load_from_checkpoint`` is a classmethod that constructs its own instance,
# so call it on the class. Building a throwaway ``EssayModel()`` first loaded
# the encoder twice (hence two "Weight initialized" lines in the output).
model = EssayModel.load_from_checkpoint("Models/bert_model.ckpt")
# Switch to evaluation mode for inference.
model = model.eval()

Weight initialized
Weight initialized


In [10]:
# Read the test essays into a DataFrame and preview the first rows.
test_data = pd.read_csv(filepath_or_buffer="../../data/test.csv")
test_data.head()

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [11]:
class EssayDatasetTest(Dataset):
    """Dataset over a frame with ``text_id`` and ``full_text`` columns.

    Each item is an ``(id, essay)`` pair taken from the same row position.
    """

    def __init__(self, df):
        # Keep the raw column values for positional lookup.
        self.essays = df["full_text"].values
        self.ids = df["text_id"].values

    def __len__(self):
        """Number of essays in the dataset."""
        return len(self.essays)

    def __getitem__(self, idx):
        """Return ``(text_id, full_text)`` for position ``idx``."""
        essay_text = self.essays[idx]
        essay_id = self.ids[idx]
        return essay_id, essay_text

In [12]:
# Wrap the test frame in a dataset and iterate it one essay at a time,
# in file order (no shuffling), using four worker processes.
test_dataset = EssayDatasetTest(df=test_data)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=4,
)

In [13]:
import tqdm

test_texts = test_data.full_text
preds = []
id_list = []
# Run on whatever device the model's weights live on, so tokens and weights
# never end up on different devices.
model_device = next(model.parameters()).device
for ids, text in tqdm.tqdm(test_dataloader):
    X_tokens = model.tokenizer(list(text), padding=True, truncation=True, return_tensors="pt")
    X_tokens = X_tokens.to(model_device)
    with torch.no_grad():
        # Call the module itself rather than ``forward`` directly.
        op = model(X_tokens)
    # ``.cpu()`` is required before ``.numpy()`` when the model is on an accelerator.
    preds.extend(op.cpu().numpy())
    # Keep the ids in the same order as their predictions.
    id_list.extend(ids)

preds = np.array(preds)

100%|█████████████████████████████████████████████| 3/3 [00:00<00:00,  3.03it/s]


In [14]:
LABEL_COLUMNS = ["cohesion","syntax","vocabulary","phraseology","grammar","conventions"]

# Guard against a silent mismatch between predictions, ids and label names.
assert preds.shape == (len(id_list), len(LABEL_COLUMNS)), (
    f"expected predictions of shape {(len(id_list), len(LABEL_COLUMNS))}, got {preds.shape}"
)

# Build the frame in one step from the prediction matrix, then prepend the ids
# that were collected alongside each prediction so rows stay aligned even if
# the loader order ever changes.
submission = pd.DataFrame(preds, columns=LABEL_COLUMNS)
submission.insert(0, "text_id", list(id_list))
submission.head()

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.932586,2.771767,3.098914,2.928413,2.792662,2.617472
1,000BAD50D026,2.907489,2.65422,2.890331,2.460315,2.534201,2.942874
2,00367BB2546B,3.563079,3.487792,3.653115,3.539544,3.470293,3.496453


In [9]:
# Write the submission table to disk without the row index.
submission.to_csv(path_or_buf="submission.csv", index=False)