In [4]:
import pandas as pd
import numpy as np
import wandb
import torch
from torch.utils.data import Dataset,DataLoader
from transformers import AutoTokenizer,AutoModel
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
import torch.nn as nn
import torch.nn.functional as F
import os

In [5]:
# Never commit credentials to a notebook: take the W&B API key from the
# WANDB_API_KEY environment variable. The key that used to be hard-coded here
# must be treated as leaked and rotated in the W&B account settings.
# When the variable is unset, `key=None` is passed and wandb falls back to its
# own credential lookup (e.g. a previous `wandb login`).
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")
wandb.login(key=WANDB_API_KEY)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maryamansriram[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/madlad/.netrc


True

In [6]:
#!echo "$WANDB_API_KEY" > wandb_api_key.txt

In [7]:
# Directory that ModelCheckpoint writes to. `exist_ok=True` makes the cell
# idempotent without the racy "check, then create" pair.
os.makedirs("Models/", exist_ok=True)

In [8]:
# Load the labelled training essays (path is relative to the notebook) and
# preview the first rows.
TRAIN_CSV_PATH = "../../data/train.csv"
data = pd.read_csv(TRAIN_CSV_PATH)

data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [9]:
# Row position (not index label) of the essay with the highest cohesion score.
cohesion_scores = data["cohesion"]
cohesion_scores.argmax()

104

In [10]:
# Show the essay with the lowest cohesion score.
# `argmin()` returns a row *position*, so it must be paired with `.iloc`;
# plain `Series[...]` does a label lookup and only happens to agree with the
# position while the frame still has its default 0..n-1 index.
data["full_text"].iloc[data["cohesion"].argmin()]

'my name is Generic_Name and my story is about cars\n\nI bay a car and it new i live that car so mush so i stirt taking caer to it and befor I usto have a buskel and i usto love that baskel so mauh but i gat older so i stop riding a biskel and i stirt looking for a beges thengs whan i was a kid i usto so my dad driveng a car and i usto haveng fun in it\n\n;and my drimm was to bay a car win i gro up and i stil looking for that drimm seccer is the only sport i like i use to play seccer win i weas 11 years old and i stil play that geame and i love it so mush but my lieg get hert so i stop playing it any moer and then i stirt fling better so i want to go bake to play my favert game seccer so i try ot for school and i stirt making frinds and play with them my favert game evry day i even like a gril becoes that game so l like that gril and i stirt playing wiht her and taking wihe her and i wanted her number so i stirt beang cloes to her so it bean a year so i finly say it i told her that i l

In [11]:
def MCRMSE(preds,true):
    """Mean column-wise root mean squared error.

    For every column (target) the RMSE over the rows is computed, and the
    per-column RMSEs are then averaged into a single number.

    Args:
        preds: array-like of predictions, rows are samples and columns are targets.
        true: array-like of ground-truth values with the same shape as ``preds``.

    Returns:
        NumPy scalar holding the mean of the per-column RMSE values.
    """
    preds = np.asarray(preds)
    true = np.asarray(true)
    squared_error = np.square(preds - true)
    # Column-wise MSE first ...
    mse_per_column = np.mean(squared_error, axis=0)
    # ... then the square root per column; without it the result is a plain
    # mean squared error rather than the RMSE the function name promises.
    rmse_per_column = np.sqrt(mse_per_column)
    return np.mean(rmse_per_column)

In [12]:
## Create Pytorch Dataset

# Score columns, in the order they appear in every label tensor.
LABEL_COLUMNS = ["cohesion","syntax","vocabulary","phraseology","grammar","conventions"]


class EssayDataset(Dataset):
    """Labelled essays: each item is ``(essay_text, scores)``.

    ``scores`` is a tensor built from the ``LABEL_COLUMNS`` values of the row,
    in that column order. Items are addressed by row position in ``df``.
    """

    def __init__(self, df):
        # Extract the underlying arrays once so lookups ignore the frame's index labels.
        self.essays = df["full_text"].values
        self.metrics = df[LABEL_COLUMNS].values

    def __len__(self):
        return len(self.essays)

    def __getitem__(self, idx):
        text = self.essays[idx]
        scores = torch.Tensor(self.metrics[idx])
        return text, scores
        

In [13]:
class EssayDatasetTest(Dataset):
    """Unlabelled essays for inference: each item is just the essay text."""

    def __init__(self, df):
        # Only the text column is read; items are addressed by row position.
        self.essays = df["full_text"].values

    def __len__(self):
        return len(self.essays)

    def __getitem__(self, idx):
        return self.essays[idx]

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the essays for validation; the fixed seed keeps the split
# reproducible across kernel restarts.
# NOTE: the name `train` is rebound to the training function defined further
# down in this notebook, so this cell must be re-run before rebuilding the datasets.
train, valid = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)

In [15]:
train_dataset = EssayDataset(train)
val_dataset = EssayDataset(valid)

# Reshuffle the training essays every epoch so the optimizer does not see the
# same batches in the same order each time; validation order stays fixed.
train_dataloader = DataLoader(train_dataset,batch_size=8,shuffle=True)
val_dataloader = DataLoader(val_dataset,batch_size=32)

In [16]:
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModel.from_pretrained("bert-base-uncased")


In [17]:
class EssayModel(pl.LightningModule):
    """Transformer encoder followed by a three-layer MLP that regresses six scores.

    Raw essay strings are tokenized inside ``forward``; the token embeddings are
    pooled with a masked sum over the sequence axis and passed through
    ``l1 -> ReLU -> l2 -> ReLU -> l3``.

    ``config`` keys read by this class:
        ``"lr"``, ``"weight_decay"``: passed to Adam in ``configure_optimizers``.
        ``"backbone"``: ``"bert"``, ``"deberta"``, ``"roberta"``, ``"distilbert"``
            or ``"ensemble"`` (BERT concatenated with a frozen DistilBERT).
        ``"only_embeddings"``: when truthy, every parameter of ``self.bert`` is frozen.
    """

    # backbone name -> (Hugging Face checkpoint, hidden size fed to the head)
    _SINGLE_BACKBONES = {
        "bert": ("bert-base-uncased", 768),
        "deberta": ("microsoft/deberta-base", 768),
        "roberta": ("roberta-large", 1024),
        "distilbert": ("distilbert-base-uncased", 768),
    }

    def __init__(self,config):
        super().__init__()
        self.save_hyperparameters()
        # Kept so existing code that reads `model.dev` keeps working; the model
        # itself uses Lightning's `self.device`, which tracks where the module
        # actually lives.
        self.dev = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.lr = config["lr"]
        self.weight_decay = config["weight_decay"]
        # Stored on the instance so forward() does not depend on a
        # notebook-level `config` variable.
        self.backbone = config["backbone"]
        self.loss_fn = nn.MSELoss()

        if self.backbone in self._SINGLE_BACKBONES:
            checkpoint, hidden_size = self._SINGLE_BACKBONES[self.backbone]
            self.bert = AutoModel.from_pretrained(checkpoint)
            self.bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            self.fc_size = hidden_size
        elif self.backbone=="ensemble":
            self.bert = AutoModel.from_pretrained("bert-base-uncased")
            self.bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

            self.distilbert = AutoModel.from_pretrained("distilbert-base-uncased")
            self.distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
            # The two hidden states are concatenated on the feature axis: 768 + 768.
            self.fc_size=1536
            # DistilBERT is always frozen in the ensemble.
            for param in self.distilbert.parameters():
                param.requires_grad=False
        else:
            # Fail here with a clear message instead of an AttributeError on
            # `self.bert` / `self.fc_size` a few lines further down.
            raise ValueError(
                f"Unknown backbone {self.backbone!r}; expected one of "
                f"{sorted(self._SINGLE_BACKBONES) + ['ensemble']}"
            )

        if config["only_embeddings"]:
            for param in self.bert.parameters():
                param.requires_grad=False

        # NOTE(review): `pool` is never referenced in forward(); kept so the
        # module's attributes are unchanged.
        self.pool = nn.AvgPool1d(5,2)
        self.l1 = nn.Linear(self.fc_size,500)
        self.l2 =  nn.Linear(500,200)
        self.l3 = nn.Linear(200,6)
        self.init_weights(self.l3)

    def init_weights(self,module):
        """Fill the bias of a ``nn.Linear`` with 3.5; other module types are ignored.

        Presumably 3.5 starts predictions near the middle of the score scale
        (TODO confirm the label range).
        """
        if isinstance(module,nn.Linear):
            module.bias.data.fill_(3.5)
            print("Weight initialized")

    def _tokenize(self, tokenizer, texts):
        """Tokenize ``texts`` (padded, truncated) and move the tensors to the module's device."""
        tokens = tokenizer(texts,padding=True,truncation=True,return_tensors="pt")
        return tokens.to(self.device)

    def forward(self,inputs):
        """Predict the six scores for an iterable of raw essay strings.

        Returns:
            Tensor with one row per essay and six columns (the output of ``l3``).

        Raises:
            ValueError: in ensemble mode, when the two encoders produce hidden
                states whose batch/sequence dimensions differ.
        """
        texts = list(inputs)
        if self.backbone=="ensemble":
            bert_tokens = self._tokenize(self.bert_tokenizer, texts)
            distil_tokens = self._tokenize(self.distilbert_tokenizer, texts)

            bert_hidden = self.bert(**bert_tokens).last_hidden_state
            distil_hidden = self.distilbert(**distil_tokens).last_hidden_state
            # Concatenation on the feature axis needs matching (batch, seq) dims.
            # Report the shapes and stop, rather than printing them and failing
            # later on an undefined variable.
            if bert_hidden.shape[:2] != distil_hidden.shape[:2]:
                raise ValueError(
                    "Cannot concatenate encoder outputs with shapes "
                    f"{tuple(bert_hidden.shape)} and {tuple(distil_hidden.shape)}"
                )
            hidden_states = torch.cat([bert_hidden,distil_hidden],dim=2)
            # Average of the two masks.
            attention_mask = (1/2)*(bert_tokens["attention_mask"]+distil_tokens["attention_mask"])
        else:
            tokens = self._tokenize(self.bert_tokenizer, texts)
            hidden_states = self.bert(**tokens)[0]
            attention_mask = tokens["attention_mask"]

        # Broadcast the mask over the feature axis and zero out padded positions.
        mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        # Masked SUM over the sequence axis.
        # NOTE(review): this is not divided by the token count, so the pooled
        # vector scales with essay length; confirm whether mean pooling was intended.
        pooled = torch.sum(hidden_states*mask,1)

        x = F.relu(self.l1(pooled))
        x = F.relu(self.l2(x))
        out = self.l3(x)
        return out

    def training_step(self,batch,batch_idx):
        """Return the MSE loss for one ``(essays, scores)`` batch."""
        X,y = batch

        # `Tensor.to` is not in-place: keep the result so the targets really
        # are on the same device as the predictions.
        y = y.to(self.device)
        out = self(X)
        loss = self.loss_fn(out,y)

        return loss

    def validation_step(self,batch,batch_idx):
        """Return the per-batch ``MCRMSE`` metric and MSE loss for one batch."""
        X,y = batch

        out = self(X)

        loss = self.loss_fn(out,y)

        out = out.detach().cpu().numpy()
        y = y.detach().cpu().numpy()

        metric = MCRMSE(out,y)

        return {"metric":metric,"loss":loss}

    # NOTE(review): the two `*_epoch_end` hooks below are the Lightning 1.x API;
    # confirm the installed version still calls them before upgrading.
    def training_epoch_end(self,outputs):
        """Log the unweighted mean of the per-batch training losses."""
        mean_loss = np.mean(np.array([t['loss'].detach().cpu().numpy() for t in outputs]))
        self.log("Train epoch loss: ",mean_loss)

    def validation_epoch_end(self,outputs):
        """Log the unweighted means of the per-batch metric (``val_error``) and loss."""
        mean_metric = np.mean(np.array([t["metric"] for t in outputs]))
        mean_loss = np.mean(np.array([t["loss"].detach().cpu().numpy() for t in outputs]))
        # "val_error" is the key monitored by the early-stopping and checkpoint callbacks.
        self.log("val_error",mean_metric)
        self.log("Val epoch loss: ",mean_loss)

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate and weight decay."""
        return torch.optim.Adam(self.parameters(),lr=self.lr,weight_decay=self.weight_decay)
    
    
    
        
        
        

In [15]:
# Hyper-parameters for one training run; read by EssayModel and train().
config = dict(
    lr=1e-3,
    weight_decay=1e-3,
    backbone="bert",
    only_embeddings=True,
    epochs=15,
    log_run="bert_embeddings",
)

In [16]:
def train(config):
    """Fit an ``EssayModel`` on the notebook-level dataloaders, GPU only.

    Reads ``config["log_run"]``, ``config["lr"]``, ``config["epochs"]`` and
    ``config["weight_decay"]`` to name the W&B run, and ``config["epochs"]`` as
    the epoch limit. Uses the global ``train_dataloader`` / ``val_dataloader``.
    Training stops early on ``val_error`` and checkpoints are written to
    ``Models/``. When CUDA is unavailable a message is printed and nothing is
    trained. Returns ``None``.

    Note: defining this function rebinds the notebook-level name ``train``,
    which the split cell also uses for the training DataFrame.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    run_name = "".join([
        config["log_run"],
        "_lr_", str(config["lr"]),
        "_epochs_", str(config["epochs"]),
        "_decay_", str(config["weight_decay"]),
    ])
    logger = WandbLogger(project="EssayGrader",log_model=True,name=run_name)

    # Guard clause: this notebook only trains on a GPU.
    if device!='cuda':
        print("Device is not gpu!!")
        return

    callbacks = [
        EarlyStopping(monitor="val_error",mode="min"),
        ModelCheckpoint(dirpath="Models/", monitor="val_error"),
    ]
    trainer = pl.Trainer(
        logger=logger,
        max_epochs=config["epochs"],
        accelerator='gpu',
        devices=1,
        callbacks=callbacks,
    )
    model = EssayModel(config)
    model.to(device)
    trainer.fit(model=model,train_dataloaders=train_dataloader,val_dataloaders=val_dataloader)

In [17]:
# Launch one training run with the hyper-parameters in `config`.
train(config)

[34m[1mwandb[0m: Currently logged in as: [33maryamansriram[0m. Use [1m`wandb login --relogin`[0m to force relogin


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Weight initialized


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]