In [37]:
import general_utils as gu
# Load the essay dataset through the project helper. Later cells index `data` by
# column name ('essay_ids', 'prompt_ids', 'essay_text', 'holistic', 'features')
# and then by row position.
data = gu.read_data("dataset.csv")

In [38]:
def scale_score(score, lower_bound, upper_bound):
    """Min-max normalise ``score`` so that ``lower_bound`` maps to 0 and ``upper_bound`` to 1.

    Args:
        score: raw score to normalise.
        lower_bound: smallest possible raw score.
        upper_bound: largest possible raw score.

    Returns:
        ``(score - lower_bound) / (upper_bound - lower_bound)``.

    Raises:
        ZeroDivisionError: for plain Python numbers when both bounds are equal.
    """
    offset_from_min = score - lower_bound
    span = upper_bound - lower_bound
    return offset_from_min / span
def unscale_score(scaled_score, lower_bound, upper_bound):
    """Invert ``scale_score``: map a normalised score back onto the raw score range.

    Args:
        scaled_score: normalised score (0 corresponds to ``lower_bound``, 1 to ``upper_bound``).
        lower_bound: smallest possible raw score.
        upper_bound: largest possible raw score.

    Returns:
        ``scaled_score * (upper_bound - lower_bound) + lower_bound``.
    """
    span = upper_bound - lower_bound
    stretched = scaled_score * span
    return stretched + lower_bound


In [39]:
from transformers import BertTokenizer
# Module-level tokenizer for the 'bert-base-uncased' checkpoint; it is created once
# here and reused by get_bert_embedding_essay for every essay.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def get_bert_embedding_essay(essay):
    """Tokenise one essay with the module-level ``tokenizer``.

    Despite the name, this does not compute embedding vectors: it returns the
    token ids and the attention mask that a BERT model takes as input.

    Args:
        essay: essay text handed to the tokenizer.
            # assumes a single string, so the batch dimension is 1 - TODO confirm

    Returns:
        Tuple ``(input_ids, attention_mask)``, each flattened to 1-D with ``view(-1)``.
    """
    encoded = tokenizer(
        essay,
        add_special_tokens=True,
        # Pad/truncate every essay to the tokenizer's maximum length so all rows
        # have the same size (presumably 512 for this checkpoint - confirm).
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    flat_input_ids = encoded['input_ids'].view(-1)
    flat_attention_mask = encoded['attention_mask'].view(-1)
    return flat_input_ids, flat_attention_mask
    



In [40]:
# Group the dataset by prompt. `prompts_dicts` ends up with one dictionary per
# prompt id (1 to 8); each dictionary stores the prompt id and the list of that
# prompt's essays with their text, features, token ids, attention mask and
# holistic score scaled with scale_score.
prompts_dicts = []
for prompt_id in range(1, 9):
    essays_for_prompt = []
    total_rows = len(data['essay_ids'])
    for row in range(total_rows):
        # Only keep the rows that belong to the prompt currently being collected.
        if data['prompt_ids'][row] != prompt_id:
            continue

        range_score = gu.SCORE_RANGES[prompt_id]['holistic']
        essay_text = data["essay_text"][row]
        input_ids, attention_mask = get_bert_embedding_essay(essay_text)
        scaled_holistic = scale_score(data['holistic'][row], range_score[0], range_score[1])

        essay_record = {
            'essay_text': data["essay_text"][row],
            'holistic': scaled_holistic,
            'features': data['features'][row],
            'input_ids': input_ids,
            'attention_mask': attention_mask,
        }
        essays_for_prompt.append(essay_record)

    prompts_dicts.append({'prompt_id': prompt_id, 'essays': essays_for_prompt})
    
    




In [None]:
import torch
import torch.nn as nn
from transformers import BertModel
from transformers import BertModel


def get_layers(input_size, layers_size, hidden_size, output_size):
    """Build the list of layers for the regression head.

    Args:
        input_size: number of input units of the first linear layer.
        layers_size: 0 for a direct input-to-output projection; any other value
            produces exactly ONE hidden layer (the value is not used as a count).
        hidden_size: number of units of the hidden layer (ignored when
            ``layers_size`` is 0).
        output_size: number of output units.

    Returns:
        A list of ``nn.Module`` layers, ready to be unpacked into ``nn.Sequential``.
    """
    if layers_size == 0:
        # No hidden layer: a single linear map from input to output.
        return [nn.Linear(input_size, output_size)]

    # One hidden layer with a ReLU non-linearity, followed by the output projection.
    return [
        nn.Linear(input_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, output_size),
    ]

class BERT_approachC(nn.Module):
    """BERT encoder whose pooled output is concatenated with handcrafted features.

    The concatenated vector is fed to a small regression head built by
    ``get_layers`` (either a single linear layer or one hidden layer + ReLU).
    """

    def __init__(self, bert_model, layers_size, num_hidden_units, input_size=86, output_size=1):
        """Create the encoder and the regression head.

        Args:
            bert_model: name or path of the pretrained BERT checkpoint. Any value
                that is not a non-empty string falls back to 'bert-base-uncased'.
            layers_size: forwarded to ``get_layers`` (0 = no hidden layer).
            num_hidden_units: hidden layer width forwarded to ``get_layers``.
            input_size: number of handcrafted features appended to the BERT output.
            output_size: number of values predicted per essay.
        """
        super(BERT_approachC, self).__init__()
        # Honour the requested checkpoint instead of silently ignoring the argument.
        if isinstance(bert_model, str) and bert_model:
            checkpoint = bert_model
        else:
            checkpoint = 'bert-base-uncased'
        self.bert = BertModel.from_pretrained(checkpoint)

        head_input_size = self.bert.config.hidden_size + input_size
        self.additional_layer = nn.Sequential(
            *get_layers(head_input_size, layers_size, num_hidden_units, output_size)
        )

        # Re-initialise ONLY the freshly created head. Calling ``self.apply`` here
        # would recurse into ``self.bert`` as well and overwrite every pretrained
        # nn.Linear weight of the encoder with random values.
        self.additional_layer.apply(self.weights_HE_init)

    def forward(self, input_ids, attention_mask, features):
        """Score a batch of essays.

        Args:
            input_ids: token ids passed to BERT.
            attention_mask: attention mask passed to BERT.
            features: handcrafted features, concatenated with the BERT output
                along dim 1 (so it must be 2-D with the same batch size).

        Returns:
            Output of the regression head for the concatenated representation.
        """
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # 'pooler_output' is BERT's pooled sentence representation.
        pooled = output['pooler_output']
        pooled_and_features = torch.cat((pooled, features), dim=1)
        return self.additional_layer(pooled_and_features)

    def weights_HE_init(self, layer_in):
        """Kaiming-normal (He) initialise a linear layer and zero its bias.

        Layers that are not ``nn.Linear`` are left untouched, which makes this
        method usable as the callback of ``nn.Module.apply``.
        """
        if isinstance(layer_in, nn.Linear):
            nn.init.kaiming_normal_(layer_in.weight)
            # Linear layers created with bias=False have ``bias is None``.
            if layer_in.bias is not None:
                layer_in.bias.data.fill_(0.0)


In [None]:
import numpy as np
from torch.utils.data import Dataset, DataLoader
def train_model(model, loss, optimizer, train_set, validation_set, device, range_score, batch_size=4, num_epochs=5):
    """Train ``model`` and, when a validation set is given, report QWK after every epoch.

    Args:
        model: module called as ``model(input_ids, attention_mask, features)``.
        loss: callable ``loss(predictions, targets)`` returning a scalar tensor.
        optimizer: optimizer already bound to the parameters of ``model``.
        train_set: dataset yielding ``(features, scores, input_ids, attention_mask)``.
        validation_set: dataset with the same layout, or ``None`` to skip
            validation (e.g. when training the deployed model on all data).
        device: device the model and every batch are moved to.
        range_score: indexable ``(min, max)`` of the raw scores, used to unscale
            predictions and targets before computing QWK. Not used when
            ``validation_set`` is ``None``.
        batch_size: batch size for training and validation.
        num_epochs: number of passes over ``train_set``.

    Returns:
        Tuple ``(last_qwk, model)``: the QWK of the last epoch (0 when no
        validation was run) and the trained model.
    """
    last_qwk = 0

    model.to(device)  # move the model to the GPU if one is available
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # Only build the validation loader when there is something to validate on.
    validation_loader = None
    if validation_set is not None:
        validation_loader = DataLoader(validation_set, batch_size=batch_size)

    for epoch in range(num_epochs):
        print("*"*10)
        model.train()
        total_loss = 0
        for features, scores, input_ids, attention_mask in train_loader:
            features, scores = features.to(device), scores.to(device)
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

            optimizer.zero_grad()  # drop the gradients of the previous iteration

            # view(-1) flattens the (batch, 1) output so it matches the shape of the true scores
            y_pred = model(input_ids, attention_mask, features).view(-1)
            l = loss(y_pred, scores)
            l.backward()
            optimizer.step()
            total_loss += l.item()
        # max(..., 1) avoids a ZeroDivisionError if the loader produced no batch.
        print(f"Epoch {epoch + 1} loss: {total_loss/max(len(train_loader), 1)}")

        if validation_loader is None:
            continue

        model.eval()  # evaluation mode (disables dropout)
        predicted_scores, validation_scores = [], []
        with torch.no_grad():  # no gradients are needed for validation
            # Validation runs in batches instead of one essay per forward pass.
            for features, scores, input_ids, attention_mask in validation_loader:
                features, scores = features.to(device), scores.to(device)
                input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

                y_pred = model(input_ids, attention_mask, features).view(-1)
                predicted_scores.extend(y_pred.tolist())
                validation_scores.extend(scores.view(-1).tolist())

        # QWK only works with integers, so map back to the raw range and round.
        validation_scores = [int(round(unscale_score(score, range_score[0], range_score[1]))) for score in validation_scores]
        predicted_scores = [int(round(unscale_score(pred, range_score[0], range_score[1]))) for pred in predicted_scores]

        last_qwk = gu.quadratic_weighted_kappa(validation_scores, predicted_scores)
        print(f"{epoch + 1} qwk: {last_qwk}  ")

    return last_qwk, model


In [None]:
import numpy as np
from torch.utils.data import TensorDataset
from transformers import BertModel
def _essays_to_dataset(essays, device):
    """Pack a list of essay dicts into a TensorDataset of (features, scores, input_ids, attention_mask)."""
    # Plain lists are not accepted by TensorDataset, so go through numpy arrays to tensors.
    features = torch.tensor(np.array([essay['features'] for essay in essays], dtype=np.float32), dtype=torch.float32).to(device)
    scores = torch.tensor(np.array([essay['holistic'] for essay in essays], dtype=np.float32), dtype=torch.float32).to(device)
    input_ids = torch.tensor(np.array([essay['input_ids'] for essay in essays], dtype=np.int64), dtype=torch.int64).to(device)
    attention_mask = torch.tensor(np.array([essay['attention_mask'] for essay in essays], dtype=np.int64), dtype=torch.int64).to(device)
    return TensorDataset(features, scores, input_ids, attention_mask)


def cross_validation( layer_size,hidden_size, device,batch_size=4, num_epochs=5,prompt_num=1):
    """Prompt-wise cross-validation that keeps the test prompt completely unseen.

    Every prompt except ``prompt_num`` is used once as the validation fold; the
    model is trained on the remaining prompts, i.e. on everything that is
    neither the validation fold nor the test prompt.

    Args:
        layer_size: forwarded to ``BERT_approachC`` (0 = no hidden layer).
        hidden_size: hidden layer width forwarded to ``BERT_approachC``.
        device: device on which data and model are placed.
        batch_size: batch size forwarded to ``train_model``.
        num_epochs: number of epochs forwarded to ``train_model``.
        prompt_num: 1-based id of the held-out test prompt.

    Returns:
        Mean of the last-epoch validation QWK over all folds.
    """
    fold_qwks = []
    # each prompt will be used as the validation set once
    for fold in range(8):
        print('fold: ',fold)
        if prompt_num == fold+1:  # the test prompt is never used as a validation fold
            continue
        range_score = gu.SCORE_RANGES[fold+1]['holistic']
        validation_dataset = _essays_to_dataset(prompts_dicts[fold]['essays'], device)

        train_essays = []
        for prompt_index in range(8):
            # Train on neither the test prompt nor the validation fold. The
            # test-prompt check must compare ``prompt_index`` (not ``fold``),
            # otherwise the test prompt leaks into the training data.
            if prompt_index + 1 == prompt_num or prompt_index == fold:
                continue
            train_essays.extend(prompts_dicts[prompt_index]['essays'])
        train_dataset = _essays_to_dataset(train_essays, device)

        model = BERT_approachC('bert-base-uncased', layer_size, hidden_size, 86, 1)
        model.to(device)
        loss = nn.MSELoss()
        # NOTE(review): lr=0.005 is applied to the BERT encoder as well; this looks
        # high for fine-tuning a pretrained transformer - confirm it is intended.
        optimizer = torch.optim.AdamW(model.parameters(), 0.005, betas=(0.9, 0.999), weight_decay=0.1)

        fold_qwk, model = train_model(model, loss, optimizer, train_dataset, validation_dataset, device, range_score, batch_size=batch_size, num_epochs=num_epochs)
        fold_qwks.append(fold_qwk)

    final_avg_QWK = sum(fold_qwks) / len(fold_qwks)
    print(f"Average QWK over all folds: {final_avg_QWK}")

    return final_avg_QWK
        


In [59]:
# Candidate configurations for the regression head: 'hidden' is the number of
# hidden units, where 0 means "no hidden layer at all".
hyper_options=[{'hidden':0},{'hidden':4},{'hidden':8}]

def find_best_hyperparameters(device,prompt_num=1):
    """Pick the head configuration with the best cross-validated QWK.

    Each entry of ``hyper_options`` is evaluated with ``cross_validation``
    (batch size 32, 5 epochs) while ``prompt_num`` is kept out as the test prompt.

    Args:
        device: device forwarded to ``cross_validation``.
        prompt_num: 1-based id of the held-out test prompt.

    Returns:
        Tuple ``(best_params, best_QWK)`` where ``best_params`` is
        ``(layers_size, hidden_size)``. ``best_params`` is ``None`` only when
        ``hyper_options`` is empty.
    """
    # Start below every possible QWK (QWK may be <= 0) so that the first
    # evaluated configuration is always recorded.
    best_QWK = float('-inf')
    best_params = None

    for option in hyper_options:
        print(f"Testing option: {option}")
        hidden_units = option['hidden']
        # 0 hidden units means a direct input-to-output head; otherwise one hidden layer.
        layers = 0 if hidden_units == 0 else 1
        latest_params = (layers, hidden_units)

        QWK = cross_validation(layers, hidden_units, device, 32, 5, prompt_num)

        if QWK > best_QWK:
            best_QWK = QWK
            best_params = latest_params
            print(f"New best QWK: {best_QWK:.4f} with params: {best_params}")

    return best_params, best_QWK


In [None]:

import torch
from transformers import BertModel
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
# The helper functions defined below in this cell read this module-level `device`.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def get_validation_set():
    """Build the TensorDataset of the first prompt (``prompts_dicts[0]``).

    Returns:
        ``TensorDataset(features, scores, input_ids, attention_mask)`` with float32
        features/scores and int64 token ids/mask, all placed on the module-level
        ``device``.
    """
    held_out_essays = prompts_dicts[0]['essays']

    def _column(key, np_dtype, torch_dtype):
        # Lists -> numpy array -> tensor on the target device.
        values = np.array([essay[key] for essay in held_out_essays], dtype=np_dtype)
        return torch.tensor(values, dtype=torch_dtype).to(device)

    valid_scores = _column('holistic', np.float32, torch.float32)
    valid_features = _column('features', np.float32, torch.float32)
    valid_input_ids = _column('input_ids', np.int64, torch.int64)
    valid_attention_mask = _column('attention_mask', np.int64, torch.int64)
    return TensorDataset(valid_features, valid_scores, valid_input_ids, valid_attention_mask)
def get_train_set():
    """Build the TensorDataset of every prompt except the first one.

    Essays of ``prompts_dicts[1]`` to ``prompts_dicts[7]`` are concatenated in
    prompt order.

    Returns:
        ``TensorDataset(features, scores, input_ids, attention_mask)`` with float32
        features/scores and int64 token ids/mask, all placed on the module-level
        ``device``.
    """
    # Index 0 is skipped: that prompt is served by get_validation_set.
    training_essays = [
        essay
        for prompt_index in range(1, 8)
        for essay in prompts_dicts[prompt_index]['essays']
    ]

    def _column(key, np_dtype, torch_dtype):
        # Lists -> numpy array -> tensor on the target device.
        values = np.array([essay[key] for essay in training_essays], dtype=np_dtype)
        return torch.tensor(values, dtype=torch_dtype).to(device)

    train_features = _column('features', np.float32, torch.float32)
    train_scores = _column('holistic', np.float32, torch.float32)
    train_attention_mask = _column('attention_mask', np.int64, torch.int64)
    train_input_ids = _column('input_ids', np.int64, torch.int64)
    return TensorDataset(train_features, train_scores, train_input_ids, train_attention_mask)
def train_and_save_models():
    """Select the best head configuration, train the final model and save its weights.

    Prompt 1 is the held-out prompt: the hyper-parameter search excludes it,
    ``get_train_set`` trains on the other prompts and ``get_validation_set``
    evaluates on it. The trained weights are written to "model_C_1Final++.pt".

    Returns:
        Tuple ``(best_QWK, results)``: the cross-validated QWK of the selected
        configuration and a dictionary describing the final training run.

    Raises:
        RuntimeError: if the hyper-parameter search selected no configuration.
    """
    # get_validation_set / get_train_set are hard-wired to prompts_dicts[0],
    # i.e. prompt 1, so this constant must stay in sync with them.
    test_prompt = 1
    batch_size = 32
    num_epochs = 5

    best_params, best_QWK = find_best_hyperparameters(device, test_prompt)
    if best_params is None:
        # Fail with an explicit message instead of an opaque unpacking TypeError.
        raise RuntimeError(
            "Hyper-parameter search did not select any configuration; "
            "cannot train the final model."
        )
    layers_size, hidden_size = best_params

    # Train the final model with the best hyper-parameters.
    model = BERT_approachC('bert-base-uncased', layers_size, hidden_size, 86, 1)
    model.to(device)
    loss = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.005,
                                  betas=(0.9, 0.999), weight_decay=0.1)
    val_set = get_validation_set()
    train_set = get_train_set()

    qwk, trained_model = train_model(model, loss, optimizer, train_set,
                                     val_set, device, gu.SCORE_RANGES[test_prompt]['holistic'],
                                     batch_size, num_epochs=num_epochs)
    print('params:', best_params)
    print('best qwk:', best_QWK)
    # NOTE(review): load_model_C_1 reads "model_C_1.pt", not this file name -
    # confirm which name is the intended one.
    torch.save(trained_model.state_dict(), "model_C_1Final++.pt")

    results = {
        'prompt num': test_prompt,
        'layers_size': layers_size,
        'hidden_size': hidden_size,
        'batch_size': batch_size,
        'qwk': qwk
    }

    print(results)

    print(f"\nBest Overall QWK: {best_QWK:.4f}")
    return best_QWK, results

In [None]:
# Run the hyper-parameter search, train the final model and save its weights.
train_and_save_models()

In [None]:
def load_model_C_1(path="model_C_1.pt", layers_size=0, hidden_size=0):
    """Rebuild a ``BERT_approachC`` model and load saved weights into it.

    Args:
        path: file containing the ``state_dict`` to load.
        layers_size: head configuration the weights were trained with
            (0 = no hidden layer); must match the saved model.
        hidden_size: hidden layer width the weights were trained with.

    Returns:
        The model with the loaded weights, on the CPU. The model is returned as
        constructed, so callers doing inference should call ``model.eval()``.
    """
    model = BERT_approachC('bert-base-uncased', layers_size, hidden_size, 86, 1)
    # map_location='cpu' lets weights saved from a GPU be loaded on a machine
    # without CUDA; load_state_dict then copies them into the CPU model.
    state_dict = torch.load(path, map_location="cpu")
    model.load_state_dict(state_dict)
    return model
def train_Deployed_model(layer_size,hidden_size,device):
    """Train a model on the essays of all 8 prompts and save it for deployment.

    No validation set is passed to ``train_model``. The weights are written to
    "model_C_1_deployed.pt".

    Args:
        layer_size: forwarded to ``BERT_approachC`` (0 = no hidden layer).
        hidden_size: hidden layer width forwarded to ``BERT_approachC``.
        device: device on which data and model are placed.

    Returns:
        The first value returned by ``train_model``.
    """
    model = BERT_approachC('bert-base-uncased', layer_size, hidden_size, 86, 1)
    model.to(device)
    loss = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.005,
                                  betas=(0.9, 0.999), weight_decay=0.1)

    # Gather the essays of every prompt, in prompt order.
    all_essays = [
        essay
        for prompt_index in range(8)
        for essay in prompts_dicts[prompt_index]['essays']
    ]

    def _column(key, np_dtype, torch_dtype):
        # Lists -> numpy array -> tensor on the target device.
        values = np.array([essay[key] for essay in all_essays], dtype=np_dtype)
        return torch.tensor(values, dtype=torch_dtype).to(device)

    features = _column('features', np.float32, torch.float32)
    scores = _column('holistic', np.float32, torch.float32)
    attention_mask = _column('attention_mask', np.int64, torch.int64)
    input_ids = _column('input_ids', np.int64, torch.int64)
    dataset = TensorDataset(features, scores, input_ids, attention_mask)

    qwk, trained_model = train_model(model, loss, optimizer, dataset, None, device, None, 32, num_epochs=5)
    torch.save(trained_model.state_dict(), "model_C_1_deployed.pt")
    return qwk


In [None]:
import torch
from transformers import BertModel
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Train the deployed model with no hidden layer (layer_size=0, hidden_size=0).
train_Deployed_model(0,0,device)