In [1]:
import general_utils as gu
data = gu.read_data("dataset.csv")

In [2]:
def scale_score(score, lower_bound, upper_bound):
    range = upper_bound - lower_bound
    return (score - lower_bound) / range # we divide the difference from the minimum value by the range to get a value between 0 and 1
def unscale_score(scaled_score, lower_bound, upper_bound):
    range=upper_bound-lower_bound
    return scaled_score*range+lower_bound


In [3]:
prompts_dicts=[] # we will store the data for each prompt in a list of dictionaries, one dictionary 
#  one dictionary  will have prompt_id and a list of essays for that prompt with their features and scaled holistic scores
for prompt_id in range(1,9):
    essays_for_prompt=[]
    for line_num in range(len(data['essay_ids'])):
        
        if data['prompt_ids'][line_num]==prompt_id:
            range_score=gu.SCORE_RANGES[prompt_id]['holistic']
            essays_for_prompt.append({'essay_id':data['essay_ids'][line_num],'holistic':scale_score(data['holistic'][line_num],range_score[0],range_score[1]),'features':data['features'][line_num]})

    prompts_dicts.append({'prompt_id':prompt_id,'essays':essays_for_prompt})
    
    




In [None]:

import torch.optim as optim
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch
num_hidden_layers = [1,2,4,8]
num_hidden_units_per_layer = [8,16,32]
learning_rate = [0.001,0.01,0.1]
batch_size_initial=4
batch_size = [8,16,32]
# design the layers based on input
def get_layers( input_size, layers_size,hidden_size, output_size):
    layers = []
    for i in range(layers_size):
        if i == 0:
            layers.append(nn.Linear(input_size, hidden_size))
        else:
            layers.append(nn.Linear(hidden_size, hidden_size))
        layers.append(nn.ReLU())
    layers.append(nn.Linear(hidden_size, output_size))
    return layers
class HolisticFFN(nn.Module):
    def __init__(self, input_size, layers_size,hidden_size, output_size):
        super(HolisticFFN, self).__init__()
        layers=get_layers(input_size, layers_size,hidden_size, output_size)
        self.model=nn.Sequential(*layers) # put the layers in sequential order in the neurtal network model
        
     
        self.apply(self.weights_HE_init)# He initialization for weights

    def forward(self, x):
        return self.model(x)
    def weights_HE_init(self,layer_in):
     if isinstance(layer_in, nn.Linear):
        nn.init.kaiming_normal_(layer_in.weight)
        layer_in.bias.data.fill_(0.0)





In [None]:
import numpy as np
def train_model(model, loss, optimizer, train_set, validation_set, device, range_score, batch_size=4, num_epochs=15):
    best_qwk = 0
    without_improvement = 0 # to decide when to early stop
    best_model = model
    model.to(device)# move the model to the gpu if available in the machine
    train_set = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    validation_setLoader = DataLoader(validation_set)
    

    for epoch in range(num_epochs ):
        model.train() 
        total_loss = 0
        for features, scores in train_set:
            features, scores = features.to(device), scores.to(device)
            
            optimizer.zero_grad()# zero the gradients for the previous iteration
            y_pred = model(features).view(-1)# view(-1) was used to flatten outputo ensure its same as true scores
          
            l = loss(y_pred, scores)
            l.backward()
            optimizer.step()
            total_loss += l.item()
        print(f"Epoch {epoch + 1} loss: {total_loss/len(train_set)}")
        if validation_set is None: # in some cases we may not have validation set like when training the deployed model
            continue
        model.eval()# set the model to evaluation mode
        predicted_scores, validation_scores = [], [] # to store the predicted and true scores
        total_loss = 0
        with torch.no_grad(): # we don't need to calculate gradients for validation set
            
            for features, score in validation_setLoader:
                features, score = features.to(device), score.to(device)

        


                y_pred = model(features).view(-1)
                l=loss(y_pred, score)
                total_loss += l.item()
                
                predicted_scores.append(y_pred.item())
                validation_scores.append(score.item())
               

          # qwk only works with integers
            validation_scores = [int(round(unscale_score(score, range_score[0], range_score[1]))) for score in validation_scores] #
            predicted_scores = [int(round(unscale_score(pred, range_score[0], range_score[1]))) for pred in predicted_scores]
            print('validation_scores:',validation_scores)
            print('predicted_scores:',predicted_scores)
            validation_qwk = gu.quadratic_weighted_kappa(validation_scores, predicted_scores)
            
            if validation_qwk > best_qwk:
                best_qwk = validation_qwk
                best_model = model
                without_improvement = 0
            else:
                without_improvement += 1

            if without_improvement == 4: # wait for 4 epochs without improvement before early stopping
                print(f"Early stopping on epoch {epoch + 1} qwk: {best_qwk}  loss: {total_loss/len(validation_setLoader)}")
                return best_qwk,best_model

    return best_qwk,best_model


In [6]:
import numpy as np
def cross_validation( prompt_num,layer_size,hidden_size,lr, device,batch_size=4, num_epochs=15):

    avg_QWKs = []
    # each prompt will be used as the validation set once
    for fold in range(8): 
        if prompt_num == fold+1: # we don't want to train on the validation set
            continue
        range_score=gu.SCORE_RANGES[fold+1]['holistic']
        # normal lists are not compatible with pytorch, so we convert them to  numpy arrays and then to tensors
        features = torch.tensor(np.array([essay['features'] for essay in prompts_dicts[fold]['essays']], dtype=np.float32)).to(device)
        scores = torch.tensor(np.array([essay['holistic'] for essay in prompts_dicts[fold]['essays']], dtype=np.float32)).to(device)

        validation_dataset=TensorDataset(features, scores)
        train_features = []
        train_scores = []
        for prompt_index in range(8):
          # we don't want to train on the validation set nor the test set
          if prompt_index == fold:
        
              continue
          if prompt_num == fold+1:
            continue
          train_features.extend([essay['features'] for essay in prompts_dicts[prompt_index]['essays']])
          train_scores.extend([essay['holistic'] for essay in prompts_dicts[prompt_index]['essays']])
        model=HolisticFFN(86,layer_size,hidden_size,1)
        model.to(device)
        loss=nn.MSELoss()

        optimizer = torch.optim.AdamW(model.parameters(), lr=lr,betas=(0.9, 0.999),  weight_decay=0.1)
       # normal lists are not compatible with pytorch, so we convert them to  numpy arrays and then to tensors
        train_features = torch.tensor(np.array(train_features, dtype=np.float32), dtype=torch.float32).to(device)
        train_scores = torch.tensor(np.array(train_scores, dtype=np.float32), dtype=torch.float32).to(device)
   
        train_dataset = TensorDataset(train_features, train_scores)
       
     
        train_qwk,model=train_model(model, loss, optimizer,train_dataset, validation_dataset,device,range_score,batch_size=batch_size,num_epochs=num_epochs)
        avg_QWKs.append(train_qwk )
    final_avg_QWK = sum(avg_QWKs) / len(avg_QWKs)
    print(f"Average QWK over all folds: {final_avg_QWK}")

    return final_avg_QWK
        


In [7]:
def batch_size_tuning( prompt_num,best_params, batch_sizes, device):
    best_batch_size = None
    best_QWK = 0
    for batch_size in batch_sizes:
        print(f"Testing batch size: {batch_size}")
        QWK = cross_validation(prompt_num,
          best_params[0],
           best_params[1],
            best_params[2],
          device,
            batch_size,
        )
        if QWK > best_QWK:
            best_QWK = QWK
            best_batch_size = batch_size
    print(f"Best Batch Size: {best_batch_size}, QWK: {best_QWK}")
    return best_batch_size,best_QWK


In [None]:
# we will get the training set based on the prompt number
# all prompt will be taken for the set except the prompt number because its used for testing
def get_trainSet(prompt_num,device):
    train_features = []
    train_scores = []
    for prompt_index in range(8):
        if prompt_num and prompt_index == prompt_num-1:
            continue
        train_features.extend([essay['features'] for essay in prompts_dicts[prompt_index]['essays']])
        train_scores.extend([essay['holistic'] for essay in prompts_dicts[prompt_index]['essays']])
    train_features = torch.tensor(np.array(train_features, dtype=np.float32), dtype=torch.float32).to(device)
    train_scores = torch.tensor(np.array(train_scores, dtype=np.float32), dtype=torch.float32).to(device)
    train_dataset = TensorDataset(train_features, train_scores)
    return train_dataset
# we will get the validation set based on the prompt number
def get_validationSet(prompt_num,device):
    features = torch.tensor(np.array([essay['features'] for essay in prompts_dicts[prompt_num-1]['essays']], dtype=np.float32)).to(device)
    scores = torch.tensor(np.array([essay['holistic'] for essay in prompts_dicts[prompt_num-1]['essays']], dtype=np.float32)).to(device)
    validation_dataset = TensorDataset(features, scores)
    return validation_dataset


In [9]:
def find_best_hyperparameters(prompt_num, device):
   
    best_QWK = 0
    best_params = None
    
  
    for layers_size in num_hidden_layers:
        for hidden_size in num_hidden_units_per_layer:
            for lr in learning_rate:
                print(f"Prompt {prompt_num} - Testing: Layers={layers_size}, "
                      f"Hidden size={hidden_size}, Learning rate={lr}")
                
                QWK = cross_validation(prompt_num, layers_size, hidden_size, lr, device, 4)
                
                if QWK > best_QWK:
                    best_QWK = QWK
                    best_params = (layers_size, hidden_size, lr)
                    print(f"New best QWK: {best_QWK:.4f} with params: {best_params}")
    
    # Tune batch size with best parameters
    best_batch_size, batch_qwk= batch_size_tuning(prompt_num, best_params, batch_size, device)
    if best_QWK > batch_qwk:
        best_batch_size = 4
    best_params = (*best_params, best_batch_size)
    
    return best_params, best_QWK


In [10]:
import torch
def train_and_save_models():
   
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")# gpu if available
    best_QWK_ALL = 0
    results = {}


    for prompt_num in range(1, 3):
        print(f"\nTraining model for Prompt {prompt_num}")
        
        best_params, best_QWK = find_best_hyperparameters(prompt_num, device)
        layers_size, hidden_size, lr, batch_size = best_params
        
       ## train model based on best hyperparameters
        model = HolisticFFN(86, layers_size, hidden_size, 1)
        model.to(device)
        loss = nn.MSELoss()
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, 
                                    betas=(0.9, 0.999), weight_decay=0.1)
        
        train_set = get_trainSet(prompt_num, device)
        val_set = get_validationSet(prompt_num, device)

        qwk, trained_model = train_model(model, loss, optimizer, train_set, 
                                       val_set, device, gu.SCORE_RANGES[prompt_num]['holistic'], 
                                       batch_size, num_epochs=15)
      
        scrtipted_model = torch.jit.script(trained_model)
        scrtipted_model.save(f"model-A-{prompt_num}.pt")
        
        
        results[prompt_num] = {
            'layers_size': layers_size,
            'hidden_size': hidden_size,
            'learning_rate': lr,
            'batch_size': batch_size,
            'qwk': qwk
        }
        
        print(f"\nPrompt {prompt_num} Results:")
        print(f"Best Parameters: {best_params}")
        print(f"Final QWK: {qwk:.4f}")
 
        
        if qwk > best_QWK_ALL:
            best_QWK_ALL = qwk
    
    print("\n Summary of Results:")
    for prompt_num, params in results.items():
        print(f"\nPrompt {prompt_num}:")
        print(f"Layers: {params['layers_size']}")
        print(f"Hidden Size: {params['hidden_size']}")
        print(f"Learning Rate: {params['learning_rate']}")
        print(f"Batch Size: {params['batch_size']}")
        print(f"QWK: {params['qwk']:.4f}")
    
    print(f"\nBest Overall QWK: {best_QWK_ALL:.4f}")
    return best_QWK_ALL, results

In [11]:

train_and_save_models()


Training model for Prompt 1
Prompt 1 - Testing: Layers=1, Hidden size=8, Learning rate=0.001
Epoch 1 loss: 0.033879262082579034
validation_scores: [4, 1, 2, 4, 4, 4, 5, 2, 4, 4, 3, 3, 5, 3, 3, 3, 4, 3, 1, 3, 3, 3, 3, 4, 3, 4, 4, 2, 3, 3, 4, 3, 3, 4, 3, 4, 4, 3, 3, 4, 3, 4, 4, 4, 4, 4, 4, 3, 4, 3, 5, 3, 4, 3, 4, 4, 4, 2, 3, 4, 2, 3, 3, 4, 4, 3, 3, 4, 3, 3, 3, 4, 2, 4, 4, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 3, 3, 1, 4, 2, 4, 3, 4, 4, 4, 4, 4, 3, 5, 3, 3, 4, 4, 3, 3, 3, 3, 2, 2, 3, 3, 3, 2, 3, 3, 4, 2, 1, 3, 3, 4, 3, 3, 3, 3, 5, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 2, 4, 3, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 3, 4, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 4, 3, 2, 3, 3, 4, 3, 4, 4, 3, 3, 4, 4, 2, 1, 3, 3, 4, 5, 3, 4, 4, 3, 4, 3, 3, 5, 4, 3, 3, 4, 4, 1, 4, 4, 3, 4, 2, 3, 3, 4, 2, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3, 2, 4, 4, 4, 5, 4, 3, 3, 4, 3, 4, 3, 4, 3, 4, 4, 4, 3, 4, 4, 3, 4, 3, 3, 3, 3, 3, 4, 4, 3, 4, 4, 2, 4, 3, 3, 3, 4, 4, 2, 2, 4, 2, 1, 4, 4, 2, 2, 3, 4, 4, 3, 2, 3, 4, 4, 5, 3, 3, 1, 4, 2, 4, 4,

KeyboardInterrupt: 

In [25]:
## train the deployed model
import torch
def train_Deployed():
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            model = HolisticFFN(86, 1, 32, 1)
            model.to(device)
            loss = nn.MSELoss()
            optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, betas=(0.9, 0.999), weight_decay=0.1)
            train_set = get_trainSet(None, device)

            qwk, trained_model = train_model(model, loss, optimizer, train_set, None, device,None, 8, num_epochs=15)
            scrtipted_model = torch.jit.script(trained_model)
            scrtipted_model.save("model-A-deploy.pt")

    

train_Deployed()

Epoch 1 loss: 0.026377689563606106
Epoch 2 loss: 0.02124779043692186
Epoch 3 loss: 0.020774515122120668
Epoch 4 loss: 0.020323486224034487
Epoch 5 loss: 0.02031435543136113
Epoch 6 loss: 0.02011306774689105
Epoch 7 loss: 0.019970201749171437
Epoch 8 loss: 0.019756000666777487
Epoch 9 loss: 0.01970939425992436
Epoch 10 loss: 0.019839951791915396
Epoch 11 loss: 0.01958637582300387
Epoch 12 loss: 0.01963564121720907
Epoch 13 loss: 0.019618336626383938
Epoch 14 loss: 0.019421054552950574
Epoch 15 loss: 0.01951118941021493
