Importing libraries

In [2]:
## Import required libraries
import torch
from torch import nn
from transformers import BertTokenizer, BertModel, AdamW
import random
import os
import warnings
warnings.filterwarnings('ignore')

In [3]:
output_dir=r'C:\Users\DELL\Documents\ML4SCI\model_gen-algo_weights'
os.makedirs(output_dir,exist_ok=True)

In [4]:
train_data=torch.load(r'C:\Users\DELL\Documents\ML4SCI\tokenized_train.pt')
val_data=torch.load(r'C:\Users\DELL\Documents\ML4SCI\tokenized_val.pt')
train_data.shape,val_data.shape

((78, 2), (9, 2))

In [5]:
train_data=train_data.reset_index(drop=True)
val_data=val_data.reset_index(drop=True)

In [6]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
POPULATION_SIZE = 15
GENERATIONS = 100
MUTATION_RATE = 0.1
tokenizer=BertTokenizer.from_pretrained(r'C:\Users\DELL\Documents\ML4SCI\basic-bert-tokenizer')
bert=BertModel.from_pretrained('bert-base-uncased')

In [7]:
class TranformerModel(nn.Module):
    def __init__(self,bert_model,vocab_size):
        super(TranformerModel,self).__init__()
        self.bert=bert_model
        self.decoder=nn.Linear(768,vocab_size)
        
    def forward(self,input_ids,attention_mask):
        outputs=self.bert(input_ids=input_ids,attention_mask=attention_mask)
        last_hidden_state=outputs.last_hidden_state
        logits=self.decoder(last_hidden_state)
        return logits
    
def initialize_population(population_size):
    population=[]
    for _ in range(population_size):
        learning_rate=10**random.uniform(-7,-1)
        population.append({'learning_rate':learning_rate})
    return population

def fitness(individual,model,train_data,val_data,device):
    train_inputs=list(train_data['Features'].values)
    train_outputs=list(train_data['Targets'].values)
    val_inputs=list(val_data['Features'].values)
    val_outputs=list(val_data['Targets'].values)
    
    model=TranformerModel(bert,tokenizer.vocab_size).to(device)
    optimizer=AdamW(model.parameters(),lr=individual['learning_rate'])
    criterion=nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    model.train()
    
    for i in range(len(train_inputs)):
        train_input_ids=train_inputs[i]['input_ids'].to(device)
        train_attention_mask=train_inputs[i]['attention_mask'].to(device)
        train_output_ids=train_outputs[i]['input_ids'].to(device)
        
        optimizer.zero_grad()
        logits=model(train_input_ids,train_attention_mask)
        loss=criterion(logits.view(-1,logits.size(-1)),train_output_ids.view(-1))
        loss.backward()
        optimizer.step()
        
    model.eval()
    val_loss=0
    with torch.no_grad():
        for i in range(len(val_inputs)):
            val_input_ids=val_inputs[i]['input_ids'].to(device)
            val_attention_mask=val_inputs[i]['attention_mask'].to(device)
            val_output_ids=val_outputs[i]['input_ids'].to(device)
            
            logits=model(val_input_ids,val_attention_mask)
            loss=criterion(logits.view(-1,logits.size(-1)),val_output_ids.view(-1))
            val_loss+=loss.item()
        
        avg_loss=val_loss/len(val_inputs)
        return avg_loss
    
def selection(population,fitness_scores):
    sorted_population=[p for _,p in sorted(zip(fitness_scores,population))]
    num_parents=len(population)//2
    if num_parents==0  and len(population)>0:
        return [population[0]]
    return sorted_population[:num_parents]

def cross_over(parents):
    children=[]
    for i in range(len(parents),2):
        if i+1<len(parents):
            parent1=parents[i]
            parent2=parents[i+1]
            child1={'learning_rate':(parent1['learning_rate']+parent2['learning_rate'])/2}
            child2={'learning_rate':(parent1['learning_rate']+parent2['learning_rate'])/2}
            children.extend([child1,child2])
    return children

def mutate(individual,mutation_rate):
    if random.random()<mutation_rate:
        individual['learning_rate']*=10**random.uniform(-0.5,0.5)
    return individual

def genetic_algorithm(model,train_data,val_data,device):
    population=initialize_population(POPULATION_SIZE)
    best_individual=None
    best_fitness=float('inf')
    
    for generation in range(GENERATIONS):
        fitness_scores=[fitness(individual,model,train_data,val_data,device) for individual in population]
        
        if min(fitness_scores)<best_fitness:
            best_fitness=min(fitness_scores)
            best_individual=population[fitness_scores.index(min(fitness_scores))]
            
        print(f'Generation {generation+1}, Best Fitness score : {best_fitness}')
        if generation%5==0:
            weights_path=os.path.join(output_dir,f'weights_step_{generation+1}.pth')
            torch.save(model.state_dict(),weights_path)
            print(f'Saved weights at step {generation+1}')
            
        parents=selection(population,fitness_scores)
        children=cross_over(parents)
        mutated=[mutate(child,MUTATION_RATE) for child in children]
        population=parents+mutated
        
    return best_individual

best_parameter=genetic_algorithm(bert,train_data,val_data,device)

print('Best parameter :', best_parameter)

Generation 1, Best Fitness score : 3.8364491727617054
Saved weights at step 1
Generation 2, Best Fitness score : 3.8324600325690374
Generation 3, Best Fitness score : 3.8324600325690374
Generation 4, Best Fitness score : 3.8286254670884876
Generation 5, Best Fitness score : 3.8251503573523626
Generation 6, Best Fitness score : 3.8251503573523626
Saved weights at step 6
Generation 7, Best Fitness score : 3.8251503573523626
Generation 8, Best Fitness score : 3.8251503573523626
Generation 9, Best Fitness score : 3.8238841427697077
Generation 10, Best Fitness score : 3.8238841427697077
Generation 11, Best Fitness score : 3.8238841427697077
Saved weights at step 11
Generation 12, Best Fitness score : 3.823054472605387
Generation 13, Best Fitness score : 3.823054472605387
Generation 14, Best Fitness score : 3.823054472605387
Generation 15, Best Fitness score : 3.823054472605387
Generation 16, Best Fitness score : 3.8219240771399603
Saved weights at step 16
Generation 17, Best Fitness score :

In [8]:
train_inputs=list(train_data['Features'].values)
train_outputs=list(train_data['Targets'].values)
val_inputs=list(val_data['Features'].values)
val_outputs=list(val_data['Targets'].values)


class TranformerModel(nn.Module):
    def __init__(self,bert_model,vocab_size):
        super(TranformerModel,self).__init__()
        self.bert=bert_model
        self.decoder=nn.Linear(768,vocab_size)
        
    def forward(self,input_ids,attention_mask):
        outputs=self.bert(input_ids=input_ids,attention_mask=attention_mask)
        last_hidden_state=outputs.last_hidden_state
        logits=self.decoder(last_hidden_state)
        return logits
    
model=TranformerModel(bert,tokenizer.vocab_size)
learning_rate=0.003849405145532908
optimizer=AdamW(model.parameters(),lr=learning_rate)
criterion=nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

epochs=100
model.to(device)

for epoch in range(0,epochs):
    model.train()
    total_loss=0
    for i in range(0,len(train_inputs)):
        train_input_ids=train_inputs[i]['input_ids'].to(device)
        train_attention_mask = train_inputs[i]['attention_mask'].to(device)
        train_output_ids = train_outputs[i]['input_ids'].to(device)
        
        optimizer.zero_grad()
        logits=model(train_input_ids,train_attention_mask)
        loss=criterion(logits.view(-1,logits.size(-1)),train_output_ids.view(-1))
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
    
    avg_loss=total_loss/len(train_inputs)
    print(f'Epoch :{epoch}----> Train_loss :{avg_loss:.5f}')
    if epoch%50==0:
        weights_path=os.path.join(output_dir,f'weights_step_{epoch+1}.pth')
        torch.save(model.state_dict(),weights_path)
        print(f'Saved weights at step {epoch+1}')
    
    model.eval()
    val_loss=0
    with torch.no_grad():
        for i in range(0,len(val_inputs)):
            val_input_ids=val_inputs[i]['input_ids'].to(device)
            val_attention_mask = val_inputs[i]['attention_mask'].to(device)
            val_output_ids = val_outputs[i]['input_ids'].to(device)
            
            logits=model(val_input_ids,val_attention_mask)
            loss=criterion(logits.view(-1,logits.size(-1)),val_output_ids.view(-1))
            val_loss+=loss.item()
            
        avg_val_loss=val_loss/len(val_inputs)
        print(f'              Validation_loss :{avg_val_loss:.5f}')

Epoch :0----> Train_loss :4.65470
Saved weights at step 1
              Validation_loss :3.81999
Epoch :1----> Train_loss :3.56730
              Validation_loss :3.79237
Epoch :2----> Train_loss :3.52157
              Validation_loss :3.80559
Epoch :3----> Train_loss :3.50704
              Validation_loss :3.81196
Epoch :4----> Train_loss :3.49577
              Validation_loss :3.80949
Epoch :5----> Train_loss :3.48544
              Validation_loss :3.80327
Epoch :6----> Train_loss :3.47463
              Validation_loss :3.80229
Epoch :7----> Train_loss :3.47286
              Validation_loss :3.78951
Epoch :8----> Train_loss :3.46633
              Validation_loss :3.78267
Epoch :9----> Train_loss :3.45782
              Validation_loss :3.81319
Epoch :10----> Train_loss :3.45928
              Validation_loss :3.83287
Epoch :11----> Train_loss :3.45834
              Validation_loss :3.81172
Epoch :12----> Train_loss :3.45317
              Validation_loss :3.80009
Epoch :13----> Train_los

In [9]:
import joblib
model_dir=r'C:\Users\DELL\Documents\ML4SCI\bert-with-genalgo-model.joblib'
joblib.dump(model,model_dir)

['C:\\Users\\DELL\\Documents\\ML4SCI\\bert-with-genalgo-model.joblib']