In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader,TensorDataset
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from transformers import BertTokenizer,BertForSequenceClassification
import random

In [5]:
# Location of the preprocessed training CSV, held in one named constant so it is easy to repoint.
TRAIN_CSV_PATH=r'C:\Users\DELL\Documents\MAJOR PROJECT\train_preprocessed.csv'
data=pd.read_csv(TRAIN_CSV_PATH)

In [6]:
# Display the (row count, column count) of the loaded frame.
data.shape

(99887, 3)

In [None]:
# Work on a random subset of at most 1000 rows to keep experiments fast.
# random_state pins the draw so a re-run picks the same rows, and min() keeps
# the call valid when the frame holds fewer than 1000 rows.
data=data.sample(n=min(1000,len(data)),random_state=42)

In [7]:
# Print column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99887 entries, 0 to 99886
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   similarity  99887 non-null  int64 
 1   sentence1   99887 non-null  object
 2   sentence2   99884 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


In [8]:
# Drop every row that has a missing value; the sentence columns are later passed pairwise to the tokenizer.
data=data.dropna()

In [9]:
# Sanity check: per-column count of null values that remain.
data.isnull().sum()

similarity    0
sentence1     0
sentence2     0
dtype: int64

In [10]:
# Pair the two sentence columns row by row as (sentence1, sentence2) tuples.
# Zipping the columns yields the same pairs as walking iterrows(), without
# building a Series object for every row.
training_data=list(zip(data['sentence1'],data['sentence2']))

In [11]:
# Checkpoint name shared by the tokenizer and the classifier.
model_name='bert-base-uncased'
tokenizer=BertTokenizer.from_pretrained(model_name)
# num_labels=3 requests a three-way classification head; presumably this matches
# the distinct values of the `similarity` column -- TODO confirm.
model=BertForSequenceClassification.from_pretrained(model_name,num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def tokenization(sent1,sent2):
  """Encode one sentence pair with the module-level `tokenizer`.

  Args:
    sent1: first sentence of the pair.
    sent2: second sentence of the pair.

  Returns:
    Tuple `(input_ids, attention_masks)` read from the tokenizer output,
    which is requested as PyTorch tensors via `return_tensors='pt'`.
  """
  encode_options=dict(
      add_special_tokens=True,
      padding=True,
      truncation=True,
      return_tensors='pt',
  )
  pair_encoding=tokenizer.encode_plus(sent1,sent2,**encode_options)
  return pair_encoding['input_ids'],pair_encoding['attention_mask']

# Encode each pair on its own; index 0 picks the single encoded sequence out of
# every tensor returned by tokenization().
encoded_pairs=[tokenization(first,second) for first,second in training_data]
input_ids=[ids[0] for ids,_ in encoded_pairs]
attention_masks=[masks[0] for _,masks in encoded_pairs]

# Pad the variable-length sequences into rectangular, batch-first tensors.
input_ids=pad_sequence(input_ids,batch_first=True)
attention_masks=pad_sequence(attention_masks,batch_first=True)

# Labels come straight from the `similarity` column, in the same row order as training_data.
similarity_tensor=torch.tensor(data['similarity'].values)

# Bundle inputs, masks and labels, then serve them one sample per batch.
dataset=TensorDataset(input_ids,attention_masks,similarity_tensor)
dataloader=DataLoader(dataset,batch_size=1)

In [13]:
# Number of batches; the loader was built with batch_size=1, so this equals the number of samples.
len(dataloader)

99884

In [14]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to that device; as the cell's last expression this also echoes the model layout.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Number of candidate learning rates created for the initial population.
POPULATION_SIZE = 10
# Number of evaluate/select/breed rounds run by genetic_algorithm.
GENERATIONS = 3
# Probability that mutate() rescales a child's learning rate.
MUTATION_RATE = 0.1

In [16]:
def initialize_population(population_size):
  """Create the starting population of candidate learning rates.

  Args:
    population_size: number of individuals to generate.

  Returns:
    List of dicts, each with a single 'learning_rate' key whose value is a
    uniform draw from [1, 9] scaled by 10**-5.
  """
  return [
      {'learning_rate':random.uniform(1,9)*10**-5}
      for _ in range(population_size)
  ]

def fitness(individual,model,dataloader,device):
    """Score a candidate learning rate by one training pass (lower is better).

    The model is trained for a single pass over `dataloader` with AdamW at
    `individual['learning_rate']`, and the mean per-batch loss is returned.
    The model's weights are snapshotted before the pass and restored after it,
    so every individual is evaluated from the same starting point and scores
    do not depend on the order in which individuals are evaluated.

    Args:
        individual: dict holding a 'learning_rate' entry.
        model: module called as `model(input_ids, attention_mask=..., labels=...)`
            whose output exposes `.loss`.
        dataloader: sized iterable of `(input_ids, attention_masks, labels)` batches.
        device: device the batches are moved to before the forward pass.

    Returns:
        Mean training loss over the batches, as a float.
    """
    # Keep the snapshot on the CPU so it does not occupy accelerator memory.
    initial_state={
        name:tensor.detach().to('cpu',copy=True)
        for name,tensor in model.state_dict().items()
    }
    optimizer=AdamW(model.parameters(),lr=individual['learning_rate'])
    model.train()
    total_loss=0
    try:
        for batch in dataloader:
            input_ids,attention_masks,labels=batch
            input_ids,attention_masks,labels=input_ids.to(device),attention_masks.to(device),labels.to(device)
            outputs=model(input_ids,attention_mask=attention_masks,labels=labels)
            loss=outputs.loss
            total_loss+=loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    finally:
        # Restore the starting weights even if the pass is interrupted.
        model.load_state_dict(initial_state)
    return total_loss/len(dataloader)

def selection(population,fitness_scores):
    """Keep the better-scoring half of the population (lower score is better).

    Individuals are ranked on their score alone. The sort is stable, so
    individuals with equal scores keep their original relative order instead
    of falling back to comparing the individuals themselves (dicts cannot be
    ordered, which would raise TypeError on a tie).

    Args:
        population: list of individuals.
        fitness_scores: scores aligned index-by-index with `population`.

    Returns:
        The `len(population)//2` best individuals, best first. A population of
        one is returned as-is; an empty population yields an empty list.
    """
    ranked=sorted(zip(fitness_scores,population),key=lambda scored:scored[0])
    sorted_population=[individual for _,individual in ranked]
    num_parents=len(population)//2
    if num_parents==0 and len(population)>0:
        # A single individual cannot be halved; keep it so the search can continue.
        return [population[0]]
    return sorted_population[:num_parents]

def cross_over(parents):
    """Breed children from consecutive pairs of parents.

    Parents are paired as (0, 1), (2, 3), ...; each pair produces two children
    whose learning rate is the mean of the pair's learning rates. The two
    children are separate dicts, so they can be mutated independently. A
    trailing unpaired parent produces no children.

    Args:
        parents: sequence of individuals, each a dict with 'learning_rate'.

    Returns:
        List of child individuals (empty when fewer than two parents are given).
    """
    children=[]
    # Step through pair start indices; stopping before len-1 guarantees i+1 is valid.
    for i in range(0,len(parents)-1,2):
        parent1=parents[i]
        parent2=parents[i+1]
        blended=(parent1['learning_rate']+parent2['learning_rate'])/2
        children.append({'learning_rate':blended})
        children.append({'learning_rate':blended})
    return children

def mutate(individual,mutation_rate):
    """Randomly rescale an individual's learning rate in place.

    With probability `mutation_rate` the learning rate is multiplied by
    10**u, where u is drawn uniformly from [-0.5, 0.5].

    Args:
        individual: dict with a 'learning_rate' entry; modified in place.
        mutation_rate: probability of applying the mutation.

    Returns:
        The same `individual` object that was passed in.
    """
    if not random.random()<mutation_rate:
        return individual
    scale=10**random.uniform(-0.5,0.5)
    individual['learning_rate']=individual['learning_rate']*scale
    return individual

def genetic_algorithm(model,dataloader,device):
    """Search for a learning rate with a small genetic algorithm.

    Starts from `POPULATION_SIZE` random individuals and repeats, for
    `GENERATIONS` rounds: score every individual with `fitness`, remember the
    lowest score seen so far, select parents, breed children with `cross_over`,
    mutate the children at `MUTATION_RATE`, and form the next population from
    parents plus mutated children.

    Args:
        model: model handed through to `fitness`.
        dataloader: batches handed through to `fitness`.
        device: device handed through to `fitness`.

    Returns:
        The individual with the lowest fitness score observed in any
        generation, or None if no score ever beat infinity.
    """
    best_fitness=float('inf')
    best_individual=None
    population=initialize_population(POPULATION_SIZE)

    for generation in range(GENERATIONS):
        fitness_scores=[]
        for candidate in population:
            fitness_scores.append(fitness(candidate,model,dataloader,device))

        # Track the best score across generations, not just within this one.
        generation_best=min(fitness_scores)
        if generation_best<best_fitness:
            best_fitness=generation_best
            best_individual=population[fitness_scores.index(generation_best)]

        print(f'Generation {generation+1}, Best Fitness score : {best_fitness}')

        parents=selection(population,fitness_scores)
        offspring=[mutate(child,MUTATION_RATE) for child in cross_over(parents)]
        population=parents+offspring

    return best_individual

# Launch the learning-rate search. Every fitness evaluation inside it is a full
# training pass over `dataloader`, so this cell can run for a long time.
best_parameter=genetic_algorithm(model,dataloader,device)

# Report the winning individual (a dict holding 'learning_rate').
print('Best parameter :', best_parameter)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


KeyboardInterrupt: 

In [None]:
# Final fine-tuning with the learning rate picked by the genetic search.
# The loop is switched off by default; set RUN_FINAL_TRAINING=True to run it.
RUN_FINAL_TRAINING=False
if RUN_FINAL_TRAINING:
    optimizer=AdamW(model.parameters(),lr=best_parameter['learning_rate'])
    epochs=100
    for epoch in range(epochs):
        model.train()
        total_loss=0
        for batch in dataloader:
            input_ids,attention_masks,labels=batch
            input_ids,attention_masks,labels=input_ids.to(device),attention_masks.to(device),labels.to(device)
            outputs=model(input_ids,attention_mask=attention_masks,labels=labels)
            loss=outputs.loss
            total_loss+=loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # Mean per-batch loss for the epoch.
        loss=total_loss/len(dataloader)
        print('Epoch : ',epoch+1,'----> Loss : ',loss)

In [None]:
#model.save_pretrained('fine-tuned-bert3')