Importing libraries

In [1]:
## Import required libraries
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from transformers import BertTokenizer, BertModel, AdamW
import warnings
warnings.filterwarnings('ignore')

Storing model weights

In [7]:
## Make a directory for storing model weights
output_dir=r'C:\Users\DELL\Documents\ML4SCI\basic-model-weights'
os.makedirs(output_dir,exist_ok=True)

Data Preprocessing

In [2]:
## Load target dataset
def load_data(df):
    df=pd.read_csv(df)
    return df

## Map Features with corresponding filenames in target dataset and form a feature data
def map_features_to_names(feature_paths,file_names):
  features=[]

  file_names=list(file_names)

  for filename in os.listdir(feature_paths):
    file_path=os.path.join(feature_paths,filename)
    if os.path.isfile(file_path):
      if filename in file_names:
        features.append({'Filepath':file_path,'Filename':filename})
  return pd.DataFrame(features)

## Join feature data with target data
def get_raw_train_data(feature_data,df,filename,input,target):
    joined_df=pd.merge(feature_data,df,on=filename,how='inner')
    train_df=joined_df[[input,target]]
    return train_df

## Extract content from feature files
def preprocess_inputs(input_file):
   file=open(input_file,'r')
   inp=file.read()
   return inp

In [3]:
Feynman_with_units_path=r'C:\Users\DELL\Documents\ML4SCI\FeynmanEquations.csv'
data=load_data(Feynman_with_units_path)
data.shape

(130, 35)

In [4]:
Features_path=r'C:\Users\DELL\Documents\ML4SCI\Feynman_with_units\Feynman_with_units'
file_names=data['Filename']
data_with_feats=map_features_to_names(Features_path,file_names)
data_with_feats.head()

Unnamed: 0,Filepath,Filename
0,C:\Users\DELL\Documents\ML4SCI\Feynman_with_un...,I.10.7
1,C:\Users\DELL\Documents\ML4SCI\Feynman_with_un...,I.11.19
2,C:\Users\DELL\Documents\ML4SCI\Feynman_with_un...,I.12.1
3,C:\Users\DELL\Documents\ML4SCI\Feynman_with_un...,I.12.11
4,C:\Users\DELL\Documents\ML4SCI\Feynman_with_un...,I.12.2


In [5]:
training_data=get_raw_train_data(data_with_feats,data,'Filename','Filepath','Formula')
training_data.head()

Unnamed: 0,Filepath,Formula
0,C:\Users\DELL\Documents\ML4SCI\Feynman_with_un...,m_0/sqrt(1-v**2/c**2)
1,C:\Users\DELL\Documents\ML4SCI\Feynman_with_un...,x1*y1+x2*y2+x3*y3
2,C:\Users\DELL\Documents\ML4SCI\Feynman_with_un...,mu*Nn
3,C:\Users\DELL\Documents\ML4SCI\Feynman_with_un...,q*(Ef+B*v*sin(theta))
4,C:\Users\DELL\Documents\ML4SCI\Feynman_with_un...,q1*q2*r/(4*pi*epsilon*r**3)


In [6]:
training_data['Filepath']=training_data['Filepath'].map(preprocess_inputs)
training_data.head()

Unnamed: 0,Filepath,Formula
0,1.6464076172823914 1.5889609877804642 5.755668...,m_0/sqrt(1-v**2/c**2)
1,1.6823171277495557 2.849193436404849 2.5054347...,x1*y1+x2*y2+x3*y3
2,1.1136223150728854 4.237022171434605 4.7184424...,mu*Nn
3,1.6988691007665642 1.3752399346386226 2.472610...,q*(Ef+B*v*sin(theta))
4,3.5129800991097357 4.924224713199264 4.0911408...,q1*q2*r/(4*pi*epsilon*r**3)


In [7]:
training_data.to_csv('training_data.csv',index=None)

Loading Tokenizer and Transformer Model

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')## Load BERT Tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased')## Load BERT Model

Apply Tokenization

In [10]:
## Tokenization
def tokenize(target_data):
  target_encodings=tokenizer(
      target_data,
      max_length=128,
      padding='max_length',
      truncation=True,
      return_tensors='pt'
  )
  return target_encodings

## Apply tokenization and get data containing features and targets
feats=[]## Features list
targets=[]## Targets list
data=pd.DataFrame()## DataFrame for features and targets
for i in range(0,len(training_data['Formula'])):
     inp_encoded=tokenize(training_data['Filepath'][i])## Tokenize features
     out_encoded=tokenize(training_data['Formula'][i])## Tokenize targets
     #target=target.flatten()
     feats.append(inp_encoded)
     targets.append(out_encoded)
     print(f'Done : {i}')
data['Features']=feats
data['Targets']=targets
print(data.shape)

Done : 0
Done : 1
Done : 2
Done : 3
Done : 4
Done : 5
Done : 6
Done : 7
Done : 8
Done : 9
Done : 10
Done : 11
Done : 12
Done : 13
Done : 14
Done : 15
Done : 16
Done : 17
Done : 18
Done : 19
Done : 20
Done : 21
Done : 22
Done : 23
Done : 24
Done : 25
Done : 26
Done : 27
Done : 28
Done : 29
Done : 30
Done : 31
Done : 32
Done : 33
Done : 34
Done : 35
Done : 36
Done : 37
Done : 38
Done : 39
Done : 40
Done : 41
Done : 42
Done : 43
Done : 44
Done : 45
Done : 46
Done : 47
Done : 48
Done : 49
Done : 50
Done : 51
Done : 52
Done : 53
Done : 54
Done : 55
Done : 56
Done : 57
Done : 58
Done : 59
Done : 60
Done : 61
Done : 62
Done : 63
Done : 64
Done : 65
Done : 66
Done : 67
Done : 68
Done : 69
Done : 70
Done : 71
Done : 72
Done : 73
Done : 74
Done : 75
Done : 76
Done : 77
Done : 78
Done : 79
Done : 80
Done : 81
Done : 82
Done : 83
Done : 84
Done : 85
Done : 86
Done : 87
Done : 88
Done : 89
Done : 90
Done : 91
Done : 92
Done : 93
Done : 94
Done : 95
Done : 96
(97, 2)


Splitting of prepared data

In [11]:
## Splitting data into training data, validation data and testing data
train_data,test_data=train_test_split(data,test_size=0.1,random_state=42)## Data into Training data and Testing data
train_data,val_data=train_test_split(train_data,test_size=0.1,random_state=42)## Training data into Training data and Validation data
print(f'Training data shape:{train_data.shape}, Validation data shape:{val_data.shape}, Testing data shape:{test_data.shape}')

Training data shape:(78, 2), Validation data shape:(9, 2), Testing data shape:(10, 2)


In [None]:
## Convert features and targets into separate lists for training
train_inputs = list(train_data['Features'])## Training data features
train_outputs = list(train_data['Targets'])## Training data targets
val_inputs = list(val_data['Features'])## Validation data features
val_outputs = list(val_data['Targets'])## Validation data targets
print(type(train_inputs[0]))
print(type(val_inputs[0]))
print(type(train_outputs[0]))
print(type(val_outputs[0]))

<class 'transformers.tokenization_utils_base.BatchEncoding'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>


Model Training and Evaluation

In [12]:
class TranformerModel(nn.Module):
    def __init__(self,bert_model,vocab_size):
        super(TranformerModel,self).__init__()
        self.bert=bert_model
        self.decoder=nn.Linear(768,vocab_size)
        
    def forward(self,input_ids,attention_mask):
        outputs=self.bert(input_ids=input_ids,attention_mask=attention_mask)
        last_hidden_state=outputs.last_hidden_state
        logits=self.decoder(last_hidden_state)
        return logits
    
model=TranformerModel(bert_model,tokenizer.vocab_size)
optimizer=AdamW(model.parameters(),lr=0.01)
criterion=nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

epochs=100
model.to(device)

for epoch in range(0,epochs):
    model.train()
    total_loss=0
    for i in range(0,len(train_inputs)):
        train_input_ids=train_inputs[i]['input_ids'].to(device)
        train_attention_mask = train_inputs[i]['attention_mask'].to(device)
        train_output_ids = train_outputs[i]['input_ids'].to(device)
        
        optimizer.zero_grad()
        logits=model(train_input_ids,train_attention_mask)
        loss=criterion(logits.view(-1,logits.size(-1)),train_output_ids.view(-1))
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
    
    avg_loss=total_loss/len(train_inputs)
    print(f'Epoch :{epoch}----> Train_loss :{avg_loss:.5f}')
    if epoch%50==0:
        weights_path=os.path.join(output_dir,f'weights_step_{epoch+1}.pth')
        torch.save(model.state_dict(),weights_path)
        print(f'Saved weights at step {epoch+1}')
    
    model.eval()
    val_loss=0
    with torch.no_grad():
        for i in range(0,len(val_inputs)):
            val_input_ids=val_inputs[i]['input_ids'].to(device)
            val_attention_mask = val_inputs[i]['attention_mask'].to(device)
            val_output_ids = val_outputs[i]['input_ids'].to(device)
            
            logits=model(val_input_ids,val_attention_mask)
            loss=criterion(logits.view(-1,logits.size(-1)),val_output_ids.view(-1))
            val_loss+=loss.item()
            
        avg_val_loss=val_loss/len(val_inputs)
        print(f'              Validation_loss :{avg_val_loss:.5f}')

Epoch :0----> Train_loss :4.42942
Saved weights at step 1
              Validation_loss :3.82955
Epoch :1----> Train_loss :3.59224
              Validation_loss :3.83394
Epoch :2----> Train_loss :3.56773
              Validation_loss :3.79925
Epoch :3----> Train_loss :3.52066
              Validation_loss :3.80022
Epoch :4----> Train_loss :3.50960
              Validation_loss :3.80976
Epoch :5----> Train_loss :3.50734
              Validation_loss :3.79178
Epoch :6----> Train_loss :3.49161
              Validation_loss :3.78172
Epoch :7----> Train_loss :3.48461
              Validation_loss :3.82335
Epoch :8----> Train_loss :3.47976
              Validation_loss :3.79102
Epoch :9----> Train_loss :3.47293
              Validation_loss :3.77926
Epoch :10----> Train_loss :3.45455
              Validation_loss :3.82262
Epoch :11----> Train_loss :3.46036
              Validation_loss :3.81900
Epoch :12----> Train_loss :3.45217
              Validation_loss :3.78189
Epoch :13----> Train_los

Saving tokenizer 

In [15]:
save_dir=r'C:\Users\DELL\Documents\ML4SCI\basic-bert-tokenizer'
tokenizer.save_pretrained(save_dir)

('C:\\Users\\DELL\\Documents\\ML4SCI\\basic-bert-tokenizer\\tokenizer_config.json',
 'C:\\Users\\DELL\\Documents\\ML4SCI\\basic-bert-tokenizer\\special_tokens_map.json',
 'C:\\Users\\DELL\\Documents\\ML4SCI\\basic-bert-tokenizer\\vocab.txt',
 'C:\\Users\\DELL\\Documents\\ML4SCI\\basic-bert-tokenizer\\added_tokens.json')

Model Saving

In [18]:
import joblib
model_dir=r'C:\Users\DELL\Documents\ML4SCI\basic-bert-model.joblib'
joblib.dump(model,model_dir)

['C:\\Users\\DELL\\Documents\\ML4SCI\\basic-bert-model.joblib']

Saving tokenized data

In [None]:
torch.save(train_data,'tokenized_train.pt')
torch.save(test_data,'tokenized_test.pt')
torch.save(val_data,'tokenized_val.pt')

Prediction

In [None]:
def predict(model,input):
    model.eval()
    #input_tokens=tokenizer(
        #input,
        #max_length=128,
        #padding='max_length',
        #truncation=True,
        #return_tensors='pt'
    #).to(device)
    
    with torch.no_grad():
        logits=model(input['input_ids'].to(device),input['attention_mask'].to(device))
        predicted_tokens=torch.argmax(logits,dim=-1)
        return predicted_tokens

test_inputs=list(test_data['Features'])
test_outputs=list(test_data['Targets'])

for i in range(0,len(test_inputs)):
    pred=predict(model,test_inputs[i])
    print(torch.equal(pred,test_outputs[i]['input_ids']))

False
False
False
False
False
False
False
False
False
False
