In [1]:
import torch 
import transformers 
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device


  from .autonotebook import tqdm as notebook_tqdm


'cuda'

In [2]:
import pandas as pd 

df = pd.read_csv("data/dataprocesed.csv")

# Give every distinct issue type an integer id, numbered in the order in which
# `unique()` returns the values.
labels = {
    issue_type: class_id
    for class_id, issue_type in enumerate(df["Issue Type"].unique())
}

# Overwrite the textual issue type with its integer id.
df["Issue Type"] = df["Issue Type"].apply(lambda issue_type: labels[issue_type])

tokenizer = transformers.BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)




In [3]:
import numpy as np 
MAX_LEN = 128
input_ids = []
chunk_labels = []
train_labels = df["Issue Type"].to_numpy().astype(int)

# Text columns "JuicePath.1" .. "JuicePath.4" are tokenised independently.
juice_columns = ["JuicePath." + str(n) for n in range(1, 5)]
# Room left for real tokens once the leading and trailing special ids are added.
body_len = MAX_LEN - 2

for row in range(len(df)):
    for column in juice_columns:
        text = df[column][row]
        # Non-string cells (e.g. missing values) produce no example.
        if not isinstance(text, str):
            continue

        token_ids = tokenizer.encode(text, add_special_tokens=True)
        row_label = train_labels[row]

        # Sequences that already fit are kept whole.
        if not len(token_ids) > MAX_LEN:
            input_ids.append(token_ids)
            chunk_labels.append(row_label)
            continue

        # Too long: drop the first and last id (presumably the [CLS]/[SEP] that
        # `encode` added), cut the rest into windows of `body_len` ids and wrap
        # each window in its own CLS/SEP pair. Every window inherits the
        # label of the row it came from.
        inner_ids = token_ids[1:-1]
        for start in range(0, len(inner_ids), body_len):
            window = inner_ids[start:start + body_len]
            input_ids.append(
                [tokenizer.cls_token_id] + window + [tokenizer.sep_token_id]
            )
            chunk_labels.append(row_label)

print("DONE!~")
print(f"{len(input_ids)} total logfiles!")



Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


DONE!~
14751 total logfiles!


In [4]:
# Every tokenised chunk must have exactly one label.
assert len(chunk_labels) == len(input_ids)

In [5]:
from keras_preprocessing.sequence import pad_sequences

# Pad (and, if ever needed, truncate) at the end of each sequence so that all
# rows have exactly MAX_LEN ids; the fill value is the tokenizer's pad id.
padding_options = {
    "maxlen": MAX_LEN,
    "dtype": "long",
    "value": tokenizer.pad_token_id,
    "truncating": "post",
    "padding": "post",
}
input_ids = pad_sequences(input_ids, **padding_options)
input_ids.shape

(14751, 128)

In [6]:
# Attention mask: 1 for a real token, 0 for padding.
# The padding cell fills with `tokenizer.pad_token_id`, so the mask is derived
# from that same id instead of assuming that padding is the only id <= 0.
# `.tolist()` keeps the result a list of lists of Python ints, as before.
attention_masks = (
    (np.asarray(input_ids) != tokenizer.pad_token_id).astype(int).tolist()
)


In [7]:
from sklearn.model_selection import train_test_split

# Split ids, attention masks and labels in ONE call so the three stay
# row-aligned by construction, instead of relying on two separate calls that
# happen to share a random_state. This also avoids assigning to `_`, which
# IPython uses for the previous cell output.
# NOTE(review): the split is over chunks, so chunks cut from the same source
# text can land on both sides of the split — consider a grouped split.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids,
    attention_masks,
    chunk_labels,
    random_state=999,
    test_size=0.05,
)

In [8]:
# Convert every split (ids, labels, masks) into a torch tensor, rebinding the
# same names.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = (
    torch.tensor(split)
    for split in (train_inputs, validation_inputs,
                  train_labels, validation_labels,
                  train_masks, validation_masks)
)

In [9]:
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler

batch_size = 32

# Each dataset item is an (input ids, attention mask, label) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(
    train_data, sampler=train_sampler, batch_size=batch_size
)
validation_dataloader = DataLoader(
    validation_data, sampler=validation_sampler, batch_size=batch_size
)

In [10]:
from transformers import BertForSequenceClassification,AdamW,BertConfig

# Pretrained "bert-base-uncased" with a classification head that has one output
# per entry in `labels`; attentions and hidden states are not returned.
# The model is moved to the selected device as soon as it is built.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(labels),
    output_attentions=False,
    output_hidden_states=False,
).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
from transformers import get_linear_schedule_with_warmup

epochs = 10

# NOTE(review): newer transformers releases reportedly deprecate their own
# AdamW in favour of torch.optim.AdamW — confirm against the installed version.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
)

# The scheduler covers every batch of every epoch, with no warm-up steps.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [12]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: array of per-class scores; the predicted class of each row is
            the argmax along axis 1.
        labels: array of true class ids; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [13]:
import time 
import datetime

def format_time(elapsed: float) -> str:
    """Format a duration in seconds as the string form of a timedelta.

    The duration is rounded to whole seconds first, so the result looks like
    "0:01:05".
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [14]:
import random 

seed_val = 24242

# Seed every RNG used from here on (Python, NumPy, PyTorch CPU and CUDA).
# NOTE(review): the model (including its freshly initialised classifier
# weights) and the train/validation split are created in earlier cells, before
# these seeds are set.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch, in epoch order.
loss_values = []

for epoch_i in range(0,epochs):
    # ---------------- training pass ----------------
    print("")
    print(f"======== Epoch {epoch_i+1} / {epochs} ========")
    print('Training...')

    t0 = time.time()
    total_loss = 0
    model.train()
    for step,batch in enumerate(train_dataloader):
        # Progress line every 10 batches (batch 0 is skipped).
        if step%10==0 and not step==0:
            elapsed = format_time(time.time()-t0)
            print(f"Batch {step} of {len(train_dataloader)} . Elapsed : {elapsed}")
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        # The labels are cast to int64 before being passed to the model.
        b_labels = batch[2].long().to(device)

        model.zero_grad()
        # Passing `labels` makes the model return the loss as outputs[0].
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]
        total_loss+=loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)
    print("")
    print(f" Average training Loss : {avg_train_loss:.2f}")
    print(f"Training epoch took : {format_time(time.time()-t0)}")

    # ---------------- validation pass ----------------
    print("")
    print("Validation...")
    t0=time.time()

    model.eval()
    # eval_loss / nb_eval_examples are never updated below; they are kept only
    # so the names still exist for any later cell that reads them.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # FIX: evaluate on the held-out validation batches. The previous code
    # iterated `train_dataloader` here, so the reported "validation" accuracy
    # was actually accuracy on the training data.
    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        # No labels are passed, so outputs[0] is the logits; no_grad skips
        # gradient tracking during evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        # Accuracy is averaged per batch (a smaller final batch counts as
        # much as a full one).
        eval_accuracy += flat_accuracy(logits, label_ids)
        nb_eval_steps += 1
    print(f"Accuracy : {eval_accuracy/nb_eval_steps:.2f}")
    print(f"Validation took : {format_time(time.time()-t0)}")
print("")
print("Training complete!")
                    


Training...
Batch 10 of 438 . Elapsed : 0:00:04
Batch 20 of 438 . Elapsed : 0:00:07
Batch 30 of 438 . Elapsed : 0:00:10
Batch 40 of 438 . Elapsed : 0:00:13
Batch 50 of 438 . Elapsed : 0:00:16
Batch 60 of 438 . Elapsed : 0:00:18
Batch 70 of 438 . Elapsed : 0:00:21
Batch 80 of 438 . Elapsed : 0:00:24
Batch 90 of 438 . Elapsed : 0:00:27
Batch 100 of 438 . Elapsed : 0:00:30
Batch 110 of 438 . Elapsed : 0:00:32
Batch 120 of 438 . Elapsed : 0:00:35
Batch 130 of 438 . Elapsed : 0:00:38
Batch 140 of 438 . Elapsed : 0:00:41
Batch 150 of 438 . Elapsed : 0:00:44
Batch 160 of 438 . Elapsed : 0:00:46
Batch 170 of 438 . Elapsed : 0:00:49
Batch 180 of 438 . Elapsed : 0:00:52
Batch 190 of 438 . Elapsed : 0:00:55
Batch 200 of 438 . Elapsed : 0:00:58
Batch 210 of 438 . Elapsed : 0:01:01
Batch 220 of 438 . Elapsed : 0:01:03
Batch 230 of 438 . Elapsed : 0:01:06
Batch 240 of 438 . Elapsed : 0:01:09
Batch 250 of 438 . Elapsed : 0:01:12
Batch 260 of 438 . Elapsed : 0:01:15
Batch 270 of 438 . Elapsed : 0:01:

In [15]:
import os 

output_dir = "./model_save/"
# exist_ok=True replaces the separate exists() check and keeps the cell safe to
# re-run when the directory is already there.
os.makedirs(output_dir, exist_ok=True)
print(f"Saving to {output_dir}")

# If the model has been wrapped in something that exposes the real model as
# `.module`, save the underlying model rather than the wrapper.
# FIX: these two statements were fused on a single line, which is a SyntaxError.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving to ./model_save/


('./model_save/tokenizer_config.json',
 './model_save/special_tokens_map.json',
 './model_save/vocab.txt',
 './model_save/added_tokens.json')