In [4]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertModel,BertPreTrainedModel,BertTokenizer,AdamW, get_linear_schedule_with_warmup
import numpy as np
import time
import os
import torch.nn as nn
global EPOCHS, BATCH_SIZE_RATIO, SEQUENCE_LEN, LEARNING_RATE, TOKENIZER, MODEL_NAME
import torch.nn.functional as F
from openpyxl import load_workbook
from sklearn.preprocessing import MinMaxScaler


In [7]:
projectnum = 2 # serve apenas pra escrever em open.xlsx que tem os resultados comparativos entre projetos
ROW_MAE, ROW_MMRE, ROW_PRED = 3,4,5

RESULT_FILE_PATH = r'./open.xlsx'



wb = load_workbook(RESULT_FILE_PATH)
sheet = wb.active
EPOCHS = 20
BATCH_SIZE_RATIO = 0.3
SEQUENCE_LEN = 20
LEARNING_RATE = 5e-4
# define device
global DEVICE
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# define files to be used
DATA_FILE = './open/issues_sp_filtered_with_comments.csv'  



# projetos da nova base
TRAIN_PROJECTS = [4, 12, 43, 36] 


OUTPUT = ''
MODEL = None
DYNAMIC_BATCH = True
BATCH_SIZE = None
WITHIN_PROJECT = None
MAE_RECORDS = []
MDAE_RECORDS = []

def data_processing(project_data):
    global BATCH_SIZE, BATCH_SIZE_RATIO, WITHIN_PROJECT, DYNAMIC_BATCH

    
    # Garantir que os dados não estejam vazios
    if project_data.empty:
        print("Nenhum dado encontrado para o projeto.")
        return None, None, None


    train_data = project_data.copy()  

        
    # data split
    if WITHIN_PROJECT:
        train_ex,train_text,train_labels,val_ex,val_text, val_labels, test_ex,test_text, test_labels = within_project_split(train_data)
    # define batch size dynamicalloutputsy based on training length
    if DYNAMIC_BATCH:
        BATCH_SIZE = int(len(train_text) * BATCH_SIZE_RATIO)
    # tokenization
    tokens_train = tokenization(train_text.tolist())
    tokens_val = tokenization(val_text.tolist())
 
    train_seq = torch.tensor(tokens_train['input_ids'])
    train_ex = np.array(train_ex)
    train_ex = torch.tensor(train_ex)
    train_y = torch.tensor(train_labels.tolist()).type(torch.FloatTensor)
    train_seq = torch.cat((train_ex,train_seq),dim=1)
    train_dataloader = prepare_dataloader(train_seq, train_y, sampler_type='random')

    val_seq = torch.tensor(tokens_val['input_ids'])
    val_ex = np.array(val_ex)
    val_ex = torch.tensor(val_ex)
    val_y = torch.tensor(val_labels.tolist()).type(torch.FloatTensor)
    val_seq = torch.cat((val_ex,val_seq),dim=1)
    val_dataloader = prepare_dataloader(val_seq, val_y, sampler_type='sequential')
    
    # prepare testing datasets
    all_test_dataloader = []
    # test_file_names = []
    if WITHIN_PROJECT:
        tokens_test = tokenization(test_text.tolist())
        test_seq = torch.tensor(tokens_test['input_ids'])
        test_ex = np.array(test_ex)
        test_ex = torch.tensor(test_ex)
        test_seq = torch.cat((test_ex,test_seq),dim=1)
        test_y = torch.tensor(test_labels.tolist()).type(torch.FloatTensor)
        test_dataloader = prepare_dataloader(test_seq, test_y, sampler_type='sequential')
        all_test_dataloader.append(test_dataloader)
        # test_file_names.append(file_pair)
        return train_dataloader, val_dataloader, all_test_dataloader


def tokenization(text_list):
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    return tokenizer(text_list, truncation=True, max_length=SEQUENCE_LEN, padding='max_length')


def prepare_dataframe(file_name):
    data = pd.read_csv(file_name)
    # order=['Assignee_count','Reporter_count','Creator_count','Summary','Custom field (Story Points)']
    order = ['Project_ID','Reputation_t', 'Title', 'Story_Point']

    data=data[order]
    data = data.fillna(0)
    return pd.DataFrame(data=data)


def prepare_dataloader(seq, y, sampler_type):
    global BATCH_SIZE
    tensor_dataset = TensorDataset(seq, y)
    if sampler_type == 'random':
        sampler = RandomSampler(tensor_dataset)
    elif sampler_type == 'sequential':
        sampler = SequentialSampler(tensor_dataset)
    dataloader = DataLoader(tensor_dataset, sampler=sampler, batch_size=BATCH_SIZE)
    return dataloader


def within_project_split(data):
    print('within project split!')
    train_val_split_point = int(len(data) * 0.6)
    val_test_split_point = int(len(data) * 0.8)
    train_ex=data.iloc[:train_val_split_point,0:1]
    train_text = data['Title'][:train_val_split_point]
    train_labels = (data['Story_Point'][:train_val_split_point])
    val_ex=data.iloc[train_val_split_point:val_test_split_point,0:1]
    val_text = data['Title'][train_val_split_point:val_test_split_point]
    val_labels = (data['Story_Point'][train_val_split_point:val_test_split_point])
    test_ex=data.iloc[val_test_split_point:,0:1]
    test_text = data['Title'][val_test_split_point:]
    test_labels = (data['Story_Point'][val_test_split_point:])
    return train_ex,train_text,train_labels,val_ex,val_text, val_labels, test_ex,test_text, test_labels

class BertForSequence(nn.Module):
    def __init__(self):
        super(BertForSequence, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-cased")
        for name, param in self.bert.named_parameters():
            param.requires_grad = False
        self.hidden1=nn.Linear(768, 3)
        self.hidden2=nn.Linear(4,50)
        self.score = nn.Linear(50, 1)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        outputs_bert = self.bert(input_ids[:,1:].long(), token_type_ids, attention_mask)
        outputs = outputs_bert.last_hidden_state[:,0,:]
        outputs=self.hidden1(outputs)
        outputs=torch.cat((input_ids[:,0:1],outputs),dim=1)
        outputs=torch.relu(self.hidden2(outputs.float()))
        logit = self.score(outputs)
        return logit

def train_eval_test(project_id, train_dataloader, val_dataloader, all_test_dataloader, model):
    global LEARNING_RATE, EPOCHS, MAE_RECORDS, MDAE_RECORDS, DEVICE
    optimizer = AdamW(MODEL.parameters(), lr=LEARNING_RATE)    
    # Total number of training steps is [number of batches] x [number of epochs]
    total_steps = len(train_dataloader) * EPOCHS
    # Create the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    print(f"Start training for Project {project_id}...")
    training_start_time = time.time()
    

    min_eval_loss_epoch = [10000, 0]
    
    time_records = []
    MAE_RECORDS = []
    MDAE_RECORDS = []
    MMRE_RECORDS = []
    PRED_RECPRDS=[]
    start_time = time.time()
    loss_fct = nn.L1Loss()
    for e in range(EPOCHS):
        # ---TRAINING---
        # clean GPU memory
        torch.cuda.empty_cache()
        print(">>> epoch ", e)
        # set model into train mode
        model.train()
        total_train_loss = 0
        for step, batch in enumerate(train_dataloader):            
            b_input_ids = batch[0].to(DEVICE)
            b_labels = batch[1].to(DEVICE)
            model.zero_grad()
            result = model(b_input_ids, 
                           labels=b_labels,
                           )
            # loss = loss_fct(result,b_labels)
            loss = loss_fct(result.view(-1), b_labels)

            logits=result
            total_train_loss += loss.item()  
            loss.backward() 
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            # clean memory
            del step, batch, b_input_ids, b_labels, result, loss, logits

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(" Average training MAE loss: {0:.2f}".format(avg_train_loss))
        # clean memory
        del avg_train_loss, total_train_loss
        
        time_records.append(time.time() - start_time)
        
        # ---EVAL---
        print("-")
        # set model into eval mode
        model.eval()
        total_eval_loss = 0
        for batch in val_dataloader:            
            b_input_ids = batch[0].to(DEVICE)
            b_labels = batch[1].to(DEVICE)
            model.zero_grad()
            result = model(b_input_ids, 
                           labels=b_labels,
                           )
            # loss = loss_fct(result,b_labels)
            loss = loss_fct(result.view(-1), b_labels)

            logits = result
            total_eval_loss += loss.item()  
            # clean memory
            del b_input_ids, b_labels, batch, result, loss, logits
        avg_eval_loss = total_eval_loss / len(val_dataloader)
        print(" Average eval MAE loss: {0:.2f}".format(avg_eval_loss))
        
        if avg_eval_loss <= min_eval_loss_epoch[0]:
            min_eval_loss_epoch[0] = avg_eval_loss
            min_eval_loss_epoch[1] = e
        
        # clean memory
        del avg_eval_loss, total_eval_loss
        # save model state to dict
        
        print("===============================")
        
        # testing on holdout data
        # index = 0
        for test_dataloader in all_test_dataloader:
            # test_file_name = test_file_names[index]
            # index += 1
            testing_start_time = time.time()
            predictions = []
            true_labels = []
            for batch in test_dataloader:
                batch = tuple(t.to(DEVICE) for t in batch)
                b_input_ids, b_labels = batch
                with torch.no_grad():
                    logits = model(b_input_ids)
                logits = logits.detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
                predictions.append(logits)
                true_labels.append(label_ids)
            # calculate errors
            total_distance = 0
            total_mre = 0
            m=0
            distance_records = []
            total_data_point=0
            for i in range(len(predictions)):
                total_data_point+=len(predictions[i])
            for i in range(len(predictions)):
                for j in range(len(predictions[i])):
                    distance = abs(predictions[i][j] - true_labels[i][j])
                    if(true_labels[i][j]>0):
                        mre=abs(predictions[i][j] - true_labels[i][j])/true_labels[i][j]
                    else:
                        mre=(abs(predictions[i][j] - true_labels[i][j])+1)/(true_labels[i][j]+1)
                    if mre<0.5:
                        m+=1
                    total_mre+=mre
                    total_distance += distance
                    distance_records.append(distance)
            MAE = total_distance / total_data_point
            MMRE= total_mre / total_data_point
            MdAE = np.median(np.array(distance_records)) 
            PRED=m/total_data_point
            MAE_RECORDS.append(MAE)
            MDAE_RECORDS.append(MdAE)
            MMRE_RECORDS.append(MMRE)
            PRED_RECPRDS.append(PRED)
            
            global OUTPUT
            OUTPUT +=  'Epochs ' + str(e) + '\n'
            OUTPUT += 'MAE: ' + str(MAE) + '\n'
            OUTPUT += 'MdAE: ' + str(MdAE) + '\n'
            OUTPUT += 'MMRE: ' + str(MMRE) + '\n'
            OUTPUT += 'PRED: ' + str(PRED) + '\n\n'
            print('MAE: ', MAE)
            print('MdAE: ', MdAE)
            print('MMRE: ', MMRE)
            print('PRED: ', PRED)
    
            
    OUTPUT +=str(MAE_RECORDS[min_eval_loss_epoch[1]])  + '\n'+str(MMRE_RECORDS[min_eval_loss_epoch[1]]) + '\n'+ str(PRED_RECPRDS[min_eval_loss_epoch[1]]) + '\n'
    OUTPUT += 'training time: ' + str(time_records[min_eval_loss_epoch[1]]) + '\n'
    OUTPUT += 'Epochs: ' + str(min_eval_loss_epoch[1]) +'\n'
    global BATCH_SIZE
    OUTPUT += 'batch size: ' + str(BATCH_SIZE)
    print('all done for one project')
    sheet.cell(row=ROW_MAE, column=projectnum).value = MAE_RECORDS[min_eval_loss_epoch[1]][0]
    sheet.cell(row=ROW_MMRE, column=projectnum).value = MMRE_RECORDS[min_eval_loss_epoch[1]][0]
    sheet.cell(row=ROW_PRED, column=projectnum).value = PRED_RECPRDS[min_eval_loss_epoch[1]]
    wb.save(RESULT_FILE_PATH)

WITHIN_PROJECT = True


def main():
    global  MODEL, TOKENIZER, MODEL_NAME
  
    # Carregar os dados 
    df = prepare_dataframe(DATA_FILE)
    
    for project_id in TRAIN_PROJECTS:
        project_data = df[df['Project_ID'] == project_id]  # Filtrar dados do projeto

        if project_data.empty:
            print(f"Projeto {project_id} não possui dados suficientes. Pulando...")
            continue

        MODEL = BertForSequence().to(DEVICE)

        train_dataloader, val_dataloader, all_test_dataloader = data_processing(project_data.drop(columns=['Project_ID']))

        train_eval_test(project_id, train_dataloader, val_dataloader, all_test_dataloader, MODEL)

        del MODEL
        torch.cuda.empty_cache()   

        global OUTPUT
        with open('./result_bert/project_' + str(project_id) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""
        global projectnum
        projectnum=projectnum+1

                
if __name__ == "__main__":
    main()

within project split!




Start training for Project 4...
>>> epoch  0
 Average training MAE loss: 2.69
-
 Average eval MAE loss: 1.95
MAE:  [2.381991]
MdAE:  1.8187292
MMRE:  [0.8229866]
PRED:  0.0
>>> epoch  1
 Average training MAE loss: 2.60
-
 Average eval MAE loss: 1.82
MAE:  [2.2486167]
MdAE:  1.6744025
MMRE:  [0.7507763]
PRED:  0.109375
>>> epoch  2
 Average training MAE loss: 2.48
-
 Average eval MAE loss: 1.64
MAE:  [2.0821242]
MdAE:  1.4803218
MMRE:  [0.6606339]
PRED:  0.271875
>>> epoch  3
 Average training MAE loss: 2.29
-
 Average eval MAE loss: 1.46
MAE:  [1.9000305]
MdAE:  1.2934813
MMRE:  [0.56200194]
PRED:  0.29375
>>> epoch  4
 Average training MAE loss: 2.12
-
 Average eval MAE loss: 1.28
MAE:  [1.7287061]
MdAE:  1.1085978
MMRE:  [0.4761893]
PRED:  0.45
>>> epoch  5
 Average training MAE loss: 2.01
-
 Average eval MAE loss: 1.17
MAE:  [1.6310387]
MdAE:  0.93418944
MMRE:  [0.46537656]
PRED:  0.525
>>> epoch  6
 Average training MAE loss: 1.88
-
 Average eval MAE loss: 1.13
MAE:  [1.5532705]
Md



Start training for Project 12...
>>> epoch  0
 Average training MAE loss: 4.86
-
 Average eval MAE loss: 6.81
MAE:  [6.043938]
MdAE:  4.8430643
MMRE:  [0.9596383]
PRED:  0.0
>>> epoch  1
 Average training MAE loss: 4.63
-
 Average eval MAE loss: 6.71
MAE:  [5.9479427]
MdAE:  4.7323604
MMRE:  [0.93051434]
PRED:  0.003738317757009346
>>> epoch  2
 Average training MAE loss: 4.63
-
 Average eval MAE loss: 6.59
MAE:  [5.8457665]
MdAE:  4.617207
MMRE:  [0.8993338]
PRED:  0.026168224299065422
>>> epoch  3
 Average training MAE loss: 4.44
-
 Average eval MAE loss: 6.48
MAE:  [5.741067]
MdAE:  4.5003805
MMRE:  [0.8673422]
PRED:  0.03551401869158879
>>> epoch  4
 Average training MAE loss: 4.38
-
 Average eval MAE loss: 6.36
MAE:  [5.634415]
MdAE:  4.383033
MMRE:  [0.83834463]
PRED:  0.09532710280373832
>>> epoch  5
 Average training MAE loss: 4.26
-
 Average eval MAE loss: 6.24
MAE:  [5.5295634]
MdAE:  4.2625732
MMRE:  [0.81627303]
PRED:  0.09906542056074766
>>> epoch  6
 Average training MAE 



Start training for Project 43...
>>> epoch  0
 Average training MAE loss: 1.85
-
 Average eval MAE loss: 2.04
MAE:  [1.720751]
MdAE:  1.1610298
MMRE:  [1.113127]
PRED:  0.0
>>> epoch  1
 Average training MAE loss: 1.77
-
 Average eval MAE loss: 1.99
MAE:  [1.6713607]
MdAE:  1.1078936
MMRE:  [1.0746052]
PRED:  0.0
>>> epoch  2
 Average training MAE loss: 1.70
-
 Average eval MAE loss: 1.94
MAE:  [1.6179519]
MdAE:  1.050791
MMRE:  [1.0327698]
PRED:  0.0
>>> epoch  3
 Average training MAE loss: 1.68
-
 Average eval MAE loss: 1.90
MAE:  [1.5651531]
MdAE:  1.0012418
MMRE:  [0.99164796]
PRED:  0.0
>>> epoch  4
 Average training MAE loss: 1.62
-
 Average eval MAE loss: 1.85
MAE:  [1.5212239]
MdAE:  0.95731026
MMRE:  [0.9576415]
PRED:  0.0
>>> epoch  5
 Average training MAE loss: 1.60
-
 Average eval MAE loss: 1.81
MAE:  [1.4758015]
MdAE:  0.9118146
MMRE:  [0.92233163]
PRED:  0.0
>>> epoch  6
 Average training MAE loss: 1.56
-
 Average eval MAE loss: 1.77
MAE:  [1.4307456]
MdAE:  0.87223244
MM



Start training for Project 36...
>>> epoch  0
 Average training MAE loss: 4.21
-
 Average eval MAE loss: 3.68
MAE:  [4.437868]
MdAE:  3.057982
MMRE:  [1.0064965]
PRED:  0.0
>>> epoch  1
 Average training MAE loss: 4.15
-
 Average eval MAE loss: 3.66
MAE:  [4.411981]
MdAE:  3.0503898
MMRE:  [0.9982751]
PRED:  0.0
>>> epoch  2
 Average training MAE loss: 4.03
-
 Average eval MAE loss: 3.64
MAE:  [4.387118]
MdAE:  3.0242815
MMRE:  [0.99026084]
PRED:  0.0
>>> epoch  3
 Average training MAE loss: 4.12
-
 Average eval MAE loss: 3.63
MAE:  [4.3649774]
MdAE:  2.9994357
MMRE:  [0.98299366]
PRED:  0.0
>>> epoch  4
 Average training MAE loss: 4.04
-
 Average eval MAE loss: 3.61
MAE:  [4.3426147]
MdAE:  2.978831
MMRE:  [0.97596717]
PRED:  0.0
>>> epoch  5
 Average training MAE loss: 4.09
-
 Average eval MAE loss: 3.60
MAE:  [4.3194556]
MdAE:  2.9601655
MMRE:  [0.9686869]
PRED:  0.0
>>> epoch  6
 Average training MAE loss: 4.01
-
 Average eval MAE loss: 3.58
MAE:  [4.2982674]
MdAE:  2.9396477
MMRE: