In [23]:
%pip install transformers torch accelerate datasets evaluate numpy pandas scikit-learn -r requirements.txt 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0mNote: you may need to restart the kernel to use updated packages.


In [17]:
isCuda = False

### Run this if using the gym NVIDIA GPU

In [24]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
isCuda = True
isTraining = True

In [18]:
import pandas as pd
import cleaning_tool as ct

# Load your dataset
df = pd.read_csv('training_set.csv')
df = df.dropna()
df = ct.filter_single_users(dataframe=df, min_pull=5)


In [20]:
from sklearn.preprocessing import LabelEncoder

# Encode the assignee names
label_encoder = LabelEncoder()
df['assignee_encoded'] = label_encoder.fit_transform(df['assignee'])

df['input_text'] = "<#TITLE-START#> " + df['title'] + " <#TITLE-END#> <#BODY-START#> " + df['body'] + " <#BODY-END#>"


# Split into input features (titles) and labels (encoded assignees)
titles = df['input_text'].tolist()

trainingSet = df[df['number'] < 185000]['input_text'].tolist()
evaluationSet = df[ (185000 <= df['number']) & (df['number']< 210000)]['input_text'].tolist()
testSet = df[(210000 <= df['number']) & (df['number']< 220000)]['input_text'].tolist()

labels = df['assignee_encoded'].tolist()

trainingLabels = df[df['number'] < 185000]['assignee_encoded'].tolist()
evaluationLabels = df[ (185000 <= df['number']) & (df['number']< 210000)]['assignee_encoded'].tolist()
testLabels = df[(210000 <= df['number']) & (df['number']< 220000)]['assignee_encoded'].tolist()

print('trainingSet lenght:', len(trainingSet))
print('evaluationSet lenght:', len(evaluationSet))
print('testSet length:', len(testSet))


trainingSet lenght: 110305
evaluationSet lenght: 11926
testSet length: 2850


In [21]:
from transformers import AutoTokenizer

# Choose a model, e.g., 'distilbert-base-uncased'
# model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the input titles
# inputs = tokenizer(titles, padding=True, truncation=True, return_tensors='pt', max_length=128)
# inputs = tokenizer(titles, padding=True, truncation=True, return_tensors='pt', max_length=128).to("cuda")




In [22]:
from datasets import Dataset

def makeDataset(current, labels, isCuda):
    if isCuda:
        inputs = tokenizer(current, padding=True, truncation=True, return_tensors='pt', max_length=128).to("cuda")
    else:
        inputs = tokenizer(current, padding=True, truncation=True, return_tensors='pt', max_length=128)
    
    return Dataset.from_dict({
    'input_ids': inputs['input_ids'],
    'attention_mask': inputs['attention_mask'],
    'labels': labels
})

trainingSet = makeDataset(trainingSet, trainingLabels, isCuda)
evaluationSet =  makeDataset(evaluationSet, evaluationLabels, isCuda)
testSet =  makeDataset(testSet, testLabels, isCuda)


In [23]:
import evaluate
import numpy as np

# Load metric functions
accuracy_metric = evaluate.load('accuracy')
precision_metric = evaluate.load('precision')
recall_metric = evaluate.load('recall')
f1_metric = evaluate.load('f1')

# Define compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='weighted', zero_division=0)
    recall = recall_metric.compute(predictions=predictions, references=labels, average='weighted')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
    
    return {
        'accuracy': accuracy['accuracy'],
        'precision': precision['precision'],
        'recall': recall['recall'],
        'f1': f1['f1'],
    }
    

### Run this to train a new model

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

# Define training arguments with early stopping and final model saving
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',  # Save at the end of each epoch
    save_total_limit=1,  # Keep only the most recent model
    load_best_model_at_end=True,  # Automatically load the best model at the end
    metric_for_best_model='eval_loss',  # Use validation loss to select the best model
    greater_is_better=False,  # Lower loss is better
)

# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainingSet,
    eval_dataset=evaluationSet,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # Early stopping with patience
)

# Train the model
trainer.train()



### Evaluate the produced model

In [None]:
trainer.evaluate(testSet)


### Reload model from file

In [24]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import json

model = AutoModelForSequenceClassification.from_pretrained('./results/checkpoint-14102024')
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
with open("labels_json.json") as f:
    labelsFromJson = json.load(f)
isTraining = False

### Evaluate loaded model

In [25]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',  # Save at the end of each epoch
    save_total_limit=1,  # Keep only the most recent model
    load_best_model_at_end=True,  # Automatically load the best model at the end
    metric_for_best_model='eval_loss',  # Use validation loss to select the best model
    greater_is_better=False,  # Lower loss is better
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainingSet,
    eval_dataset=evaluationSet,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],  # Early stopping with patience
)

trainer.evaluate(testSet)



  0%|          | 0/45 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 2.25747013092041,
 'eval_model_preparation_time': 0.0007,
 'eval_accuracy': 0.5126315789473684,
 'eval_precision': 0.5207645832636719,
 'eval_recall': 0.5126315789473684,
 'eval_f1': 0.4967896014119016,
 'eval_runtime': 9.6767,
 'eval_samples_per_second': 294.522,
 'eval_steps_per_second': 4.65}

## CUDA

In [32]:
import torch
from scipy.special import softmax

# Check if CUDA is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model to the correct device
model.to(device)

def predict_assignee(title, body):
    # Concatenate title and body
    combined_input = title + " " + body
    
    # Tokenize the input
    inputs = tokenizer(combined_input, return_tensors='pt', padding=True, truncation=True, max_length=128)
    
    # Move input tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Get model output
    outputs = model(**inputs)
    
    # Get logits
    logits = outputs.logits.detach().cpu().numpy()[0]
    
    # Get probabilities using softmax
    probabilities = softmax(logits)
    
    # Create a list of (assignee, probability) pairs
    if isTraining:
        assignee_probs = list(zip(label_encoder.classes_, probabilities))
    else:
        assignee_probs = list(zip(labelsFromJson, probabilities))
    
    # Sort by probability in descending order
    ranked_assignees = sorted(assignee_probs, key=lambda x: x[1], reverse=True)
    
    return ranked_assignees

# Example prediction with both title and body
ranked_list = predict_assignee("Side bar always showing when opening VS Code", "When opening VS Code, the sidebar is always visible")

for index, (assignee, probability) in enumerate(ranked_list):
    print(f"{index + 1}: {assignee}: {probability:.2f}")


1: sbatten: 0.75
2: bpasero: 0.15
3: deepak1556: 0.04
4: isidorn: 0.02
5: joaomoreno: 0.01
6: miguelsolorio: 0.01
7: jrieken: 0.00
8: sandy081: 0.00
9: Tyriar: 0.00
10: daviddossett: 0.00
11: lramos15: 0.00
12: eamodio: 0.00
13: mjbvz: 0.00
14: alexr00: 0.00
15: JacksonKearl: 0.00
16: aeschli: 0.00
17: roblourens: 0.00
18: stevencl: 0.00
19: chrmarti: 0.00
20: TylerLeonhardt: 0.00
21: connor4312: 0.00
22: meganrogge: 0.00
23: rzhao271: 0.00
24: alexdima: 0.00
25: lszomoru: 0.00
26: joyceerhl: 0.00
27: rebornix: 0.00
28: egamma: 0.00
29: cleidigh: 0.00
30: chrisdias: 0.00
31: RMacfarlane: 0.00
32: bhavyaus: 0.00
33: bgashler1: 0.00
34: hediet: 0.00
35: weinand: 0.00
36: dbaeumer: 0.00
37: benibenj: 0.00
38: octref: 0.00
39: michelkaporin: 0.00
40: andreamah: 0.00
41: justschen: 0.00
42: seanmcbreen: 0.00
43: tanhakabir: 0.00
44: ramya-rao-a: 0.00
45: ulugbekna: 0.00
46: kieferrm: 0.00
47: DonJayamanne: 0.00
48: gregvanl: 0.00
49: digitarald: 0.00
50: aiday-mar: 0.00
51: danyeh: 0.00
52:

## MACOS

In [3]:
import torch
from scipy.special import softmax

# Check if MPS is available
device = torch.device('mps') if torch.has_mps else torch.device('cpu')

# Move the model to the correct device
model.to(device)

# def predict_assignee(title):
#     # Tokenize the input
#     inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=128)

#     # Move input tensors to the correct device
#     inputs = {key: value.to(device) for key, value in inputs.items()}

#     # Get model output
#     outputs = model(**inputs)

#     # Get the predicted class
#     predicted_class = outputs.logits.argmax(dim=1).item()

#     # Return the assignee name
#     return label_encoder.inverse_transform([predicted_class])[0]

# Example prediction
# print(predict_assignee("Change the name"))

def predict_assignee(title, body):
    # Concatenate title and body
    combined_input = title + " " + body
    
    # Tokenize the input
    inputs = tokenizer(combined_input, return_tensors='pt', padding=True, truncation=True, max_length=128)
    
    # Move input tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Get model output
    outputs = model(**inputs)
    
    # Get logits
    logits = outputs.logits.detach().cpu().numpy()[0]
    
    # Get probabilities using softmax
    probabilities = softmax(logits)
    
    # Create a list of (assignee, probability) pairs
    if isTraining:
        assignee_probs = list(zip(label_encoder.classes_, probabilities))
    else:
        assignee_probs = list(zip(labelsFromJson, probabilities))
    
    # Sort by probability in descending order
    ranked_assignees = sorted(assignee_probs, key=lambda x: x[1], reverse=True)
    
    return ranked_assignees

# Example prediction with both title and body
ranked_list = predict_assignee("Side bar always showing when opening VS Code", "When opening VS Code, the sidebar is always visible")

for index, (assignee, probability) in enumerate(ranked_list):
    print(f"{index + 1}: {assignee}: {probability:.2f}")



  device = torch.device('mps') if torch.has_mps else torch.device('cpu')


NameError: name 'label_encoder' is not defined