In [9]:
%pip install transformers torch accelerate datasets evaluate numpy pandas scikit-learn -r requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0mNote: you may need to restart the kernel to use updated packages.


In [21]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
isTraining = True

In [4]:
import pandas as pd
import cleaning_tool as ct

# Load your dataset
df = pd.read_csv('cleaned_output2.csv')
df = df.dropna()
df = ct.filter_single_users(dataframe=df, min_pull=5)
print(df.head())
df


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/SA24-G2/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/SA24-G2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   number                                              title  assignee  \
0       3                       omnisharp includ linux build   jrieken   
1       4                  c# bracket insertion, indent work  alexdima   
2       5                          welcome.md packag wrongli   isidorn   
3       6  window - omnisharp provid intellisens strong n...   jrieken   
4       8                      flash open workspaces, reload   jrieken   

                                                body  
0  our linux build machin includ csharp-o/**bin**...  
1            type ( expect => close ) actual => noth  
2  vscode > help > show welcom > work if i look c...  
3  upgrad rc1 dnx/runtim ``` bash git clone https...  
4  start code set dark theme _other default dark ...  


Unnamed: 0,number,title,assignee,body
0,3,omnisharp includ linux build,jrieken,our linux build machin includ csharp-o/**bin**...
1,4,"c# bracket insertion, indent work",alexdima,type ( expect => close ) actual => noth
2,5,welcome.md packag wrongli,isidorn,vscode > help > show welcom > work if i look c...
3,6,window - omnisharp provid intellisens strong n...,jrieken,upgrad rc1 dnx/runtim ``` bash git clone https...
4,8,"flash open workspaces, reload",jrieken,start code set dark theme _other default dark ...
...,...,...,...,...
126193,219988,featur request: disabl compact folder sourc co...,lszomoru,<!-- do not delet this! feature_request_templ...
126194,219990,intellisens work front matter astro.j project,mjbvz,type: <b>bug</b> previous i use version 1.90 a...
126195,219992,vs code launch,joaomoreno,<!-- do not delet this! bug_report_templ -->...
126196,219995,issu,justschen,<!-- do not delet this! bug_report_templ -->...


In [5]:
from sklearn.preprocessing import LabelEncoder

# Encode the assignee names
label_encoder = LabelEncoder()
df['assignee_encoded'] = label_encoder.fit_transform(df['assignee'])

df['input_text'] = "<#TITLE-START#> " + df['title'] + " <#TITLE-END#> <#BODY-START#> " + df['body'] + " <#BODY-END#>"


# Split into input features (titles) and labels (encoded assignees)
titles = df['input_text'].tolist()
labels = df['assignee_encoded'].tolist()

In [7]:
titles

['<#TITLE-START#> omnisharp includ linux build <#TITLE-END#> <#BODY-START#> our linux build machin includ csharp-o/**bin** folder. 1. run `scripts/npm.sh install` -> csharp-o/**bin** folder nice get creat linux machine. 2. run `gulp vscode-linux-x64` also nice creat csharp-o/**bin** folder linux machine. someth strang build machin <#BODY-END#>',
 '<#TITLE-START#> c# bracket insertion, indent work <#TITLE-END#> <#BODY-START#> type ( expect => close ) actual => noth <#BODY-END#>',
 '<#TITLE-START#> welcome.md packag wrongli <#TITLE-END#> <#BODY-START#> vscode > help > show welcom > work if i look content vscode packag i see top level resourc folder contain welcome.md. thi look badli place <#BODY-END#>',
 '<#TITLE-START#> window - omnisharp provid intellisens strong name assembl <#TITLE-END#> <#BODY-START#> upgrad rc1 dnx/runtim ``` bash git clone https://github.com/natemcmaster/test-vscode-strong-nam cd test-vscode-strong-nam dnu restor code . ``` pick test project.json result => "intern

In [10]:
from transformers import AutoTokenizer

# Choose a model, e.g., 'distilbert-base-uncased'
# model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the input titles
# inputs = tokenizer(titles, padding=True, truncation=True, return_tensors='pt', max_length=128)
inputs = tokenizer(titles, padding=True, truncation=True, return_tensors='pt', max_length=128).to("cuda")


In [11]:
from datasets import Dataset

# Create a Hugging Face dataset
dataset = Dataset.from_dict({
    'input_ids': inputs['input_ids'],
    'attention_mask': inputs['attention_mask'],
    'labels': labels
})

# Split the dataset into training and validation sets
train_test = dataset.train_test_split(test_size=0.2)
train_dataset = train_test['train']
test_dataset = train_test['test']

125053

In [12]:
import evaluate
import numpy as np

# Load metric functions
accuracy_metric = evaluate.load('accuracy')
precision_metric = evaluate.load('precision')
recall_metric = evaluate.load('recall')
f1_metric = evaluate.load('f1')

# Define compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='weighted', zero_division=0)
    recall = recall_metric.compute(predictions=predictions, references=labels, average='weighted')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
    
    return {
        'accuracy': accuracy['accuracy'],
        'precision': precision['precision'],
        'recall': recall['recall'],
        'f1': f1['f1'],
    }
    

In [13]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

# Define training arguments with early stopping and final model saving
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',  # Save at the end of each epoch
    save_total_limit=1,  # Keep only the most recent model
    load_best_model_at_end=True,  # Automatically load the best model at the end
    metric_for_best_model='eval_loss',  # Use validation loss to select the best model
    greater_is_better=False,  # Lower loss is better
)

# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # Early stopping with patience
)

# Train the model
trainer.train()



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8203,1.746415,0.537563,0.522255,0.537563,0.51478
2,1.5322,1.594392,0.573588,0.567452,0.573588,0.560375
3,1.36,1.557776,0.585582,0.577115,0.585582,0.57206


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=18759, training_loss=1.7063397215820781, metrics={'train_runtime': 1452.9735, 'train_samples_per_second': 206.56, 'train_steps_per_second': 12.911, 'total_flos': 9952698775366656.0, 'train_loss': 1.7063397215820781, 'epoch': 3.0})

In [41]:
trainer.evaluate()


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 1/1 [00:00<00:00, 80.72it/s]


{'eval_loss': 1.9498947858810425,
 'eval_accuracy': 0.2608695652173913,
 'eval_precision': 0.12087912087912088,
 'eval_recall': 0.2608695652173913,
 'eval_f1': 0.15703324808184146,
 'eval_runtime': 0.16,
 'eval_samples_per_second': 143.788,
 'eval_steps_per_second': 6.252,
 'epoch': 3.0}

## Reload model from file

In [22]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import json

model = AutoModelForSequenceClassification.from_pretrained('./results/checkpoint-18759')
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
with open("labels_json.json") as f:
    labelsFromJson = json.load(f)
isTraining = False



## CUDA

In [24]:
import torch
from scipy.special import softmax

# Check if CUDA is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model to the correct device
model.to(device)

def predict_assignee(title, body):
    # Concatenate title and body
    combined_input = title + " " + body
    
    # Tokenize the input
    inputs = tokenizer(combined_input, return_tensors='pt', padding=True, truncation=True, max_length=128)
    
    # Move input tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Get model output
    outputs = model(**inputs)
    
    # Get logits
    logits = outputs.logits.detach().cpu().numpy()[0]
    
    # Get probabilities using softmax
    probabilities = softmax(logits)
    
    # Create a list of (assignee, probability) pairs
    if isTraining:
        assignee_probs = list(zip(label_encoder.classes_, probabilities))
    else:
        assignee_probs = list(zip(labelsFromJson, probabilities))
    
    # Sort by probability in descending order
    ranked_assignees = sorted(assignee_probs, key=lambda x: x[1], reverse=True)
    
    return ranked_assignees

# Example prediction with both title and body
ranked_list = predict_assignee("Side bar always showing when opening VS Code", "When opening VS Code, the sidebar is always visible")

for index, (assignee, probability) in enumerate(ranked_list):
    print(f"{index + 1}: {assignee}: {probability:.2f}")


1: sbatten: 0.69
2: bpasero: 0.14
3: isidorn: 0.05
4: deepak1556: 0.03
5: sandy081: 0.01
6: miguelsolorio: 0.01
7: joaomoreno: 0.01
8: benibenj: 0.00
9: lramos15: 0.00
10: mjbvz: 0.00
11: JacksonKearl: 0.00
12: jrieken: 0.00
13: Tyriar: 0.00
14: alexdima: 0.00
15: roblourens: 0.00
16: chrmarti: 0.00
17: aeschli: 0.00
18: stevencl: 0.00
19: daviddossett: 0.00
20: RMacfarlane: 0.00
21: rebornix: 0.00
22: alexr00: 0.00
23: egamma: 0.00
24: meganrogge: 0.00
25: chrisdias: 0.00
26: joyceerhl: 0.00
27: rzhao271: 0.00
28: connor4312: 0.00
29: eamodio: 0.00
30: weinand: 0.00
31: TylerLeonhardt: 0.00
32: bgashler1: 0.00
33: lszomoru: 0.00
34: justschen: 0.00
35: dbaeumer: 0.00
36: kieferrm: 0.00
37: ramya-rao-a: 0.00
38: octref: 0.00
39: bhavyaus: 0.00
40: cleidigh: 0.00
41: hediet: 0.00
42: ulugbekna: 0.00
43: aiday-mar: 0.00
44: andreamah: 0.00
45: gregvanl: 0.00
46: DonJayamanne: 0.00
47: michelkaporin: 0.00
48: amunger: 0.00
49: tanhakabir: 0.00
50: Yoyokrazy: 0.00
51: digitarald: 0.00
52: 

## MACOS

In [3]:
import torch
from scipy.special import softmax

# Check if MPS is available
device = torch.device('mps') if torch.has_mps else torch.device('cpu')

# Move the model to the correct device
model.to(device)

# def predict_assignee(title):
#     # Tokenize the input
#     inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=128)

#     # Move input tensors to the correct device
#     inputs = {key: value.to(device) for key, value in inputs.items()}

#     # Get model output
#     outputs = model(**inputs)

#     # Get the predicted class
#     predicted_class = outputs.logits.argmax(dim=1).item()

#     # Return the assignee name
#     return label_encoder.inverse_transform([predicted_class])[0]

# Example prediction
# print(predict_assignee("Change the name"))

def predict_assignee(title, body):
    # Concatenate title and body
    combined_input = title + " " + body
    
    # Tokenize the input
    inputs = tokenizer(combined_input, return_tensors='pt', padding=True, truncation=True, max_length=128)
    
    # Move input tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Get model output
    outputs = model(**inputs)
    
    # Get logits
    logits = outputs.logits.detach().cpu().numpy()[0]
    
    # Get probabilities using softmax
    probabilities = softmax(logits)
    
    # Create a list of (assignee, probability) pairs
    if isTraining:
        assignee_probs = list(zip(label_encoder.classes_, probabilities))
    else:
        assignee_probs = list(zip(labelsFromJson, probabilities))
    
    # Sort by probability in descending order
    ranked_assignees = sorted(assignee_probs, key=lambda x: x[1], reverse=True)
    
    return ranked_assignees

# Example prediction with both title and body
ranked_list = predict_assignee("Side bar always showing when opening VS Code", "When opening VS Code, the sidebar is always visible")

for index, (assignee, probability) in enumerate(ranked_list):
    print(f"{index + 1}: {assignee}: {probability:.2f}")



  device = torch.device('mps') if torch.has_mps else torch.device('cpu')


NameError: name 'label_encoder' is not defined