In [6]:
%pip install transformers torch accelerate datasets evaluate numpy pandas scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/4c/1e/a7c7357e704459c7d56a18df4a0bf08669442d1f8878cc0864beccd6306a/scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.6.0 from https://files.pythonhosted.org/packages/47/78/b0c2c23880dd1e99e938ad49ccfb011ae353758a2dc5ed7ee59baff684c3/scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Obtaining dependency information for threadpoolctl

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
import pandas as pd

# Load your dataset
df = pd.read_csv('cleaned_output.csv')
print(df.head())


   number                                              title  assignee  \
0       3                       omnisharp includ linux build   jrieken   
1       4                  c# bracket insertion, indent work  alexdima   
2       5                          welcome.md packag wrongli   isidorn   
3       6  window - omnisharp provid intellisens strong n...   jrieken   
4       8                      flash open workspaces, reload   jrieken   

                                                body  
0  our linux build machin includ csharp-o/**bin**...  
1      type (\n\nexpect => close )\nactual => noth\n  
2  vscode > help > show welcom > work\n\nif i loo...  
3  upgrad rc1 dnx/runtim\n\n```bash\ngit clone ht...  
4  start code\nset dark theme*other than the defa...  


In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode the assignee names
label_encoder = LabelEncoder()
df['assignee_encoded'] = label_encoder.fit_transform(df['assignee'])

# Split into input features (titles) and labels (encoded assignees)
titles = df['title'].tolist()
labels = df['assignee_encoded'].tolist()

In [9]:
from transformers import AutoTokenizer

# Choose a model, e.g., 'distilbert-base-uncased'
# model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the input titles
inputs = tokenizer(titles, padding=True, truncation=True, return_tensors='pt', max_length=128).to("cuda")


In [10]:
from datasets import Dataset

# Create a Hugging Face dataset
dataset = Dataset.from_dict({
    'input_ids': inputs['input_ids'],
    'attention_mask': inputs['attention_mask'],
    'labels': labels
})

# Split the dataset into training and validation sets
train_test = dataset.train_test_split(test_size=0.2)
train_dataset = train_test['train']
test_dataset = train_test['test']


In [11]:
import evaluate
import numpy as np

# Load metric functions
accuracy_metric = evaluate.load('accuracy')
precision_metric = evaluate.load('precision')
recall_metric = evaluate.load('recall')
f1_metric = evaluate.load('f1')

# Define compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='weighted')
    recall = recall_metric.compute(predictions=predictions, references=labels, average='weighted')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
    
    return {
        'accuracy': accuracy['accuracy'],
        'precision': precision['precision'],
        'recall': recall['recall'],
        'f1': f1['f1'],
    }


Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 8.32MB/s]
Downloading builder script: 100%|██████████| 7.55k/7.55k [00:00<00:00, 13.2MB/s]
Downloading builder script: 100%|██████████| 7.36k/7.36k [00:00<00:00, 7.93MB/s]
Downloading builder script: 100%|██████████| 6.77k/6.77k [00:00<00:00, 13.3MB/s]


In [12]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,2.760644,0.25,0.085714,0.25,0.125687
2,No log,2.708481,0.178571,0.073748,0.178571,0.10084
3,No log,2.688769,0.142857,0.061012,0.142857,0.079699


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=21, training_loss=2.7219850449335006, metrics={'train_runtime': 2.3122, 'train_samples_per_second': 141.425, 'train_steps_per_second': 9.082, 'total_flos': 2877278340564.0, 'train_loss': 2.7219850449335006, 'epoch': 3.0})

In [13]:
trainer.evaluate()


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 2.6887686252593994,
 'eval_accuracy': 0.14285714285714285,
 'eval_precision': 0.06101190476190476,
 'eval_recall': 0.14285714285714285,
 'eval_f1': 0.07969924812030074,
 'eval_runtime': 0.0528,
 'eval_samples_per_second': 530.277,
 'eval_steps_per_second': 18.938,
 'epoch': 3.0}

## CUDA

In [16]:
def predict_assignee(title):
    inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=128).to('cuda')
    outputs = model(**inputs)
    predicted_class = outputs.logits.argmax(dim=1).item()
    return label_encoder.inverse_transform([predicted_class])[0]

# Example prediction
print(predict_assignee("Your issue title here"))


jrieken


## MACOS

In [23]:
import torch

# Check if MPS is available
device = torch.device('mps') if torch.has_mps else torch.device('cpu')

# Move the model to the correct device
model.to(device)

def predict_assignee(title):
    # Tokenize the input
    inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=128)

    # Move input tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get model output
    outputs = model(**inputs)

    # Get the predicted class
    predicted_class = outputs.logits.argmax(dim=1).item()

    # Return the assignee name
    return label_encoder.inverse_transform([predicted_class])[0]

# Example prediction
print(predict_assignee("Change the name"))


aeschli


  device = torch.device('mps') if torch.has_mps else torch.device('cpu')
