In [1]:
%pip install transformers torch accelerate datasets evaluate numpy pandas scikit-learn

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [12]:
import pandas as pd

# Load your dataset
df = pd.read_csv('cleaned_output2.csv')
df = df.dropna()
print(df.head())


   number                                              title  assignee  \
0       3                       omnisharp includ linux build   jrieken   
1       4                  c# bracket insertion, indent work  alexdima   
2       5                          welcome.md packag wrongli   isidorn   
3       6  window - omnisharp provid intellisens strong n...   jrieken   
4       8                      flash open workspaces, reload   jrieken   

                                                body  
0  our linux build machin includ csharp-o/**bin**...  
1            type ( expect => close ) actual => noth  
2  vscode > help > show welcom > work if i look c...  
3  upgrad rc1 dnx/runtim ``` bash git clone https...  
4  start code set dark theme _other default dark ...  


In [13]:
from sklearn.preprocessing import LabelEncoder

# Encode the assignee names
label_encoder = LabelEncoder()
df['assignee_encoded'] = label_encoder.fit_transform(df['assignee'])

# Split into input features (titles) and labels (encoded assignees)
titles = df['title'].tolist()
labels = df['assignee_encoded'].tolist()

In [8]:
titles

['omnisharp includ linux build',
 'c# bracket insertion, indent work',
 'welcome.md packag wrongli',
 'window - omnisharp provid intellisens strong name assembl',
 'flash open workspaces, reload',
 'extens development: null undefin valu print debug repl',
 "cannot find modul 'sinon' run test/run.sh termin",
 '[clojure] block comment color differ',
 'bad color kimbi dark theme languag',
 'c#: line/block comment work',
 'css: toggl line comment preserv',
 '[f#] block comment color comment',
 'html: not auto-clos quot type attribut',
 'python: block comment three singl quot two',
 'php: extens console.log complet function',
 '[ruby] mode fail token input',
 '[vb] block comment color',
 'develop environ improv (code.bat/code.sh)',
 '[xml] element attribut light dark theme use color',
 'typescript: error: promis implement oncancel(…)',
 'https://twitter.com/cod use visual studio icon confus',
 'no blue icon linux',
 'send smile > "submit bug" open github repo',
 'variabl local window stop s

In [14]:
from transformers import AutoTokenizer

# Choose a model, e.g., 'distilbert-base-uncased'
# model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the input titles
inputs = tokenizer(titles, padding=True, truncation=True, return_tensors='pt', max_length=128).to("cuda")




In [15]:
from datasets import Dataset

# Create a Hugging Face dataset
dataset = Dataset.from_dict({
    'input_ids': inputs['input_ids'],
    'attention_mask': inputs['attention_mask'],
    'labels': labels
})

# Split the dataset into training and validation sets
train_test = dataset.train_test_split(test_size=0.2)
train_dataset = train_test['train']
test_dataset = train_test['test']


In [16]:
import evaluate
import numpy as np

# Load metric functions
accuracy_metric = evaluate.load('accuracy')
precision_metric = evaluate.load('precision')
recall_metric = evaluate.load('recall')
f1_metric = evaluate.load('f1')

# Define compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='weighted')
    recall = recall_metric.compute(predictions=predictions, references=labels, average='weighted')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
    
    return {
        'accuracy': accuracy['accuracy'],
        'precision': precision['precision'],
        'recall': recall['recall'],
        'f1': f1['f1'],
    }


In [17]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9898,1.960003,0.478356,0.455346,0.478356,0.453225
2,1.7799,1.866324,0.50014,0.476545,0.50014,0.47306
3,1.6602,1.844463,0.507974,0.491152,0.507974,0.486037


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=18765, training_loss=1.9265706819310178, metrics={'train_runtime': 1476.1227, 'train_samples_per_second': 203.39, 'train_steps_per_second': 12.712, 'total_flos': 9961223327695872.0, 'train_loss': 1.9265706819310178, 'epoch': 3.0})

In [18]:
trainer.evaluate()


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.8444628715515137,
 'eval_accuracy': 0.5079739398057477,
 'eval_precision': 0.49115201931887786,
 'eval_recall': 0.5079739398057477,
 'eval_f1': 0.48603724957545463,
 'eval_runtime': 33.0092,
 'eval_samples_per_second': 757.94,
 'eval_steps_per_second': 11.845,
 'epoch': 3.0}

## CUDA

In [20]:
def predict_assignee(title):
    inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=128).to('cuda')
    outputs = model(**inputs)
    predicted_class = outputs.logits.argmax(dim=1).item()
    return label_encoder.inverse_transform([predicted_class])[0]

# Example prediction
print(predict_assignee("Side bar always showing when opening VS Code"))


sbatten


## MACOS

In [23]:
import torch

# Check if MPS is available
device = torch.device('mps') if torch.has_mps else torch.device('cpu')

# Move the model to the correct device
model.to(device)

def predict_assignee(title):
    # Tokenize the input
    inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=128)

    # Move input tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get model output
    outputs = model(**inputs)

    # Get the predicted class
    predicted_class = outputs.logits.argmax(dim=1).item()

    # Return the assignee name
    return label_encoder.inverse_transform([predicted_class])[0]

# Example prediction
print(predict_assignee("Change the name"))


aeschli


  device = torch.device('mps') if torch.has_mps else torch.device('cpu')
