In [15]:
import torch
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from datasets import Dataset
from transformers import DistilBertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [9]:
# Location of the preprocessed job postings, resolved relative to the notebook's working directory.
DATA_PATH = "preprocessed-upwork-jobds.csv"

df = pd.read_csv(DATA_PATH)
# Preview only the first rows instead of rendering the whole frame.
df.head()

Unnamed: 0,title,description,budget
0,Experienced Media Buyer For Solar Pannel and R...,We’re looking for a talented and hardworking a...,500.0
1,Full Stack Developer,Job Title: Full Stack DeveloperWe are seeking ...,1100.0
2,Data Engineer,We are looking for a resource who can work par...,650.0
3,Want to fix the WordPress Plugin,I am currently facing some issues with a custo...,5.0
4,PHP/HTML/CSS WordPress Developer Needed for We...,**Description:**We are seeking an experienced ...,500.0


In [11]:
# Build the two columns the model consumes and drop everything else:
#   text   - title and description joined by a newline
#   labels - the budget cast to float (the regression target)
# NOTE(review): the target is used unscaled; the training logs further down
# report MSE values around 1e7-1e8, so a scaled/log target may be worth trying.
df = df.assign(
    text=df["title"] + "\n" + df["description"],
    labels=df["budget"].astype(float),
)[["text", "labels"]]
df.head()

Unnamed: 0,text,labels
0,Experienced Media Buyer For Solar Pannel and R...,500.0
1,Full Stack Developer\nJob Title: Full Stack De...,1100.0
2,Data Engineer\nWe are looking for a resource w...,650.0
3,Want to fix the WordPress Plugin\nI am current...,5.0
4,PHP/HTML/CSS WordPress Developer Needed for We...,500.0


In [12]:
# Checkpoint identifier; the same constant is passed to the model loader in a later cell
# so tokenizer and model always come from the same checkpoint.
MODEL_NAME = "distilbert-base-uncased"
# Tokenizer used both for preparing the datasets and for the ad-hoc prediction at the end.
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(sample):
    """Tokenize the ``text`` field of ``sample`` with the module-level tokenizer.

    Args:
        sample: Mapping with a ``"text"`` entry. It is called through
            ``Dataset.map(..., batched=True)`` in this notebook, so the entry
            is whatever ``map`` passes for a batch.

    Returns:
        Whatever the tokenizer returns for that text, requested with
        truncation enabled and padding set to ``"max_length"``.
    """
    encoded = tokenizer(
        sample["text"],
        truncation=True,
        padding="max_length",
    )
    return encoded


# Fixed seed: without it the 80/20 partition is reshuffled on every run, so the
# evaluation numbers reported below could not be reproduced after a kernel restart.
SPLIT_SEED = 42

train_data, test_data = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Convert each pandas split to a `datasets.Dataset` and tokenize it in batches.
train_dataset = Dataset.from_pandas(train_data).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_data).map(tokenize_function, batched=True)

Map:   0%|          | 0/17498 [00:00<?, ? examples/s]

Map:   0%|          | 0/4375 [00:00<?, ? examples/s]

In [17]:
# Head configuration: one output value, treated as a regression target.
# (The training logs below show eval_loss tracking eval_mse.)
regression_head_config = {
    "num_labels": 1,
    "problem_type": "regression",
}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    **regression_head_config,
)


def compute_metrics(pred):
    """Calculate evaluation metrics for the regression head.

    Args:
        pred: Object exposing ``label_ids`` (true targets) and ``predictions``
            (model outputs, expected as an array with one value per sample,
            e.g. shape ``(n_samples, 1)``).

    Returns:
        dict: ``{"mse": <mean squared error between labels and predictions>}``.
    """
    labels = pred.label_ids
    # Flatten with reshape(-1) instead of a bare squeeze(): squeeze() turns a
    # one-row prediction array into a 0-d scalar, which no longer lines up with
    # the 1-d labels. reshape(-1) always yields one value per sample.
    preds = pred.predictions.reshape(-1)
    mse = mean_squared_error(labels, preds)
    return {"mse": mse}


# Hyperparameters live in their own object so they can be inspected or tweaked
# without touching the Trainer wiring below.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # evaluate once per epoch, as seen in the logs
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
)

# Wire model, data, tokenizer and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Run fine-tuning with the configuration held by `trainer`. The call is left as the
# cell's last expression so its return value is displayed as the cell output.
trainer.train()

  0%|          | 0/3282 [00:00<?, ?it/s]

{'loss': 151749918.72, 'grad_norm': 91086.4453125, 'learning_rate': 1.695307739183425e-05, 'epoch': 0.46}
{'loss': 136619843.584, 'grad_norm': 150472.65625, 'learning_rate': 1.3906154783668494e-05, 'epoch': 0.91}


  0%|          | 0/547 [00:00<?, ?it/s]

{'eval_loss': 36471908.0, 'eval_mse': 36471908.0, 'eval_runtime': 208.4805, 'eval_samples_per_second': 20.985, 'eval_steps_per_second': 2.624, 'epoch': 1.0}
{'loss': 16733231.104, 'grad_norm': 9068.92578125, 'learning_rate': 1.0859232175502743e-05, 'epoch': 1.37}
{'loss': 147028361.216, 'grad_norm': 196832.90625, 'learning_rate': 7.81230956733699e-06, 'epoch': 1.83}


  0%|          | 0/547 [00:00<?, ?it/s]

{'eval_loss': 36375020.0, 'eval_mse': 36375024.0, 'eval_runtime': 206.6966, 'eval_samples_per_second': 21.166, 'eval_steps_per_second': 2.646, 'epoch': 2.0}
{'loss': 144216571.904, 'grad_norm': 72221.828125, 'learning_rate': 4.765386959171238e-06, 'epoch': 2.29}
{'loss': 266547675.136, 'grad_norm': 98645.4921875, 'learning_rate': 1.7184643510054846e-06, 'epoch': 2.74}


  0%|          | 0/547 [00:00<?, ?it/s]

{'eval_loss': 36346284.0, 'eval_mse': 36346284.0, 'eval_runtime': 203.8332, 'eval_samples_per_second': 21.464, 'eval_steps_per_second': 2.684, 'epoch': 3.0}
{'train_runtime': 8398.1774, 'train_samples_per_second': 6.251, 'train_steps_per_second': 0.391, 'train_loss': 132834888.85313833, 'epoch': 3.0}


TrainOutput(global_step=3282, training_loss=132834888.85313833, metrics={'train_runtime': 8398.1774, 'train_samples_per_second': 6.251, 'train_steps_per_second': 0.391, 'total_flos': 6953619614902272.0, 'train_loss': 132834888.85313833, 'epoch': 3.0})

In [22]:
# Model weights and tokenizer files go to the same directory so the pair can be
# reloaded together from a single path.
OUTPUT_DIR = "./output"

model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

('./output/tokenizer_config.json',
 './output/special_tokens_map.json',
 './output/vocab.txt',
 './output/added_tokens.json')

In [35]:
# Example posting, assembled the same way as the training text: title, newline, description.
title = "Middle Python Developer"
description = "We are OX Company, a team that creates powerful web applications and software for any architectural solutions. Our products have already been highly appreciated by both private clients and businesses. We are currently looking for a Python Developer to help us develop and maintain our projects. If you love coding in Python and want to work on exciting tasks, we are looking for you!"
text = title + "\n" + description

# Send the inputs to whatever device the model currently lives on instead of
# hard-coding "mps", so the cell also works on CUDA or CPU-only machines.
device = model.device
inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True).to(device)

# eval() switches off dropout so the prediction is deterministic; no_grad()
# avoids building an autograd graph that inference never uses.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

In [38]:
# Pull the first value of the first row out of the logits as a NumPy scalar.
predicted_budget = outputs.logits.detach().cpu().numpy()[0][0]
print(f"Predicted budget: {predicted_budget}")

Predicted budget: 187.9017333984375
