In [None]:
import pandas as pd
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Load the synthetic gig-worker records. The file is expected to hold a JSON
# array whose items each carry an 'input' and an 'output' object (both are
# read when the training frame is built further down).
# JSON is UTF-8 by specification, so the encoding is pinned explicitly rather
# than relying on the platform's default locale encoding.
with open("gig_workers_synthdata.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

In [None]:
# train_data

In [None]:
# Keys read from each record's 'input' dict, in the order they appear in the
# flattened text fed to the tokenizer.
PROFILE_FIELDS = (
    'Worker ID',
    'Task History',
    'Earnings',
    'Rating',
    'Current Skills',
    'Experience Level',
    'Job Count',
    'Time Spent on Gigs',
)

# One row per record: the profile rendered as "Field: value, Field: value, ..."
# and the recommended-skills string used as the training target.
train_records = [
    {
        'input': ", ".join(
            f"{field}: {record['input'][field]}" for field in PROFILE_FIELDS
        ),
        'output': record['output']['Recommended Skills'],
    }
    for record in train_data
]

train_df = pd.DataFrame(train_records)

# Gather every distinct skill named in the targets (skills are assumed to be
# separated by ", ") and sort them, so the resulting list gives each skill a
# fixed position.
unique_skills = sorted(
    {
        skill
        for skills_text in train_df['output']
        for skill in skills_text.split(", ")
    }
)


In [None]:
class SkillRecommendationDataset(Dataset):
    """Dataset pairing worker-profile text with a multi-hot skill target.

    Every row of ``data`` must provide an ``'input'`` string (the text that is
    tokenized) and an ``'output'`` string listing recommended skills separated
    by ``", "``. Items are returned as a dict holding the tokenizer's tensors
    (batch dimension removed) plus a ``'labels'`` vector with one slot per
    entry of ``skill_set``.

    Args:
        data: pandas DataFrame with ``'input'`` and ``'output'`` columns
            (rows are read positionally via ``.iloc``).
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors='pt')`` that returns
            a mapping of tensors with a leading batch dimension of 1.
        skill_set: Ordered sequence of known skills; position ``i`` defines
            label slot ``i``.
        max_length: Length every input is padded/truncated to. Defaults to 512.
    """

    def __init__(self, data, tokenizer, skill_set, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.skill_set = skill_set  # List of unique skills
        self.max_length = max_length

        # Pre-compute skill -> slot so label construction is a dict lookup
        # instead of a linear list.index() scan per skill. setdefault keeps the
        # first position if a skill happens to be listed twice, which is what
        # list.index() would return.
        self._skill_to_index = {}
        for position, skill in enumerate(skill_set):
            self._skill_to_index.setdefault(skill, position)

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized input and multi-hot label vector for row ``idx``.

        Returns:
            dict: the tokenizer's outputs with the batch dimension squeezed
            away, plus ``'labels'``, a float tensor of shape
            ``(len(skill_set),)`` holding 1.0 for each recommended skill.

        Raises:
            ValueError: if the row's input or output is not a string.
        """
        row = self.data.iloc[idx]
        input_text = row['input']
        output_text = row['output']

        # Ensure that input and output are strings
        if not isinstance(input_text, str) or not isinstance(output_text, str):
            raise ValueError(f"Input or output at index {idx} is not a string.")

        # Tokenize the input to a fixed length so samples can be stacked.
        encoded = self.tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Multi-hot target. It must be floating point: with float labels the
        # Hugging Face sequence-classification head treats the task as
        # multi-label (BCEWithLogitsLoss). Integer labels would be interpreted
        # as single-label class ids and fail on the shape mismatch.
        labels = torch.zeros(len(self.skill_set), dtype=torch.float32)
        for skill in output_text.split(", "):
            slot = self._skill_to_index.get(skill)
            if slot is not None:  # skills outside the vocabulary are ignored
                labels[slot] = 1.0

        # Remove the batch dimension added by return_tensors='pt'.
        item = {key: tensor.squeeze(0) for key, tensor in encoded.items()}

        # Add labels to the input dictionary
        item['labels'] = labels

        return item

In [None]:
# Tokenizer matching the pretrained DistilBERT checkpoint that is fine-tuned below.
TOKENIZER_CHECKPOINT = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [None]:
# Wrap the training frame so each row is served as tokenized tensors plus a
# label vector over unique_skills.
train_dataset = SkillRecommendationDataset(
    data=train_df,
    tokenizer=tokenizer,
    skill_set=unique_skills,
)

# Shuffled mini-batch iterator over the same dataset. Note that the Trainer
# further down is handed train_dataset directly, not this loader.
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)

In [None]:
# One output logit per known skill; logit i corresponds to unique_skills[i].
num_labels = len(unique_skills)

# Record the index <-> skill mapping in the model config so a saved checkpoint
# remains interpretable without access to this notebook's variables.
id2label = dict(enumerate(unique_skills))
label2id = {skill: index for index, skill in id2label.items()}

# A worker can be recommended several skills at once, so the task is declared
# as multi-label explicitly instead of being inferred from the label dtype.
# In this mode the head uses BCEWithLogitsLoss, which expects float multi-hot
# label vectors of shape (batch, num_labels).
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)

# Training hyperparameters, collected in one mapping so they are easy to scan
# and tweak before being handed to TrainingArguments.
training_config = {
    "output_dir": "./distilbert-skill-recommendation",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "save_steps": 10,
    "save_total_limit": 2,
    "logging_dir": "./logs",
    "logging_steps": 5,
}
training_args = TrainingArguments(**training_config)

In [None]:
# Bundle the model, the hyperparameters and the training set into a Trainer.
# Only a training dataset is supplied; no evaluation set is passed here.
trainer = Trainer(
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

In [None]:
# Run fine-tuning with the Trainer configured above. The call is left as the
# cell's last expression, so its return value is shown as the cell output.
trainer.train()

In [None]:
# Write the fine-tuned model and the tokenizer into the same directory.
save_dir = "./distilbert-skill-recommendation"
model.save_pretrained(save_dir)

# Also persist the ordered skill vocabulary: output logit i corresponds to
# unique_skills[i], and without this list the predictions of a reloaded model
# cannot be mapped back to skill names. Written after model.save_pretrained()
# so the directory already exists.
with open(f"{save_dir}/skill_labels.json", "w", encoding="utf-8") as f:
    json.dump(unique_skills, f, ensure_ascii=False, indent=2)

tokenizer.save_pretrained(save_dir)