In [3]:
import pandas as pd
from langdetect import detect, LangDetectException
# Read the raw track table from the CSV on disk.
spotify_data = pd.read_csv('./data/dataset.csv')
# Keep only the rows whose popularity is strictly positive; the raw frame is left untouched.
has_positive_popularity = spotify_data['popularity'] > 0
spotify_data_clean = spotify_data[has_positive_popularity]

# def is_english(text):
#     try:
#         return detect(text) == 'en'
#     except LangDetectException:
#         return False

# # Apply the function to filter only English names
# spotify_data_clean = spotify_data_clean[spotify_data_clean['track_name'].apply(is_english)]


In [5]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertModel, AdamW
from sklearn.model_selection import train_test_split
from transformers.modeling_outputs import SequenceClassifierOutput

class PopularityDataset(Dataset):
    """Map-style dataset pairing each text with a numeric target.

    Tokenization happens lazily, one item at a time, inside ``__getitem__``.
    Every item is a dict with flattened ``input_ids`` and ``attention_mask``
    tensors plus a float ``labels`` tensor.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        """Store the data and tokenization settings without copying them.

        Args:
            texts: Indexable collection of input texts.
            targets: Indexable collection of numeric targets, aligned with ``texts``.
            tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
                padding=..., truncation=..., return_tensors=...)`` whose result
                exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
            max_len: Length passed to the tokenizer as ``max_length``.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts
        self.targets = targets

    def __len__(self):
        """Return the number of items, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it together with its target.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` (both flattened
            to one dimension) and ``'labels'`` (float tensor built from the target).
        """
        sample_text, sample_target = self.texts[idx], self.targets[idx]
        encoded = self.tokenizer(
            sample_text,
            return_tensors="pt",
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
        )
        # The tokenizer output is flattened so that batching adds the batch axis itself.
        token_ids = torch.flatten(encoded['input_ids'])
        mask = torch.flatten(encoded['attention_mask'])
        label = torch.tensor(sample_target, dtype=torch.float)
        return {'input_ids': token_ids, 'attention_mask': mask, 'labels': label}

# Custom model for regression
class DistilBertForRegression(torch.nn.Module):
    """DistilBERT encoder followed by a single linear unit for scalar regression."""

    def __init__(self, model_name='distilbert-base-uncased'):
        """Load the pretrained encoder and attach the regression head.

        Args:
            model_name: Checkpoint identifier passed to
                ``DistilBertModel.from_pretrained``. Defaults to the checkpoint
                this class previously hard-coded, so ``DistilBertForRegression()``
                behaves as before.
        """
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(model_name)
        # The head's input width is read from the loaded config so it always
        # matches whichever checkpoint was requested.
        self.regressor = torch.nn.Linear(self.distilbert.config.dim, 1)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and regress from the first token's hidden state.

        Args:
            input_ids: Forwarded unchanged to the encoder as ``input_ids``.
            attention_mask: Forwarded unchanged to the encoder as ``attention_mask``.

        Returns:
            Output of the linear head; its last dimension has size 1.
        """
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0]  # Use the [CLS] token
        return self.regressor(hidden_state)

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Seed torch so that the regression head's initial weights and the shuffled
# batch order are reproducible; 42 matches the random_state of the split below.
torch.manual_seed(42)

# Create datasets
# A missing track name is read by pandas as float NaN, and the tokenizer only
# accepts strings, so incomplete rows are dropped before the lists are built.
labelled_tracks = spotify_data_clean.dropna(subset=['track_name', 'popularity'])
texts = labelled_tracks['track_name'].astype(str).tolist()
targets = labelled_tracks['popularity'].tolist()
train_texts, val_texts, train_targets, val_targets = train_test_split(texts, targets, test_size=0.1, random_state=42)

train_dataset = PopularityDataset(train_texts, train_targets, tokenizer, max_len=128)
val_dataset = PopularityDataset(val_texts, val_targets, tokenizer, max_len=128)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Load model
model = DistilBertForRegression()

# Optimizer
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values the transformers version
# defaulted to, because torch's own defaults differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3  # Define the number of epochs
for epoch in range(num_epochs):
    model.train()
    # The loss is accumulated on-device and converted to a Python float once
    # per epoch, so no extra host sync happens on every step.
    loss_sum = torch.zeros((), device=device)
    example_count = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask)
        # squeeze(-1) removes only the trailing size-1 axis. A bare squeeze()
        # would also drop the batch axis when the final batch holds one row,
        # leaving a 0-d prediction that mse_loss broadcasts with a warning.
        loss = torch.nn.functional.mse_loss(outputs.squeeze(-1), labels)
        loss.backward()
        optimizer.step()
        rows_in_batch = labels.size(0)
        # Weight by batch size so a short final batch does not skew the mean.
        loss_sum += loss.detach() * rows_in_batch
        example_count += rows_in_batch
    # Report the mean loss over the whole epoch rather than the last batch's
    # loss; an empty loader yields NaN instead of a NameError.
    epoch_loss = loss_sum.item() / example_count if example_count else float("nan")
    print(f"Epoch {epoch} Loss {epoch_loss}")

# TODO: add the validation loop here; val_loader is built above but not used yet.
# ...


  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 