In [1]:
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
#from torchsummary import summary
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import json



In [None]:
class BERT(nn.Module):
    """Pretrained ``bert-base-uncased`` encoder followed by a 768 -> 1 linear layer."""

    def __init__(self):
        super().__init__()
        # Encoder weights are fetched by name through the transformers library.
        self.bert_model = transformers.BertModel.from_pretrained("bert-base-uncased")
        # Projection from the 768-wide encoder output to a single value.
        self.out = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder and project its second output through ``self.out``.

        Args:
            ids: Passed positionally to the wrapped BERT model.
            mask: Forwarded as ``attention_mask``.
            token_type_ids: Forwarded as ``token_type_ids``.

        Returns:
            The result of applying ``self.out`` to the second element of the
            tuple returned by the encoder.
        """
        # return_dict=False makes the encoder return a plain tuple; only its
        # second element is used (presumably the pooled output -- confirm
        # against the installed transformers version).
        _first_output, second_output = self.bert_model(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.out(second_output)

In [19]:
class FriendsDataset(Dataset):
    """Utterance / speaker-label dataset read from a JSON file.

    The file is expected to hold a JSON array of objects, each with a
    ``'transcript'`` entry (the utterance) and a ``'speakers'`` list whose
    first element is the speaker name.  Speakers outside ``MAIN_CHARACTERS``
    (or utterances with an empty speaker list) are labelled ``"Other"``.

    Labels are integer-encoded with a ``LabelEncoder`` that is always fit on
    the same fixed set of seven class names, so every instance (e.g. a train
    split and a test split) shares one name -> id mapping even when a split
    does not contain every speaker.

    Args:
        data_file: Path of the JSON file to load.
        tokenizer: Object exposing ``encode(utterance)`` that returns a
            sliceable sequence; only used by ``preprocess_data``.
    """

    # Speakers that get their own class; everyone else is folded into OTHER_LABEL.
    MAIN_CHARACTERS = (
        'Monica Geller',
        'Joey Tribbiani',
        'Chandler Bing',
        'Phoebe Buffay',
        'Ross Geller',
        'Rachel Green',
    )
    OTHER_LABEL = "Other"

    def __init__(self, data_file, tokenizer):
        self.data_file = data_file
        self.tokenizer = tokenizer
        self.encoder = LabelEncoder()
        self.utterances, self.labels = self.load_data()

    def __len__(self):
        """Return the number of loaded utterances."""
        return len(self.utterances)

    def __getitem__(self, idx):
        """Return the ``(utterance, encoded_label)`` pair at position ``idx``."""
        return self.utterances[idx], self.labels[idx]

    def load_data(self):
        """Read ``self.data_file`` and build the utterance and label lists.

        Returns:
            A ``(utterances, labels)`` tuple: the list of ``'transcript'``
            values in file order, and the result of
            ``self.encoder.transform`` applied to the speaker labels.
        """
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale encoding (which breaks on non-ASCII text on some systems).
        with open(self.data_file, 'r', encoding='utf-8') as file:
            data = json.load(file)

        utterances = []
        labels = []
        for utterance in data:
            utterances.append(utterance['transcript'])

            # An empty speaker list cannot be attributed to a main character.
            speakers = utterance['speakers']
            speaker = speakers[0] if speakers else None
            if speaker in self.MAIN_CHARACTERS:
                labels.append(speaker)
            else:
                labels.append(self.OTHER_LABEL)

        # Fit on the full, fixed class list rather than on the labels that
        # happen to occur in this file, so ids are stable across splits.
        self.encoder.fit(list(self.MAIN_CHARACTERS) + [self.OTHER_LABEL])
        labels = self.encoder.transform(labels)

        return utterances, labels

    def preprocess_data(self, max_sequence_length):
        """Tokenize every utterance and force it to a fixed length.

        Each ``self.tokenizer.encode(utterance)`` result is truncated to
        ``max_sequence_length`` items and right-padded with ``0`` up to that
        length.

        Args:
            max_sequence_length: Target length of every returned sequence.

        Returns:
            A ``(padded_utterances, labels)`` tuple: a list of equal-length
            lists, and ``self.labels`` copied into a ``torch`` tensor.
        """
        padded_utterances = []
        for utterance in self.utterances:
            sequence = list(self.tokenizer.encode(utterance)[:max_sequence_length])
            # A non-positive repeat count yields an empty list, so sequences
            # already at full length are left unchanged.
            sequence.extend([0] * (max_sequence_length - len(sequence)))
            padded_utterances.append(sequence)

        labels = torch.tensor(self.labels)

        return padded_utterances, labels

In [5]:
class Classifier(nn.Module):
    """Sentence-embedding encoder followed by a small trainable MLP head.

    Text is embedded with the ``'all-MiniLM-L6-v2'`` SentenceTransformer and
    the embedding is passed through ``Linear -> ReLU -> Linear``.

    The class derives from ``nn.Module`` so that ``parameters()``,
    ``train()``, ``eval()`` and calling the instance all work.

    The head returns raw logits (no final softmax): ``nn.CrossEntropyLoss``
    applies log-softmax itself, so a softmax here would be applied twice.
    Use ``torch.softmax(logits, dim=-1)`` when probabilities are needed;
    ``argmax`` predictions are the same either way.

    Args:
        num_classes: Number of output classes.
        embedding_dim: Input width of the head; must equal the encoder's
            embedding size.

    Raises:
        ValueError: If the encoder reports an embedding size different from
            ``embedding_dim``.
    """

    def __init__(self, num_classes, embedding_dim):
        super().__init__()
        self.num_classes = num_classes
        self.embedding_dim = embedding_dim
        self.sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')

        # Fail fast with a clear message instead of a matmul shape error on
        # the first forward pass.
        encoder_dim = self.sentence_transformer.get_sentence_embedding_dimension()
        if encoder_dim is not None and encoder_dim != embedding_dim:
            raise ValueError(
                f"embedding_dim={embedding_dim} does not match the sentence "
                f"encoder's output size ({encoder_dim})"
            )

        self.fc = nn.Sequential(
            nn.Linear(self.embedding_dim, 256),
            nn.ReLU(),
            nn.Linear(256, self.num_classes),
        )

    def forward(self, x):
        """Embed ``x`` with the sentence encoder and return class logits.

        Args:
            x: Input accepted by ``SentenceTransformer.encode`` (e.g. a list
                of strings).

        Returns:
            Tensor of logits produced by ``self.fc``.
        """
        # encode() is called with its defaults, which per the
        # sentence-transformers docs return a NumPy array; consequently no
        # gradient flows back into the encoder and only ``fc`` is trained.
        embeddings = self.sentence_transformer.encode(x)

        # nn.Linear needs a tensor with the same dtype/device as its weights.
        head_param = next(self.fc.parameters())
        embeddings = torch.as_tensor(
            embeddings, dtype=head_param.dtype, device=head_param.device
        )
        return self.fc(embeddings)

In [20]:
# Set hyperparameters
seed = 42
num_classes = 7
# 'all-MiniLM-L6-v2' produces 384-dimensional sentence embeddings, so the
# classifier head must take 384 inputs (768 leads to a shape mismatch).
embedding_dim = 384
# Only relevant to FriendsDataset.preprocess_data; the training loop below
# feeds raw text and does not use it.
max_sequence_length = 10000
num_epochs = 10
batch_size = 32

# Make weight initialisation and batch shuffling reproducible.
torch.manual_seed(seed)

# Instantiate the datasets.  FriendsDataset requires a tokenizer argument but
# only uses it inside preprocess_data, which is not called here.
tokenizer = SentenceTransformer('all-MiniLM-L6-v2')

test = FriendsDataset('sets/test_set1.json', tokenizer)
train = FriendsDataset('sets/train_set1.json', tokenizer)

# Each dataset owns its own label encoder; a class id must mean the same
# speaker in both splits or the reported accuracy is meaningless.
assert list(train.encoder.classes_) == list(test.encoder.classes_), \
    "train and test label encoders disagree on the class list"
assert len(train.encoder.classes_) <= num_classes, \
    "more label classes than classifier outputs"


def collate_utterances(batch):
    """Turn a list of (utterance, label) pairs into (list of utterances, LongTensor)."""
    texts, targets = zip(*batch)
    return list(texts), torch.tensor([int(target) for target in targets], dtype=torch.long)


# Create data loaders.  Classifier.forward runs the sentence encoder on its
# input itself, so batches carry the raw utterances rather than pre-encoded,
# zero-padded vectors.
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, collate_fn=collate_utterances)
test_loader = DataLoader(test, batch_size=batch_size, collate_fn=collate_utterances)

# Instantiate the speaker classifier model
speaker_classifier = Classifier(num_classes, embedding_dim)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(speaker_classifier.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    speaker_classifier.train()
    for utterances, labels in train_loader:
        optimizer.zero_grad()
        outputs = speaker_classifier(utterances)
        # Keep targets on the same device as the model output.
        loss = criterion(outputs, labels.to(outputs.device))
        loss.backward()
        optimizer.step()

    # Evaluate on the held-out split after every epoch.
    speaker_classifier.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for utterances, labels in test_loader:
            outputs = speaker_classifier(utterances)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels.to(predicted.device)).sum().item()

    # Guard against an empty test set instead of dividing by zero.
    accuracy = 100 * correct / total if total else 0.0
    print(f"Epoch [{epoch + 1}/{num_epochs}], Test Accuracy: {accuracy:.2f}%")

KeyboardInterrupt: 