In [1]:
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
#from torchsummary import summary
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import json

2023-06-15 21:27:11.048778: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-15 21:27:15.578720: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.bert_model = transformers.BertModel.from_pretrained("bert-base-uncased")
        self.out = nn.Linear(768, 1)
        
    def forward(self,ids,mask,token_type_ids):
        _,o2= self.bert_model(ids,attention_mask=mask,token_type_ids=token_type_ids, return_dict=False)
        
        out= self.out(o2)
        
        return out

In [2]:
class FriendsDataset(Dataset):
    def __init__(self, data_file, tokenizer):
        self.data_file = data_file
        self.tokenizer = tokenizer
        self.encoder = LabelEncoder()
        self.utterances, self.labels = self.load_data()

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, idx):
        return self.utterances[idx], self.labels[idx]

    def load_data(self):
        with open(self.data_file, 'r') as file:
            data = json.load(file)

        utterances = []
        labels = []
        for utterance in data:
            if utterance['speakers'] != []:
                utterances.append(utterance['transcript'])

                if utterance['speakers'][0] not in  ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']:
                    labels.append("Other")
                else:
                    labels.append(utterance['speakers'][0])

        self.encoder.fit(labels)
        labels = self.encoder.transform(labels)

        return utterances, labels

    def preprocess_data(self, max_sequence_length):
        tokenized_utterances = [self.tokenizer.encode(utterance)[:max_sequence_length] for utterance in self.utterances]
        padded_utterances = []
        for tokenized_utterance in tokenized_utterances:
            padded_utterance = list(tokenized_utterance)
            if len(padded_utterance) < max_sequence_length:
                padded_utterance.extend([0] * (max_sequence_length - len(padded_utterance)))
            padded_utterances.append(padded_utterance)

        labels = torch.tensor(self.labels)

        return padded_utterances, labels

In [4]:
'''class Classifier:
    def __init__(self, num_classes, embedding_dim):
        super(Classifier, self).__init__()
        self.num_classes = num_classes
        self.embedding_dim = embedding_dim
        self.sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')
        self.fc = nn.Sequential(
            nn.Linear(self.embedding_dim, 256),
            nn.ReLU(),
            nn.Linear(256, self.num_classes),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        embeddings = self.sentence_transformer.encode(x)
        return self.fc(embeddings)'''

class Classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(Classifier, self).__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)#eu sou amiga :)
        self.relu = nn.ReLU()
        self.linear_layer = nn.Linear(embedding_dim, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        embedded = self.embedding_layer(x)
        embedded = torch.mean(embedded, dim=1)
        hidden = self.relu(self.linear_layer(embedded))
        output = self.output_layer(hidden)
        return output

In [5]:

# Load the dataset from the JSON file
with open('sets/test_set1.json') as file:
    dataset = json.load(file)

# Calculate the vocabulary size
vocab = set()
for utterance in dataset:
    if utterance['speakers'] != []:
        words = utterance['transcript'].split()
        vocab.update(words)

size = len(vocab)

# Load the dataset from the JSON file
with open('sets/train_set1.json') as file:
    dataset = json.load(file)

# Calculate the vocabulary size
vocab = set()
for utterance in dataset:
    if utterance['speakers'] != []:
        words = utterance['transcript'].split()
        vocab.update(words)

size2 = len(vocab)

vocab_size = size + size2

print(vocab_size)

48054


In [6]:
# Set hyperparameters
num_classes = 7
embedding_dim = 64
max_sequence_length = 1000000
num_epochs = 10
batch_size = 32

hidden_dim = 256

# Instantiate data loader and load/preprocess data
tokenizer = SentenceTransformer('all-MiniLM-L6-v2')

test = FriendsDataset('sets/test_set1.json', tokenizer)
X_test, y_test = test.preprocess_data(max_sequence_length)

train = FriendsDataset('sets/train_set1.json', tokenizer)
X_train, y_train = train.preprocess_data(max_sequence_length)

# Convert data to PyTorch tensors
X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)
X_test = torch.tensor(X_test)
y_test = torch.tensor(y_test)

# Create data loaders
train_loader = DataLoader(list(zip(X_train, y_train)), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(list(zip(X_test, y_test)), batch_size=batch_size)

# Instantiate the speaker classifier model
model = Classifier(vocab_size, embedding_dim, hidden_dim, num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.00005)

# Training loop
for epoch in range(num_epochs):

    model.train()
    
    for utterances, labels in train_loader:
        optimizer.zero_grad()

        utterances = utterances.long()
        outputs = model(utterances)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for utterances, labels in test_loader:
            utterances = utterances.long()
            outputs = model(utterances)
            
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Epoch [{epoch +1 }], Test Accuracy: {accuracy:.2f}%")