# Project Part 3

[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/brearenee/NLP-Project/blob/main/Part3-StarTrek.ipynb)



**NLP Problem:** Predicting the speaker from Star Trek: The Next Generation script lines for 8 main characters.

In this second phase of my project, I'm developing a deep learning model for this NLP task.

As learned in Part 1 and Part 2, the initial dataset's structure is less than ideal. To start Part 3, we must once again parse and clean the raw JSON data and transform it into a structured DataFrame.

In [18]:
import pandas as pd
import json
import requests


import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch
from sklearn.preprocessing import LabelBinarizer


In [19]:
url = 'https://raw.githubusercontent.com/brearenee/NLP-Project/main/dataset/StarTrekDialogue_v2.json'
response = requests.get(url)

##This CodeBlock is thanks to ChatGPT :-) 
if response.status_code == 200:
    json_data = json.loads(response.text)
    lines = []
    characters = []
    episodes = []
  
    # extract the information from the JSON file for the "TNG" series
    for series_name, series_data in json_data.items():
        if series_name == "TNG": 
            for episode_name, episode_data in series_data.items():
                for character_name, character_lines in episode_data.items():
                    for line_text in character_lines:
                        lines.append(line_text)
                        characters.append(character_name)
                        episodes.append(episode_name)
                     
    # Create a DataFrame from the extracted data
    df = pd.DataFrame({
        'Line': lines,
        'Character': characters,
    })

    # Remove duplicate lines, keeping the first occurrence (preserving the original order)
    df = df.drop_duplicates(subset='Line', keep='first')

    # Reset the index of the DataFrame
    df.reset_index(drop=True, inplace=True)

else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")
    
    
##Remove Outliers (Characters with less than 1000 lines)
character_counts = df['Character'].value_counts()
characters_to_remove = character_counts[character_counts < 1000].index
df = df[~df['Character'].isin(characters_to_remove)]

##Print Value Count. 
print(df['Character'].value_counts())


Character
PICARD     10798
RIKER       6454
DATA        5699
LAFORGE     4111
WORF        3185
CRUSHER     2944
TROI        2856
WESLEY      1206
Name: count, dtype: int64


# BERT 
Bidirectional Encoder Representations from Transformers. 

Because this is a classification task,  BERT seems like a good choice for a pre-trained deep learning model. 



In [20]:

##Split the dataset
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text(df, tokenizer, max_length=128):
    input_ids = []
    attention_masks = []

    for _, row in df.iterrows():
        encoded_dict = tokenizer.encode_plus(
            row['Line'],
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,  # Explicitly set truncation to True
            padding='max_length',  # Use 'max_length' for padding
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_input_ids, train_attention_masks = tokenize_text(train_df, tokenizer)
val_input_ids, val_attention_masks = tokenize_text(val_df, tokenizer)

#one hot encode the labels
label_binarizer = LabelBinarizer()
train_labels_one_hot = label_binarizer.fit_transform(train_df['Character'])
val_labels_one_hot = label_binarizer.transform(val_df['Character'])

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [21]:
#Create PyTorch DataLoaders for your training and validation sets:

train_dataset = TensorDataset(train_input_ids, train_attention_masks, torch.tensor(train_labels_one_hot, dtype=torch.float32))
val_dataset = TensorDataset(val_input_ids, val_attention_masks, torch.tensor(val_labels_one_hot, dtype=torch.float32))

batch_size = 25
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


Fine Tune the Pre-Trained BERT model

In [22]:
#Fine-tune a pre-trained BERT model for sequence classification:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['Character'].unique()))
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
loss_fct = torch.nn.BCEWithLogitsLoss()

epochs = 3
accumulation_steps = 4
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fct(outputs.logits, labels)
        loss = loss / accumulation_steps  # Adjust the loss

        total_loss += loss.item()

        loss.backward()

        if (i + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()

    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {average_loss}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Average Training Loss: 0.09263644951387104
Epoch 2/3, Average Training Loss: 0.07607410329785151
Epoch 3/3, Average Training Loss: 0.07106457880798848


Evaluate

In [24]:
model.eval()
val_loss = 0
correct_predictions = 0

with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Calculate loss (if needed)
        # loss = criterion(logits, labels)
        # val_loss += loss.item()

        # Convert logits to binary predictions
        predicted_labels = (torch.sigmoid(logits) > 0.5).to(torch.float32)

        # Calculate accuracy for each sample
        correct_predictions += (predicted_labels == labels).all(dim=1).sum().item()

# Calculate average loss and accuracy
average_val_loss = val_loss / len(val_dataloader)
accuracy = correct_predictions / len(val_df)

print(f'Average Validation Loss: {average_val_loss}, Accuracy: {accuracy}')


Average Validation Loss: 0.0, Accuracy: 0.26828613608911556
