# Project Part 3

[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/brearenee/NLP-Project/blob/main/Part3-StarTrek.ipynb)



**NLP Problem:** Predicting the speaker from Star Trek: The Next Generation script lines for 8 main characters.

In this third phase of my project, I'm developing a deep learning model for this NLP task.

As learned in Part 1 and Part 2, the initial dataset's structure is less than ideal. To start Part 3, we must once again parse and clean the raw JSON data and transform it into a structured DataFrame.

In [7]:
#!pip install transformers
#!pip install transformers pandas torch
import pandas as pd
import json
import requests

from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertForSequenceClassification
from transformers import AdamW
from sklearn.metrics import accuracy_score


# Choose the compute device: CUDA when PyTorch reports it as available, else the CPU.
has_gpu = torch.cuda.is_available()
device = torch.device("cuda") if has_gpu else torch.device("cpu")
print("Using device:", device)

if has_gpu:
    # Report the name and current memory footprint of GPU 0, converted from bytes to GB.
    bytes_per_gb = 1024 ** 3
    allocated_gb = round(torch.cuda.memory_allocated(0) / bytes_per_gb, 1)
    reserved_gb = round(torch.cuda.memory_reserved(0) / bytes_per_gb, 1)
    print(torch.cuda.get_device_name(0))
    print("Memory Usage:")
    print("Allocated:", allocated_gb, "GB")
    print("Cached:   ", reserved_gb, "GB")




Using device: cuda
Tesla T4
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [8]:
url = 'https://raw.githubusercontent.com/brearenee/NLP-Project/main/dataset/StarTrekDialogue_v2.json'
# A timeout keeps the notebook from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)

## This code block is thanks to ChatGPT :-)
if response.status_code != 200:
    # Fail loudly here: everything below needs `df`, so merely printing a
    # message would only defer the failure to a confusing NameError.
    raise RuntimeError(f"Failed to retrieve data. Status code: {response.status_code}")

json_data = json.loads(response.text)

# Flatten the nested JSON (series -> episode -> character -> list of lines)
# into parallel lists, keeping only the "TNG" series.
lines = []
characters = []
episodes = []  # collected alongside the lines but not included in the DataFrame
for episode_name, episode_data in json_data.get("TNG", {}).items():
    for character_name, character_lines in episode_data.items():
        for line_text in character_lines:
            lines.append(line_text)
            characters.append(character_name)
            episodes.append(episode_name)

# Create a DataFrame from the extracted data
df = pd.DataFrame({
    'Line': lines,
    'Character': characters,
})

# Remove duplicate lines, keeping the first occurrence (preserving the original order)
df = df.drop_duplicates(subset='Line', keep='first')

# Reset the index of the DataFrame
df.reset_index(drop=True, inplace=True)

## Remove outliers (characters with fewer than 1000 lines)
character_counts = df['Character'].value_counts()
characters_to_remove = character_counts[character_counts < 1000].index
df = df[~df['Character'].isin(characters_to_remove)]

## Print the number of lines remaining per character.
print(df['Character'].value_counts())


Character
PICARD     10798
RIKER       6454
DATA        5699
LAFORGE     4111
WORF        3185
CRUSHER     2944
TROI        2856
WESLEY      1206
Name: count, dtype: int64


# BERT 
Bidirectional Encoder Representations from Transformers. 

Because this is a text classification task, BERT, which can be fine-tuned with a classification head, seems like a good choice for a pre-trained deep learning model.



In [9]:

## Split the dataset (80% train / 20% validation, fixed seed for reproducibility)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)


# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input sequences. Each call pads and truncates its own split and
# returns PyTorch tensors.
train_tokenized = tokenizer(list(train_df['Line']), padding=True, truncation=True, return_tensors='pt')
val_tokenized = tokenizer(list(val_df['Line']), padding=True, truncation=True, return_tensors='pt')

# Build ONE character -> integer id mapping from the full dataset and apply it
# to both splits. Encoding each split separately with `.cat.codes` only agrees
# when both splits happen to contain exactly the same set of characters;
# otherwise the same id would mean different speakers in train and validation.
# Sorting keeps the ids identical to what `.cat.codes` yields when every
# character is present in a split.
label_names = sorted(df['Character'].unique())
label_to_id = {name: idx for idx, name in enumerate(label_names)}

# Convert labels to tensor with the correct data type (Long)
train_labels = torch.tensor(train_df['Character'].map(label_to_id).to_numpy(), dtype=torch.long)
val_labels = torch.tensor(val_df['Character'].map(label_to_id).to_numpy(), dtype=torch.long)

# Create TensorDatasets of (input_ids, attention_mask, label) triples
train_dataset = TensorDataset(
    train_tokenized['input_ids'],
    train_tokenized['attention_mask'],
    train_labels
)

val_dataset = TensorDataset(
    val_tokenized['input_ids'],
    val_tokenized['attention_mask'],
    val_labels
)

# Set up training parameters
epochs = 3
batch_size = 25

# Create DataLoaders: training batches are reshuffled, validation order is fixed.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print("Tensor Datasets and DataLoader created")



Tensor Datasets and DataLoader created


In [10]:
# Load pre-trained BERT model for sequence classification.
# The size of the classification head is derived from the data rather than
# hard-coded, so it stays correct if the character filter above changes.
num_labels = df['Character'].nunique()
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Optimizer and loss function used for fine-tuning
learning_rate = 2e-5
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
print("optimizer and loss function")

optimizer and loss function


In [12]:
# Training Loop
# Gradients from `accumulation_steps` consecutive batches are summed before
# each optimizer update, giving an effective batch size of
# batch_size * accumulation_steps.
accumulation_steps = 4

# Batches are moved to whatever device the model currently lives on.
# NOTE(review): `device` is computed at the top of the notebook, but the model
# is never moved to it, so training runs on the CPU unless the model is moved.
train_device = next(model.parameters()).device

for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_loss = 0.0
    print(epoch + 1)  # progress marker: the epoch that is starting

    num_batches = len(train_dataloader)
    optimizer.zero_grad()

    # Iterate the DataLoader (not raw slices) so batches are reshuffled each epoch.
    for step, (input_ids, attention_mask, batch_labels) in enumerate(train_dataloader, start=1):
        input_ids = input_ids.to(train_device)
        attention_mask = attention_mask.to(train_device)
        batch_labels = batch_labels.to(train_device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, batch_labels)

        # Scale so the accumulated gradient is the average over the group of
        # batches rather than their sum.
        (loss / accumulation_steps).backward()

        # `criterion` returns the batch mean; weight by batch size so the
        # epoch average is a true per-sample mean even with a short last batch.
        total_loss += loss.item() * batch_labels.size(0)

        # Exactly one optimizer update per accumulation group, plus a final
        # update for a trailing partial group.
        if step % accumulation_steps == 0 or step == num_batches:
            optimizer.step()
            optimizer.zero_grad()

    avg_loss = total_loss / len(train_dataset)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_loss}")

1


KeyboardInterrupt: 

In [None]:
# After training, evaluate on the validation set: average loss and accuracy.
model.eval()  # Set the model to evaluation mode

# Feed batches on whatever device the model currently lives on.
eval_device = next(model.parameters()).device

val_loss = 0.0
val_predictions = []
val_targets = []

with torch.no_grad():  # no gradients are needed for evaluation
    for input_ids, attention_mask, batch_labels in val_dataloader:
        input_ids = input_ids.to(eval_device)
        attention_mask = attention_mask.to(eval_device)
        batch_labels = batch_labels.to(eval_device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, batch_labels)

        # `criterion` returns the batch mean; weight by batch size so the
        # final figure is a true per-sample mean even with a short last batch.
        val_loss += loss.item() * batch_labels.size(0)

        # The predicted class is the index of the largest logit.
        val_predictions.extend(outputs.logits.argmax(dim=1).cpu().tolist())
        val_targets.extend(batch_labels.cpu().tolist())

avg_val_loss = val_loss / len(val_dataset)
print(f"Validation Loss: {avg_val_loss}")

val_accuracy = accuracy_score(val_targets, val_predictions)
print(f"Validation Accuracy: {val_accuracy}")