# Project Part 3

[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/brearenee/NLP-Project/blob/main/Part3-StarTrek.ipynb)



**NLP Problem:** Predicting the speaker from Star Trek: The Next Generation script lines for 8 main characters.

In this third phase of my project, I'm developing a deep learning model for this NLP task.

As learned in Part 1 and Part 2, the initial dataset's structure is less than ideal. To start Part 3, we must once again parse and clean the raw JSON data and transform it into a structured DataFrame.

In [10]:
#!pip install transformers
#!pip install transformers pandas torch
!pip install cuml
import pandas as pd
import json
import requests

#from sklearn.model_selection import train_test_split
#from cuml.preprocessing.model_selection import train_test_split
#from cuml.metrics import accuracy_score
from transformers import BertTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertForSequenceClassification
from transformers import AdamW
#from sklearn.metrics import accuracy_score
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split



Collecting cuml
  Using cached cuml-0.6.1.post1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: cuml
  Building wheel for cuml (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[41 lines of output][0m
  [31m   [0m !!
  [31m   [0m 
  [31m   [0m         ********************************************************************************
  [31m   [0m         Please avoid running ``setup.py`` directly.
  [31m   [0m         Instead, use pypa/build, pypa/installer or other
  [31m   [0m         standards-based tools.
  [31m   [0m 
  [31m   [0m         See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
  [31m   [0m         ********************************************************************************
  [31m   [0m 
  

ModuleNotFoundError: No module named 'cuml'

In [None]:
# Download the raw dialogue JSON and flatten the "TNG" series into a DataFrame
# with one row per spoken line ('Line') and its speaker ('Character').
url = 'https://raw.githubusercontent.com/brearenee/NLP-Project/main/dataset/StarTrekDialogue_v2.json'
response = requests.get(url, timeout=30)  # timeout so a stalled connection cannot hang the kernel

# Fail loudly here: continuing without data would either raise a confusing
# NameError further down or silently reuse a stale `df` from an earlier run.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve data. Status code: {response.status_code}")

json_data = json.loads(response.text)

# Parsing logic originally drafted with help from ChatGPT :-)
# JSON layout: series -> episode -> character -> list of lines.
# Only the "TNG" series is kept; the episode name is not needed downstream.
records = [
    (line_text, character_name)
    for episode_data in json_data.get("TNG", {}).values()
    for character_name, character_lines in episode_data.items()
    for line_text in character_lines
]
df = pd.DataFrame(records, columns=['Line', 'Character'])

# Remove duplicate lines, keeping the first occurrence (preserving the original order)
df = df.drop_duplicates(subset='Line', keep='first').reset_index(drop=True)

# Remove outliers: characters with fewer than this many (de-duplicated) lines
MIN_LINES_PER_CHARACTER = 1000
character_counts = df['Character'].value_counts()
characters_to_remove = character_counts[character_counts < MIN_LINES_PER_CHARACTER].index
# Reset the index again so it stays contiguous after the filter
df = df[~df['Character'].isin(characters_to_remove)].reset_index(drop=True)

# Print the number of remaining lines per character
print(df['Character'].value_counts())


# BERT 
Bidirectional Encoder Representations from Transformers. 

Because this is a classification task, BERT seems like a good choice of pre-trained deep learning model.



In [None]:

# Split the dataset into training and validation rows.
# A seeded shuffle keeps the split reproducible across kernel restarts.
VAL_FRACTION = 0.2
SPLIT_SEED = 42

shuffled_df = df.sample(frac=1.0, random_state=SPLIT_SEED).reset_index(drop=True)
val_count = int(len(shuffled_df) * VAL_FRACTION)
val_df = shuffled_df.iloc[:val_count]
train_df = shuffled_df.iloc[val_count:]
assert len(train_df) + len(val_df) == len(df)

# Build ONE label mapping from the full dataset so a character gets the same
# integer id in both splits (encoding each split separately can disagree if a
# class happens to be missing from one of them).
label_names = sorted(df['Character'].unique())
label_to_id = {name: idx for idx, name in enumerate(label_names)}

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input sequences (padded to the longest line in each split)
train_tokenized = tokenizer(list(train_df['Line']), padding=True, truncation=True, return_tensors='pt')
val_tokenized = tokenizer(list(val_df['Line']), padding=True, truncation=True, return_tensors='pt')

# Convert labels to tensors with the data type CrossEntropyLoss expects (Long)
train_labels = torch.tensor(train_df['Character'].map(label_to_id).to_numpy(), dtype=torch.long)
val_labels = torch.tensor(val_df['Character'].map(label_to_id).to_numpy(), dtype=torch.long)

# Create TensorDatasets of (input_ids, attention_mask, label)
train_dataset = TensorDataset(
    train_tokenized['input_ids'],
    train_tokenized['attention_mask'],
    train_labels
)

val_dataset = TensorDataset(
    val_tokenized['input_ids'],
    val_tokenized['attention_mask'],
    val_labels
)

# Set up training parameters
epochs = 3
batch_size = 25

# Create DataLoaders (only the training data is shuffled)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print("Tensor Datasets and DataLoader created")



In [None]:
# Load pre-trained BERT model for sequence classification.
# The size of the classification head is taken from the data rather than
# hard-coded, so it cannot drift out of sync with the number of characters
# that remain after the line-count filter.
num_labels = df['Character'].nunique()
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)


In [None]:
# Optimizer (Adam over all model parameters) and loss function (cross-entropy)
optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Progress output: confirmation message followed by the training-set size
print("optimizer and loss function", "train dataset", len(train_dataset), sep="\n")

In [None]:
# Training Loop
# Gradients from `accumulation_steps` consecutive mini-batches are summed before
# each optimizer update, giving an effective batch of
# batch_size * accumulation_steps without the extra memory.
accumulation_steps = 4
scheduler = StepLR(optimizer, step_size=1, gamma=0.9)  # decay the LR by 10% after every epoch

# Send each batch to whatever device the model already lives on.
device = next(model.parameters()).device
num_batches = len(train_dataloader)

for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_loss = 0.0
    optimizer.zero_grad()

    # Iterate the DataLoader (not raw tensor slices) so batches are reshuffled every epoch.
    for step, (input_ids, attention_mask, batch_labels) in enumerate(train_dataloader, start=1):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        batch_labels = batch_labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, batch_labels)

        # Scale so the accumulated gradient is the average over the group.
        # (A final, shorter group is scaled by the same factor.)
        (loss / accumulation_steps).backward()
        total_loss += loss.item()

        # Update once per accumulation group, and flush any remainder at the end of the epoch.
        if step % accumulation_steps == 0 or step == num_batches:
            optimizer.step()
            optimizer.zero_grad()

    # The scheduler must step AFTER the epoch's optimizer updates; stepping first
    # skips the initial learning rate.
    scheduler.step()

    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_loss}")

In [None]:
# Evaluate the fine-tuned model on the validation set (loss and accuracy)
model.eval()  # Set the model to evaluation mode (disables dropout)

# Send each batch to whatever device the model already lives on.
device = next(model.parameters()).device

val_loss = 0.0      # sum of per-sample losses
val_correct = 0     # number of correctly predicted speakers
val_seen = 0        # number of validation samples processed

with torch.no_grad():
    for input_ids, attention_mask, batch_labels in val_dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        batch_labels = batch_labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, batch_labels)

        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not skew the overall average.
        samples_in_batch = batch_labels.size(0)
        val_loss += loss.item() * samples_in_batch
        val_correct += (outputs.logits.argmax(dim=1) == batch_labels).sum().item()
        val_seen += samples_in_batch

avg_val_loss = val_loss / max(val_seen, 1)
val_accuracy = val_correct / max(val_seen, 1)
print(f"Validation Loss: {avg_val_loss}")
print(f"Validation Accuracy: {val_accuracy:.4f}")