# Data Cleaning for Avatar: The Last Airbender Dataset
    by Adam Ward

In [2]:
import numpy as np 
import pandas as pd 
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords 
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# from transformers import BertTokenizer
# from transformers import BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [36]:
# Read the episode-script CSV into `data` and preview its last five rows.
data = pd.read_csv(filepath_or_buffer="ATLA-episodes-scripts.csv")
data.tail(n=5)

Unnamed: 0,Character,script,ep_number,Book,total_number
13364,Suki,And why did you paint me firebending?,21,3,61
13365,Sokka,I thought it looked more exciting that way. [M...,21,3,61
13366,Iroh,"[Points at painting.] Hey, my belly's not that...",21,3,61
13367,Toph,Well I think you all look perfect! [They laugh.],21,3,61
13368,,"Aang walks past Appa, petting him briefly, bef...",21,3,61


In [37]:
def clean_tweet(tweet):
    """Normalise one piece of raw text for downstream tokenisation.

    The text is lower-cased; apostrophes are deleted (so a contraction such
    as "don't" stays one token, "dont"); @mentions, '#' signs and http(s)
    URLs are removed; bracketed spans such as "[stage direction]" are
    blanked out; finally every character outside a-z / 0-9 becomes a space.
    Runs of spaces are NOT collapsed.

    Parameters
    ----------
    tweet : str or any
        Raw text. Any value that is not a ``str`` (for example the float NaN
        pandas uses for a missing cell, or ``None``) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or "" when the input was not a string.
    """
    # Missing cells arrive as float NaN; None or other non-strings would
    # crash on .lower(), so every non-string is mapped to the empty string.
    if not isinstance(tweet, str):
        return ""
    temp = tweet.lower()
    temp = re.sub(r"'", "", temp)  # to avoid removing contractions in english
    temp = re.sub(r"@[A-Za-z0-9_]+", "", temp)  # @mentions
    temp = re.sub(r"#", "", temp)  # drop the '#' but keep the tag word
    temp = re.sub(r"http\S+", "", temp)  # URLs
    temp = re.sub(r"[()!?]", " ", temp)
    # Raw string is required here: '\[' in a plain literal is an invalid
    # escape sequence. Non-greedy so each bracketed span is removed on its own.
    temp = re.sub(r"\[.*?\]", " ", temp)
    temp = re.sub(r"[^a-z0-9]", " ", temp)

    return temp

# Overwrite the raw script column with its cleaned form (the model's input format)
data['script'] = data['script'].map(clean_tweet)

In [38]:
# Helper for locating the lines of one specific character
def create_individual_mask(substring, full_string_column):
    """Flag the entries of a string column that contain ``substring``.

    ``substring`` is matched literally (regex metacharacters are escaped) and
    may occur anywhere inside an entry.

    Parameters
    ----------
    substring : str
        Text to look for.
    full_string_column : pandas.Series
        Column of strings to test, element by element.

    Returns
    -------
    pandas.Series
        Result of ``apply``: True where the entry contains ``substring``.
    """
    def contains_substring(cell):
        # re.search yields a match object or None; report it as a plain bool
        return re.search(re.escape(substring), cell) is not None

    return full_string_column.apply(contains_substring)

# helper function for creating a dataframe of only a certain list of characters
def create_full_mask(substrings, full_string_column):
    """Flag entries that contain any of ``substrings`` and are not actor credits.

    Each substring is matched literally (regex metacharacters are escaped)
    and may occur anywhere inside an entry, so "Zuko" also selects
    "Young Zuko". Entries containing the whole word "Actor" or "Actress"
    are always rejected.

    Parameters
    ----------
    substrings : iterable of str
        Names to look for. An empty iterable selects nothing.
    full_string_column : pandas.Series
        Column of strings to test, element by element.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``full_string_column``.
    """
    escaped = [re.escape(substring) for substring in substrings]
    if not escaped:
        # Joining nothing gives the pattern '', which matches every string;
        # an empty selection must select no rows instead.
        return pd.Series(False, index=full_string_column.index, dtype=bool)

    # Compile once instead of re-resolving the patterns for every element
    name_pattern = re.compile('|'.join(escaped))
    credit_pattern = re.compile(r'\b(Actor|Actress)\b')

    def is_selected(cell):
        if credit_pattern.search(cell):
            return False
        return name_pattern.search(cell) is not None

    # Use apply to check each element in the column
    return full_string_column.apply(is_selected)

# Build the trimmed frame in one chain instead of three in-place mutations:
#  - drop rows with any NaN (the description rows have no Character),
#  - discard the "Book" and "ep_number" columns,
#  - rename the script column to match the other dataset.
data = (
    data
    .dropna()
    .drop(columns=["Book", "ep_number"])
    .rename(columns={"script": "Text"})
)
print(data.columns)

Index(['Character', 'Text', 'total_number'], dtype='object')


In [39]:
# create the script with only main characters
main_chars = ["Sokka", "Katara", "Zuko", "Iroh", "Aang", "Toph", "Azula"]
# List every distinct Character value the substring mask selects; a single
# .loc lookup replaces the chained data[mask]["Character"] indexing.
print(data.loc[create_full_mask(main_chars, data["Character"]), "Character"].unique())

# create the Label column (entirely missing for now). The explicit dtype
# avoids the version-dependent default dtype of a bare pd.Series(), which
# also emits a warning on older pandas releases.
data["Label"] = pd.Series(dtype="object")
data.columns

['Sokka' 'Katara' 'Zuko' 'Iroh' 'Aang' 'Aang and Sokka' 'Aang:'
 'Gyatso and Katara' 'Young Zuko' 'Azula' 'Toph' 'Young Azula'
 'Katara (flashback)' 'Aang and Zuko' 'Young Katara' 'Toph and Sokka'
 'Katara and Sokka']


Index(['Character', 'Text', 'total_number', 'Label'], dtype='object')

In [None]:
# NOTE(review): the `from transformers import BertTokenizer` line in the
# import cell is commented out, so this cell presumably fails on a fresh
# kernel until that import is restored. Confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Class name -> integer id, numbered in listing order.
# NOTE(review): these look like sentiment classes rather than character
# names. Confirm they are the intended labels for this dataset.
labels = {
    name: index
    for index, name in enumerate(('positive', 'negative', 'uncertainty', 'litigious'))
}

In [17]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenised text with an integer class id.

    Construction reads the ``'Label'`` and ``'Text'`` columns of ``df`` and
    relies on the module-level ``labels`` mapping and ``tokenizer`` callable.
    Every text is tokenised up front, padded/truncated to 512 tokens, with
    PyTorch tensors requested from the tokenizer.
    """

    def __init__(self, df):
        # Labels are resolved first, so an unknown label fails before any
        # tokenisation work is done.
        self.labels = [labels[name] for name in df['Label']]

        self.texts = []
        for text in df['Text']:
            encoded = tokenizer(
                text,
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            self.texts.append(encoded)

    def classes(self):
        """Return the full list of integer class ids."""
        return self.labels

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenised input(s) stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenised_text, label)`` pair for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

## Predict

In [None]:
def predict(model, new_data, model_path, batch_size=2, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Load saved weights into ``model`` and predict a class id for each row.

    Parameters
    ----------
    model
        Network exposing ``load_state_dict``/``to``/``eval`` and callable as
        ``model(input_ids, attention_mask)``. Its output is arg-maxed along
        dim 1 (assumes shape (batch, num_classes); TODO confirm).
    new_data
        Passed straight to ``Dataset``, whose constructor reads both the
        ``'Text'`` and the ``'Label'`` column, so both must be present and
        every label must be a key of the module-level ``labels`` mapping.
    model_path
        Location of the saved state dict handed to ``torch.load``.
    batch_size : int, default 2
        Number of samples per forward pass.
    device : str
        Target device. The default is computed once, when the function is
        defined: 'cuda' if available, otherwise 'cpu'.

    Returns
    -------
    list
        Predicted class indices (NumPy integer scalars), in input order.
    """

    # Load the pretrained model's weights.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()  # Set model to evaluation mode

    # Prepare the new dataset; shuffle=False keeps predictions in row order
    new_dataset = Dataset(new_data)
    new_dataloader = DataLoader(new_dataset, batch_size=batch_size, shuffle=False)

    predictions = []

    with torch.no_grad():
        # Dataset.__getitem__ returns a (tokenised_text, label) pair, so every
        # batch is a pair as well. Unpack it: indexing the pair itself with
        # 'attention_mask' raises TypeError. Labels are not needed here.
        for data_input, _ in tqdm(new_dataloader):
            mask = data_input['attention_mask'].to(device)
            # Drop the singleton dim added by return_tensors="pt" per sample
            input_id = data_input['input_ids'].squeeze(1).to(device)

            # Forward pass to get predictions
            output = model(input_id, mask)

            # Get the predicted labels (assuming it's a classification model)
            predicted_labels = output.argmax(dim=1)

            # Store the predictions
            predictions.extend(predicted_labels.cpu().numpy())

    return predictions
