# Data Cleaning for Avatar: The Last Airbender Dataset
    by Adam Ward

In [25]:
import pandas as pd 
import re
import matplotlib.pyplot as plt

In [26]:
# Read the episode scripts CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer="ATLA-episodes-scripts.csv")

In [27]:
# cleaning function from the Kaggle notebook
def clean_tweet(tweet):
    """Normalize a piece of text to lowercase letters, digits and spaces.

    Args:
        tweet: The raw text. Any value that is not a ``str`` (for example a
            float NaN standing in for a missing cell, or ``None``) is treated
            as missing.

    Returns:
        The cleaned string, or ``""`` when *tweet* is not a string.
    """
    if not isinstance(tweet, str):
        return ""
    temp = tweet.lower()
    # Delete apostrophes outright (rather than turning them into spaces in the
    # final step) so that a contraction such as "don't" stays one token.
    temp = re.sub("'", "", temp)
    # Remove @mentions entirely.
    temp = re.sub("@[A-Za-z0-9_]+", "", temp)
    # Drop the hash marker but keep the tag text.
    temp = re.sub("#", "", temp)
    # Remove URLs (everything from "http" up to the next whitespace).
    temp = re.sub(r'http\S+', '', temp)
    temp = re.sub('[()!?]', ' ', temp)
    # Replace bracketed spans, brackets included, with a single space.
    temp = re.sub(r'\[.*?\]', ' ', temp)
    # Whatever is left that is not a lowercase letter or digit becomes a space.
    temp = re.sub("[^a-z0-9]", " ", temp)

    return temp

In [28]:
# Run every script entry through clean_tweet so the column is in the format
# the model expects.
data['script'] = data['script'].map(clean_tweet)

In [29]:
# helper function for finding specific character lines
def create_individual_mask(substring, full_string_column):
    """Build a boolean mask of the entries that contain *substring*.

    Args:
        substring: Text to look for; it is matched literally, not as a regex.
        full_string_column: Column of strings to test, one element at a time.

    Returns:
        The result of applying the check to every element of
        *full_string_column*: True where the element contains *substring*.
    """
    literal_pattern = re.escape(substring)

    def contains_substring(text):
        return re.search(literal_pattern, text) is not None

    return full_string_column.apply(contains_substring)

# helper function for creating a dataframe of only a certain list of characters
def create_full_mask(substrings, full_string_column):
    """Build a boolean mask of the entries that mention any wanted substring.

    An entry containing the whole word "Actor" or "Actress" is always
    excluded, even when it also contains one of the substrings.

    Args:
        substrings: Iterable of texts to look for; each is matched literally.
            Empty strings are ignored, and if nothing is left to match the
            mask is False everywhere.
        full_string_column: Column of strings to test, one element at a time.

    Returns:
        The result of applying the check to every element of
        *full_string_column*.
    """
    # Skip empty strings: an empty alternative in the regex would match every
    # entry and silently select the whole column.
    escaped = [re.escape(substring) for substring in substrings if substring]
    if not escaped:
        return full_string_column.apply(lambda _: False)

    # Build both patterns once instead of once per element.
    wanted = re.compile('|'.join(escaped))
    excluded = re.compile(r'\b(Actor|Actress)\b')

    def is_wanted(text):
        if excluded.search(text):
            return False
        return wanted.search(text) is not None

    return full_string_column.apply(is_wanted)

# Discard every row that has a missing value (per the original note, these are
# the description rows), then remove the Book and episode-number columns.
data.dropna(axis=0, how="any", inplace=True)
data.drop(["Book", "ep_number"], axis=1, inplace=True)

# Give the script column the name used by the Kaggle dataset.
data.rename({"script": "Text"}, axis="columns", inplace=True)

In [None]:
# create a dataset with only the main characters
main_chars = ["Sokka", "Katara", "Zuko", "Iroh", "Aang", "Toph", "Azula"]
# Take an explicit copy: boolean indexing returns a slice of `data`, and
# adding a column to that slice triggers pandas' SettingWithCopyWarning.
sub_data = data[create_full_mask(main_chars, data["Character"])].copy()

# create the Label column and fill it with temporary values
sub_data["Label"] = "positive"

# export dataset to CSV for analysis on Kaggle notebook
sub_data.to_csv("atla_script.csv")