In [None]:
import json
import os
import re
from functools import lru_cache
from statistics import mean

import matplotlib as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import drive
from sklearn import pipeline
from sklearn import set_config, metrics
from sklearn.compose import make_column_transformer
from sklearn.datasets import make_multilabel_classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Display scikit-learn estimators as diagrams when they are shown in the notebook.
set_config(display="diagram")
# Apply seaborn's default theme with a 15x8 default figure size,
# then switch the axes style to "ticks".
sns.set(rc = {'figure.figsize':(15,8)})
sns.set_style("ticks")

In [None]:
def drop_low_linecount(df, thresh=10):
    """Drop rows whose 'Linecount' is below ``thresh``.

    The frame is modified in place and the same object is returned, so
    existing callers that rely on either the mutation or the return value
    keep working.

    Args:
        df: DataFrame with a numeric 'Linecount' column. Rows are removed by
            index label, so labels are expected to be unique.
        thresh: Minimum 'Linecount' a row needs in order to be kept.

    Returns:
        ``df`` itself, with the low-count rows removed.
    """
    # Collect every offending label first and drop them in one call; dropping
    # row by row rebuilds the frame once per removed row.
    below_threshold = (df['Linecount'] < thresh).to_numpy()
    df.drop(df.index[below_threshold], axis=0, inplace=True)
    return df

def get_show_dataframe(all_series, show_initials):
    """Build the per-episode, per-character dataframe for one show.

    Args:
        all_series: Mapping loaded from the dialogue JSON, keyed by show.
        show_initials: Key of the show to extract from ``all_series``.
            Presumably each show maps episode -> {character: list of lines};
            verify against the JSON file.

    Returns:
        DataFrame with columns 'Episode', 'Character', 'Lines' and
        'Linecount' (the length of each 'Lines' entry), after rows with a
        low line count have been removed by ``drop_low_linecount``.
    """
    episodes = all_series[show_initials]
    # One Series per episode; concatenating the dict yields a two-level
    # index whose levels become the first two columns after reset_index().
    per_episode = {}
    for episode, characters in episodes.items():
        per_episode[episode] = pd.Series(characters)
    stacked = pd.concat(per_episode)
    show_df = pd.DataFrame(stacked).reset_index()
    show_df.columns = ['Episode', 'Character', 'Lines']
    show_df['Linecount'] = show_df['Lines'].str.len()
    return drop_low_linecount(show_df)

def get_character_dataframe(df, character_name):
    """Return one character's lines as a flat dataframe.

    Args:
        df: DataFrame containing rows for all characters, with 'Character'
            and 'Lines' columns.
        character_name: Value of 'Character' to select.

    Returns:
        The result of ``concat_character_dataframe`` applied to the rows
        whose 'Character' equals ``character_name``.
    """
    is_character = df["Character"] == character_name
    character_rows = df[is_character]
    return concat_character_dataframe(character_rows, character_name)

def concat_character_dataframe(df, character_name):
    """Flatten a single character's rows into one row per line.

    Args:
        df: DataFrame that only contains rows for one character. Each
            'Lines' entry must be an iterable of lines.
        character_name: Name written to the 'Character' column of the result.

    Returns:
        DataFrame with columns 'Character' and 'Lines', holding every line
        from every row of ``df`` in row order.
    """
    # Walk the column values directly rather than looking rows up by label:
    # `df.at[label, "Lines"]` returns a Series when index labels repeat,
    # which produced whole lists (and duplicates) in the output.
    all_lines = [line for line_list in df["Lines"] for line in line_list]
    return pd.DataFrame({
        "Character": character_name,
        "Lines": all_lines,
    })

def get_character_list(df):
    """Return the distinct values of the 'Character' column.

    Args:
        df: DataFrame containing rows for all characters.

    Returns:
        The unique 'Character' values, in order of first appearance.
    """
    characters = df["Character"]
    return characters.unique()

def get_all_characters_dataframe(df):
    """Flatten every character's lines into one dataframe.

    Args:
        df: DataFrame containing rows for all characters.

    Returns:
        The per-character frames from ``get_character_dataframe``,
        concatenated in the order given by ``get_character_list`` and
        re-indexed from zero.
    """
    per_character = []
    for name in get_character_list(df):
        per_character.append(get_character_dataframe(df, name))
    combined = pd.concat(per_character)
    return combined.reset_index(drop=True)

def remove_carriage_return(string):
    """Return ``string`` with every carriage-return character removed."""
    cleaned = re.sub(pattern='\\r', repl='', string=string)
    return cleaned

def remove_all_carriage_returns(df):
    """Strip carriage-return characters from every entry of 'Lines'.

    The 'Lines' column of ``df`` is replaced in place and the same frame is
    returned.

    Args:
        df: DataFrame with a 'Lines' column of strings.

    Returns:
        ``df`` itself, with carriage returns removed from 'Lines'.

    Raises:
        TypeError: If a 'Lines' entry is not a string.
    """
    carriage_return = re.compile('\\r')
    # Rewrite the column positionally in one pass. Per-cell `.at` lookups are
    # slow on large frames and return a Series (breaking re.sub) when index
    # labels repeat.
    df["Lines"] = [carriage_return.sub('', text) for text in df["Lines"]]
    return df

def get_top_n_characters(df, n):
    """Keep only the rows of the ``n`` most frequent characters.

    Args:
        df: DataFrame containing rows for all characters.
        n: Number of characters to keep, ranked by how many rows they have.

    Returns:
        DataFrame with the rows of the top ``n`` characters, grouped by
        character in descending order of row count; original index labels
        are kept. An empty frame with the same columns is returned when no
        character is selected.
    """
    top_characters = df["Character"].value_counts().index[:n]
    # pd.concat raises ValueError on an empty list (n == 0 or an empty frame),
    # so return an empty frame with the same columns instead.
    if len(top_characters) == 0:
        return df.iloc[0:0].copy()
    return pd.concat(
        [df[df["Character"] == character] for character in top_characters]
    )
        

In [None]:
# Mount Google Drive in the Colab runtime; the input JSON and the output CSVs
# below are read from and written to paths under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the dialogue corpus from Drive into `all_series`.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open('/content/drive/MyDrive/StarTrekDialogue.json', 'r', encoding='utf-8') as read_file:
    all_series = json.load(read_file)

In [None]:
# Keys of `all_series` to export, one per show.
show_list = [
    "TOS",
    "TNG",
    "DS9",
    "VOY",
    "ENT",
    "TAS",
]

In [None]:
# Export one CSV per show containing the lines of its most frequent characters.
OUTPUT_DIR = "/content/drive/MyDrive/star_trek_data"
TOP_N_CHARACTERS = 10

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# `df` and `show` are kept as the working names so that anything later in the
# notebook that reads them after this loop still sees the last show.
for show in show_list:
    df = get_show_dataframe(all_series, show)          # one row per episode/character
    df = get_all_characters_dataframe(df)              # one row per spoken line
    df = remove_all_carriage_returns(df)               # strip '\r' from each line
    df = get_top_n_characters(df, TOP_N_CHARACTERS)    # keep the most frequent characters
    df.to_csv(f"{OUTPUT_DIR}/{show}.csv", index=False)
