In [1]:
import os

# Change the working directory
os.chdir("/Users/sbrya")

cwd = os.getcwd()
print(cwd)

c:\Users\sbrya


# Exploring Episodes in *One Piece*

## Importing the Data

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import plotly.graph_objects as go
import plotly.express as px
from statsmodels.nonparametric.smoothers_lowess import lowess  # For LOESS

# Download latest version
path = "Downloads/ONE PIECE.csv"

df = pd.read_csv(path)

print(df.head(10).to_markdown())

|    |   Unnamed: 0 | rank   |   trend |   season |   episode | name                                                     |   start |   total_votes |   average_rating |
|---:|-------------:|:-------|--------:|---------:|----------:|:---------------------------------------------------------|--------:|--------------:|-----------------:|
|  0 |            0 | 24,129 |      18 |        1 |         1 | I'm Luffy! The Man Who Will Become the Pirate King!      |    1999 |           647 |              7.6 |
|  1 |            1 | 29,290 |      11 |        1 |         2 | The Great Swordsman Appears! Pirate Hunter, Roronoa Zoro |    1999 |           473 |              7.8 |
|  2 |            2 | 32,043 |       7 |        1 |         3 | Morgan vs. Luffy! Who's This Beautiful Young Girl?       |    1999 |           428 |              7.7 |
|  3 |            3 | 28,818 |       8 |        1 |         4 | Luffy's Past! The Red-haired Shanks Appears!             |    1999 |           449 |            

We can first see that, the index has a duplicated "unnamed" id column. Also, while the first six episodes are in English, all following episodes use their orginal Japanese titles. We want to fix this descrepancy.

In [3]:
url = "https://listfist.com/list-of-one-piece-anime-episodes"

r = requests.get(url)

print(r)

<Response [200]>


In [4]:
http = r.text

soup = BeautifulSoup(http)

pretty_soup = soup.prettify()
title = soup.title

print(title)
td_tags = soup.find_all("td", ["col-1 odd", "col-2 even", "col-3 odd"])

i = 0

episodes = []
titles = []
release_dates = []

for td in td_tags:
    if i%3 == 0:
        episodes.append(td.text)
    elif i%3 == 1:
        titles.append(td.text)
    else:
        release_dates.append(td.text)
    i += 1

for index in range(len(episodes)):
    print(f"Episode {episodes[index]}:  '{titles[index]}'  |  {release_dates[index]}")
    if index == 10:
        break

print(f"\nLength: {len(episodes)}")

<title>List of One Piece Anime Episodes - ListFist.com</title>
Episode 1:  'I'm Luffy! The Man Who Will Become the Pirate King!'  |  October 20, 1999
Episode 2:  'Enter The Great Swordsman! Pirate Hunter Roronoa Zoro!'  |  November 17, 1999
Episode 3:  'Morgan vs. Luffy! Who's This Mysterious Beautiful Young Girl?'  |  November 24, 1999
Episode 4:  'Luffy's Past! The Red-Haired Shanks Appears!'  |  December 8, 1999
Episode 5:  'Fear, Mysterious Power! Pirate Clown Captain Buggy!'  |  December 15, 1999
Episode 6:  'Desperate Situation! Beast Tamer Mohji vs. Luffy!'  |  December 29, 1999
Episode 7:  'Grand Duel! Zoro the Swordsman vs. Cabaji the Acrobat!'  |  December 29, 1999
Episode 8:  'Who Will Win? Showdown Between the True Powers of the Devil Fruit!'  |  December 29, 1999
Episode 9:  'Honorable Liar? Captain Usopp'  |  January 12, 2000
Episode 10:  'The World's Strongest Weirdo! Jango the Hypnotist!'  |  January 19, 2000
Episode 11:  'Revealing the Conspiracy! The Pirate Caretaker,

In [5]:
df.rename(columns={df.columns[0]: "id"}, inplace=True)

# Creating a temporary DataFrame from the scraped lists
episodes_df = pd.DataFrame({
    'episode': pd.to_numeric(episodes),
    'title': titles,
    'release_date': release_dates
})

# Integrating the scraped data using a merge
df = pd.merge(df, episodes_df, left_on='episode', right_on='episode', how='left')

In [6]:
# Overwriting old episode names with the new ones
df['name'] = df['title']
df.drop('title', axis=1, inplace=True)

# Trim any/all whitespace from episode titles
df['name'] = df['name'].str.strip()

# Clean and convert 'total_votes' column to numeric
df['total_votes'] = df['total_votes'].str.replace(',', '', regex=False).astype(float)

# Date format at position 863 contains a period
df['release_date'] = df['release_date'].str.replace('.', ',')

# Parse datetimes from strings
df['release_date'] = pd.to_datetime(df['release_date'], format='%B %d, %Y')



print(df.head(10).to_markdown(), "\n")
print(df.info(), "\n")
print(df.describe(include='all').to_markdown())

|    |   id | rank   |   trend |   season |   episode | name                                                               |   start |   total_votes |   average_rating | release_date        |
|---:|-----:|:-------|--------:|---------:|----------:|:-------------------------------------------------------------------|--------:|--------------:|-----------------:|:--------------------|
|  0 |    0 | 24,129 |      18 |        1 |         1 | I'm Luffy! The Man Who Will Become the Pirate King!                |    1999 |           647 |              7.6 | 1999-10-20 00:00:00 |
|  1 |    1 | 29,290 |      11 |        1 |         2 | Enter The Great Swordsman! Pirate Hunter Roronoa Zoro!             |    1999 |           473 |              7.8 | 1999-11-17 00:00:00 |
|  2 |    2 | 32,043 |       7 |        1 |         3 | Morgan vs. Luffy! Who's This Mysterious Beautiful Young Girl?      |    1999 |           428 |              7.7 | 1999-11-24 00:00:00 |
|  3 |    3 | 28,818 |       8 |        

### Import data on story arcs 

In [7]:
path = "Downloads/OnePieceArcs.csv"

arcs = pd.read_csv(path)

In [8]:
# Creating a tag for canon and non-canon arcs, based on whether that arc exists in the manga
arcs['Function'] = np.where(arcs['Start onChapter'] == 0, 'filler', 'canon')

##### The "Saga" column
As the series has been going on for over 25 years, breaking down all the episodes into its story arcs doesn't work as well as most other shorter-running series. It's noticeable that the number of of arcs builds up over time. However, there are still large narrative chunks of the story that strongly pertain to one another in large groupings. "East "Blue" acts as the effective introduction and prologue to the greater series. "Ennies Lobby" is often seen as the second half to "Water Seven"; a similar relationship exists between "Impel Down" and "Marineford", among other story arcs.

Toei Animation (or whoever sells the hard copies of the series) also takes note of this, and has also grouped and marketed cohesive sections of the series into larger collections. There are no explicit "seasons" with which to effectively group episodes either, as the show has been undergoing weekly development and release schedules year-round for 25 years up until recently (the animation recieved a three month break between 2024 and 2025). Instead, multiple story arcs and there various sub-sections have been grouped and marketed as larger boxsets, affectionately named "Eternal Logs" in reference to the show. We'll use these collections as our grouping criterias for tying together related story arcs, with some modification.

Some collections ("Dressrosa", "Whole Cake Island", and "Wano") are large enough that their Eternal Log sets have been further broken into subparts. We're going to keep these subparts grouped together. I'm also going to alias these larger groupings of story arcs as "sagas" in the tradition of the first collection, the "East Blue Sage" (it's also a much tidier name for data manipulation and graphs). So, to recap:

*one saga = many arcs*

We want to categorize arcs according the the following groupings:
- East Blue: 0-7
- Alabasta: 8-14
- Skypeia: 15-22
- Water Seven: 23-25
- Thriller Bark: 26-31
- Marineford: 32-35
- Fishman Island: 36-38
- Punk Hazard: 39-40
- Dressrosa: 41-42
- Whole Cake Island: 43-46
- Wano: 47-50

In [9]:
# Define the arc ranges and corresponding saga names
saga_mapping = {
    'East Blue': range(0, 8),
    'Alabasta': range(8, 15),
    'Skypeia': range(15, 23),
    'Water Seven': range(23, 26),
    'Thriller Bark': range(26, 32),
    'Marineford': range(32, 36),
    'Fishman Island': range(36, 39),
    'Punk Hazard': range(39, 41),
    'Dressrosa': range(41, 43),
    'Whole Cake Island': range(43, 47),
    'Wano': range(47, 51)
}

# Function to apply saga based on index
def get_saga(index):
    for saga, index_range in saga_mapping.items():
        if index in index_range:
            return saga

# Assuming 'arcs' DataFrame already exists, apply the 'Saga' column
arcs['Saga'] = arcs.index.map(get_saga)

In [10]:
# Drop the specified columns from the dataframe
arcs = arcs.drop(columns=["Start onChapter", "TotalChapters", "TotalPages", "Manga%", "TotalMinutes(avg 24)"])

In [11]:
# Set the 'Arc' and 'Saga' columns to an ordinal Categorical datatypes
arcs['Arc'] = pd.Categorical(arcs['Arc'], categories=arcs['Arc'].unique(), ordered=True)
arcs['Saga'] = pd.Categorical(arcs['Saga'], categories=arcs['Saga'].unique(), ordered=True)

print(arcs.to_markdown())

|    | Arc                     |   Start onEpisode |   TotalEpisodes | Anime%   | Function   | Saga              |
|---:|:------------------------|------------------:|----------------:|:---------|:-----------|:------------------|
|  0 | Romance Dawn Arc        |                 1 |               3 | 0.3%     | canon      | East Blue         |
|  1 | Orange Town Arc         |                 4 |               5 | 0.5%     | canon      | East Blue         |
|  2 | Syrup Village Arc       |                 9 |              10 | 1.0%     | canon      | East Blue         |
|  3 | Baratie Arc             |                19 |              12 | 1.2%     | canon      | East Blue         |
|  4 | Arlong Park Arc         |                31 |              15 | 1.5%     | canon      | East Blue         |
|  5 | Buggy Side Story Arc    |                46 |               2 | 0.2%     | filler     | East Blue         |
|  6 | Loguetown Arc           |                48 |               6 | 0.6%     

In [12]:
# Generate episode ranges dynamically based on 'Start onEpisode'
episode_ranges = []
for i in range(len(arcs) - 1):
    start_episode = arcs.loc[i, 'Start onEpisode']
    next_start_episode = arcs.loc[i + 1, 'Start onEpisode']
    episode_ranges.append((start_episode, next_start_episode - 1, i))  # (start, end, arc_id)

# Include the last arc, which ends at the last episode
episode_ranges.append((arcs.loc[len(arcs) - 1, 'Start onEpisode'], float('inf'), len(arcs) - 1))

# Create a function to map episodes to arcs
def get_arc(episode):
    for start, end, arc in episode_ranges:
        if start <= episode <= end:
            return arc
    return None  # Return None if episode is out of defined ranges

# Apply the function to the df['episode'] column to create a new 'arc' id column
df['arc'] = df['episode'].apply(get_arc)

In [13]:

# Merging the dataframes based on the 'arc' id and 'arcs' index
df = pd.merge(df, arcs, left_on='arc', right_index=True, how='left')

# Dropping unnecessary columns
df = df.drop(columns=['id', 'rank', 'trend', 'season', 'start', 'arc', 'Start onEpisode', 'TotalEpisodes','Anime%'])


# Sneak-peek the resulting dataframe
print(df.head().to_markdown())


|    |   episode | name                                                          |   total_votes |   average_rating | release_date        | Arc              | Function   | Saga      |
|---:|----------:|:--------------------------------------------------------------|--------------:|-----------------:|:--------------------|:-----------------|:-----------|:----------|
|  0 |         1 | I'm Luffy! The Man Who Will Become the Pirate King!           |           647 |              7.6 | 1999-10-20 00:00:00 | Romance Dawn Arc | canon      | East Blue |
|  1 |         2 | Enter The Great Swordsman! Pirate Hunter Roronoa Zoro!        |           473 |              7.8 | 1999-11-17 00:00:00 | Romance Dawn Arc | canon      | East Blue |
|  2 |         3 | Morgan vs. Luffy! Who's This Mysterious Beautiful Young Girl? |           428 |              7.7 | 1999-11-24 00:00:00 | Romance Dawn Arc | canon      | East Blue |
|  3 |         4 | Luffy's Past! The Red-Haired Shanks Appears!                 

In [14]:
# Update the 'Function' column based on the episode numbers
episode_updates = {
    61: 'canon',
    (227, 228): 'canon',
    (279, 283): 'filler',
    (291, 292): 'filler',
    303: 'filler',
    (430, 456): 'canon',
    (457, 458): 'filler',
    492: 'filler',
    542: 'filler',
    590: 'filler',
    (897, 906): 'canon',
    (908, 917): 'canon'
}

# Applying the updates to the 'Function' column
for key, value in episode_updates.items():
    if isinstance(key, tuple):
        # Update a range of episodes
        df.loc[df['episode'].between(key[0], key[1]), 'Function'] = value
    else:
        # Update a single episode
        df.loc[df['episode'] == key, 'Function'] = value


# Update mislabeled episodes in Wano, part 1"
df.loc[896:916, 'Arc'] = "Wano Country Arc: Act 1"
# Update mislabeled episodes in Impel Down Arc"
df.loc[429:455, 'Arc'] = "Impel Down Arc"

# Improperly-labeled 'total_votes' outlier for episode #957
df.total_votes.iloc[956] = 280
print(df.iloc[956])

episode                                              957
name              Big News! The Warlords Attack Incident
total_votes                                        280.0
average_rating                                       9.1
release_date                         2021-01-10 00:00:00
Arc                              Wano Country Arc: Act 2
Function                                           canon
Saga                                                Wano
Name: 956, dtype: object


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df.total_votes.iloc[956] = 280
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.total_votes.iloc[956] = 280


## Visualizations

### Canon episodes vs. Filler episodes.

In [15]:
# Define a color mapping to discern canon and filler episodes
function_color_map = {
    'canon': px.colors.qualitative.Prism[1],  # (blue)
    'filler': px.colors.qualitative.Prism[6]  # (orange)
}

#### Average rating of episodes over time

In [16]:
# Defining the 'line_color' to reference px.color calls in `function_color_map`
df['line_color'] = np.where(df['Function'] == 'canon', 1, 6)


# Create the figure
fig = go.Figure()


# Creating the LOESS trend line to-be-plotted
loess_result = lowess(df['average_rating'], df['release_date'], frac=0.1)
loess_y = loess_result[:, 1]

# Plotting LOESS trend line to the figure
fig.add_trace(go.Scatter(
    x=df['release_date'], 
    y=loess_y, 
    mode='lines', 
    name='LOESS Trend Line',
    line={"color":'grey', "width":3},
    hoverinfo='skip',
    showlegend=False
))


# Add the line plot with alternating colors based on 'line_color'
for tn in range(len(df) - 1):  # Adjust range to avoid out of bounds
    fig.add_trace(go.Scatter(
        x=df['release_date'][tn:tn + 2].values, 
        y=df['average_rating'][tn:tn + 2].values, 
        mode='lines',
        line=dict(color=px.colors.qualitative.Prism[df['line_color'][tn]]),
        showlegend=False,
        hovertemplate=(
            'Name: ' + df['name'].iloc[tn] + '<br>'  # Hover info for 'name'
            'Episode: ' + str(df['episode'].iloc[tn]) + '<br>'  # Hover info for 'episode'
            'Arc: ' + str(df['Arc'].iloc[tn]) + '<br>'  # Hover info for 'arc'
            'Saga: ' + str(df['Saga'].iloc[tn]) + '<br>'  # Hover info for 'arc'
            'Function: ' + df['Function'].iloc[tn] + '<br>'  # Hover info for 'Function'
            'Release Date: %{x}<br>'  # Show the average rating for the point
            'Average Rating: %{y}<br>'  # Show the average rating for the point
            '<extra></extra>'  # Remove the extra trace information
        )
    ))

# Add a custom annotation at episode 808
fig.add_annotation(
    x=df[df['episode'] == 808]['release_date'].values[0].astype(str),  # Position of the annotation on the x-axis (episode number)
    y=df[df['episode'] == 808]['average_rating'].values[0],  # Get corresponding y-value (total_votes) at episode 808
    text=f"Top-rated episode:\n\n'{df[df['episode'] == 808]['name'].values[0]}'"  # The text of the annotation
)


# Update the layout to add title and labels
fig.update_layout(
    title='Average Rating of One Piece Episodes Over Time',
    xaxis_title='Release Date',
    yaxis_title='Average Rating',
    hovermode='closest',
    yaxis=dict(range=[3.5, 10]),
    xaxis=dict(
        rangeslider=dict(  # Range slider settings
            visible=True,
            thickness=0.05,
            bordercolor='grey',
            borderwidth=1,
            bgcolor='rgba(0, 0, 0, 0.1)'
        )
    )
)

# Show the figure
fig.show()


In [17]:
df[df['episode'] == 808]['release_date'].values[0].astype(str)


'2017-10-01T00:00:00.000000000'

In [18]:
canon_eps = df[df.Function=='canon'].sort_values(by='episode')

# Identify where there is a gap greater than 1 between consecutive episodes
canon_eps['group'] = (canon_eps['episode'].diff() > 1).cumsum()
canon_eps['group'] = 'C' + canon_eps['group'].astype(str)

# Get the highest and lowest episode numbers for each group
canon_group = canon_eps.groupby(['group'])['episode'].agg(
    min_ind=lambda x: x.min() - 1,
    max_ind=lambda x: x.max()
).reset_index()

canon_group['color'] = 1

# Merge the min and max values back to the original dataframe
canon_eps = pd.merge(canon_eps, canon_group, on='group', how='left')

# Now you can print the name and episode along with the corresponding group
print(canon_group)

   group  min_ind  max_ind  color
0     C0        0       45      1
1     C1       47       53      1
2    C10      336      381      1
3    C11      384      405      1
4    C12      407      425      1
5    C13      429      456      1
6    C14      458      491      1
7    C15      492      541      1
8    C16      542      574      1
9    C17      578      589      1
10   C18      590      625      1
11   C19      628      746      1
12    C2       60       67      1
13   C20      750      779      1
14   C21      782      894      1
15   C22      896      906      1
16   C23      907      958      1
17    C3       69      130      1
18    C4      143      195      1
19    C5      206      219      1
20    C6      226      278      1
21    C7      283      290      1
22    C8      292      302      1
23    C9      303      325      1


In [19]:
filler_eps = df[df.Function=='filler'].sort_values(by='episode')

# Identify where there is a gap greater than 1 between consecutive episodes
filler_eps['group'] = (filler_eps['episode'].diff() > 1).cumsum()
filler_eps['group'] = 'F' + filler_eps['group'].astype(str)


# Get the highest and lowest episode numbers for each group
filler_group = filler_eps.groupby('group')['episode'].agg(
    min_ind=lambda x: x.min() - 1,
    max_ind=lambda x: x.max()
).reset_index()

filler_group['color'] = 6

# Merge the min and max values back to the original dataframe
filler_eps = pd.merge(filler_eps, filler_group, on='group', how='left')

# Now you can print the name and episode along with the corresponding group
print(filler_group)
# print(filler_eps[['name', 'episode', 'group', 'min', 'max']].head(50))

   group  min_ind  max_ind  color
0     F0       45       47      6
1     F1       53       60      6
2    F10      381      384      6
3    F11      405      407      6
4    F12      425      429      6
5    F13      456      458      6
6    F14      491      492      6
7    F15      541      542      6
8    F16      574      578      6
9    F17      589      590      6
10   F18      625      628      6
11   F19      746      750      6
12    F2       67       69      6
13   F20      779      782      6
14   F21      894      896      6
15   F22      906      907      6
16    F3      130      143      6
17    F4      195      206      6
18    F5      219      226      6
19    F6      278      283      6
20    F7      290      292      6
21    F8      302      303      6
22    F9      325      336      6


In [20]:
# Assuming canon_group and filler_group are already defined
segments = pd.concat([canon_group, filler_group], ignore_index=True)

# Drop 'c_group' and 'f_group' columns from the unioned DataFrame
segments = segments.sort_values(by='min_ind').reset_index(drop=True)

# Display the resulting DataFrame
print(segments)

   group  min_ind  max_ind  color
0     C0        0       45      1
1     F0       45       47      6
2     C1       47       53      1
3     F1       53       60      6
4     C2       60       67      1
5     F2       67       69      6
6     C3       69      130      1
7     F3      130      143      6
8     C4      143      195      1
9     F4      195      206      6
10    C5      206      219      1
11    F5      219      226      6
12    C6      226      278      1
13    F6      278      283      6
14    C7      283      290      1
15    F7      290      292      6
16    C8      292      302      1
17    F8      302      303      6
18    C9      303      325      1
19    F9      325      336      6
20   C10      336      381      1
21   F10      381      384      6
22   C11      384      405      1
23   F11      405      407      6
24   C12      407      425      1
25   F12      425      429      6
26   C13      429      456      1
27   F13      456      458      6
28   C14      

In [21]:
# Create an empty figure
fig = go.Figure()

# Compute the LOESS trend line (smoothed)
loess_result = lowess(df['average_rating'], df['release_date'], frac=0.1)  # 'frac' controls the smoothness

# Extract smoothed values
loess_y = loess_result[:, 1]

# Add the LOESS trend line (previously the rolling average line) with a custom color
fig.add_trace(go.Scatter(x=df['release_date'], y=loess_y, mode='lines', line={"color":'grey'}, hoverinfo='skip'))
# fig.add_trace(go.Scatter(x=df['release_date'], y=loess_y, mode='lines', line={"color":'white', "width":2}, hoverinfo='skip'))


# Loop through each row in the segments DataFrame to plot each segment
for _, row in segments.iterrows():
    # Extract segment information
    min_ind = row['min_ind']
    max_ind = row['max_ind']
    color = px.colors.qualitative.Prism[row['color']]  # Use color index from `px.colors.qualitative.Prism`
    
    # Select the data for the current segment
    segment_data = df.iloc[min_ind:max_ind+1]
    
    # Add the segment as a trace to the figure
    fig.add_scatter(
        x=segment_data['release_date'], 
        y=segment_data['average_rating'], 
        mode='lines',
        line=dict(color=color),  # Set the color based on the 'color' column
        name="",  # Name the trace based on the group identifier
        hovertemplate=(
            '<b>Episode %{customdata[1]} (%{customdata[4]}):'
            '<br>"%{customdata[0]}"</b><br><br>'
            '   - <i>Release Date</i>: %{x}<br>'
            '   - <i>Average Rating</i>: %{y}<br>'
            '   - <i>Saga</i>: %{customdata[3]}<br>'
            '         - <i>Arc</i>: %{customdata[2]}<br> '
        ),
        customdata=segment_data[['name', 'episode', 'Arc', 'Saga', 'Function']].values
    )

# Update the layout with the title and labels
fig.update_layout(
    title='Average Rating of One Piece Episodes Over Time',
    xaxis_title='Release Date',
    yaxis_title='Average Rating',
    showlegend=False,
    yaxis=dict(range=[3.5, 10]),  # Set y-axis range
    xaxis=dict(rangeslider=dict(thickness=0.05)) # slider settings
)

# Show the plot
fig.show()

#### Amount of votes by episode

In [22]:
# Create the figure
fig = go.Figure()

# Add the line plot with alternating colors based on 'line_color' for 'total_votes'
for tn in range(len(df) - 1):  # Adjust range to avoid out of bounds
    fig.add_trace(go.Scatter(
        x=df['episode'][tn:tn + 2].values,  # Now using 'episode' for x-values
        y=df['total_votes'][tn:tn + 2].values,  # Now using 'total_votes' for y-values
        mode='lines',
        line=dict(color=px.colors.qualitative.Prism[df['line_color'][tn]]),
        showlegend=False,
        hovertemplate=(
            'Name: ' + df['name'].iloc[tn] + '<br>'  # Hover info for 'name'
            'Episode: ' + str(df['episode'].iloc[tn]) + '<br>'  # Hover info for 'episode'
            'Arc: ' + str(df['Arc'].iloc[tn]) + '<br>'  # Hover info for 'arc'
            'Saga: ' + str(df['Saga'].iloc[tn]) + '<br>'  # Hover info for 'arc'
            'Function: ' + df['Function'].iloc[tn] + '<br>'  # Hover info for 'Function'
            'Episode Number: %{x}<br>'  # Show the episode number for the point
            'Total Votes: %{y}<br>'  # Show the total votes for the point
            '<extra></extra>'  # Remove the extra trace information
        )
    ))

# Update the layout to add title and labels, with y-axis range from 0 to 800
fig.update_layout(
    title='Total Votes of One Piece Episodes by Episode Number',
    xaxis_title='Episode Number',  # Label for x-axis
    yaxis_title='Total Votes',  # Label for y-axis
    hovermode='closest'  # Ensure hover shows data for the closest point
)

# Show the figure
fig.show()


In [23]:
import plotly.graph_objects as go
import plotly.express as px

# Assuming df is your DataFrame and has columns 'episode', 'total_votes', 'line_color', etc.

# Create the figure
fig = go.Figure()

# Add the line plot with alternating colors based on 'line_color' for 'total_votes'
for tn in range(len(df) - 1):  # Adjust range to avoid out of bounds
    fig.add_trace(go.Scatter(
        x=df['episode'][tn:tn + 2].values,  # Now using 'episode' for x-values
        y=df['total_votes'][tn:tn + 2].values,  # Now using 'total_votes' for y-values
        mode='lines',
        line=dict(color=px.colors.qualitative.Prism[df['line_color'][tn]]),
        showlegend=False,
        hovertemplate=(  # Hover info for each trace
            'Name: ' + df['name'].iloc[tn] + '<br>'  # Hover info for 'name'
            'Episode: ' + str(df['episode'].iloc[tn]) + '<br>'  # Hover info for 'episode'
            'Arc: ' + str(df['Arc'].iloc[tn]) + '<br>'  # Hover info for 'arc'
            'Saga: ' + str(df['Saga'].iloc[tn]) + '<br>'  # Hover info for 'arc'
            'Function: ' + df['Function'].iloc[tn] + '<br>'  # Hover info for 'Function'
            'Episode Number: %{x}<br>'  # Show the episode number for the point
            'Total Votes: %{y}<br>'  # Show the total votes for the point
            '<extra></extra>'  # Remove the extra trace information
        )
    ))

# Add a custom annotation at episode 808
fig.add_annotation(
    x=df[df['episode'] == 808]['episode'].values[0],  # Position of the annotation on the x-axis (episode number)
    y=df[df['episode'] == 808]['total_votes'].values[0],  # Get corresponding y-value (total_votes) at episode 808
    text="Episode 808",  # The text of the annotation
    showarrow=True,  # Show an arrow pointing to the point
    arrowhead=2,  # Arrow style (optional)
    ax=0,  # X offset of the annotation (relative to the point)
    ay=-50,  # Y offset of the annotation (relative to the point)
    font=dict(size=12, color='black'),  # Font style for annotation text
    borderpad=4  # Padding around the annotation text
)

# Update the layout to add title and labels, with y-axis range from 0 to 800
fig.update_layout(
    title='Total Votes of One Piece Episodes by Episode Number',
    xaxis_title='Episode Number',  # Label for x-axis
    yaxis_title='Total Votes',  # Label for y-axis
    hovermode='closest',  # Ensure hover shows data for the closest point
    yaxis=dict(range=[0, 800])  # Set y-axis range from 0 to 800
)

# Show the figure
fig.show()


In [24]:
# Create the figure
fig = go.Figure()

# Add a boxplot trace for each 'Function'
for function in df['Function'].unique():
    function_data = df[df['Function'] == function]
    
    fig.add_trace(go.Box(
        x=function_data['Function'],  # x-axis is the 'Function' column
        y=function_data['average_rating'],  # y-axis is the 'average_rating'
        name=function,  # Set the name of the function for the legend
        boxmean='sd',  # Show the mean and standard deviation within the boxplot
        marker=dict(
            color=function_color_map.get(function, '#000000'),  # Use color map for each 'Function', default to black if not found
        ),
        customdata=function_data[['episode', 'name']].values,  # Pass episode and name as custom data
        hovertemplate=(
            'Function: %{x}<br>'  # Hover info for 'Function'
            'Average Rating: %{y}<br>'  # Show average rating for the boxplot
            'Episode %{customdata[0]}: %{customdata[1]}<br>'  # Show episode number (outlier point)
            '<extra></extra>'  # Remove extra trace information
        )
    ))

# Update layout
fig.update_layout(
    title='Average Rating Distribution by Function',  # Title of the plot
    xaxis_title='Function',  # x-axis title
    yaxis_title='Average Rating',  # y-axis title
    height=600,  # Set the figure height
    showlegend=True,  # Show legend for different functions
)

# Show the figure
fig.show()


In [25]:
# Create the histogram using Plotly Express and color by 'Function'
fig = px.histogram(df, 
                   x='average_rating',  # Data for the histogram
                   color='Function',  # Color by 'Function'
                   nbins=50,  # Number of bins (you can adjust this)
                   title='Histogram of Average Rating by Function',
                   labels={'average_rating': 'Average Rating', 'count': 'Count'},  # Axis labels
                   color_discrete_map=function_color_map  # Apply custom color map
)

# Show the figure
fig.show()

print(fig)

Figure({
    'data': [{'alignmentgroup': 'True',
              'bingroup': 'x',
              'hovertemplate': 'Function=canon<br>Average Rating=%{x}<br>count=%{y}<extra></extra>',
              'legendgroup': 'canon',
              'marker': {'color': 'rgb(29, 105, 150)', 'pattern': {'shape': ''}},
              'name': 'canon',
              'nbinsx': 50,
              'offsetgroup': 'canon',
              'orientation': 'v',
              'showlegend': True,
              'type': 'histogram',
              'x': array([7.6, 7.8, 7.7, ..., 8.2, 9.1, 9.4]),
              'xaxis': 'x',
              'yaxis': 'y'},
             {'alignmentgroup': 'True',
              'bingroup': 'x',
              'hovertemplate': 'Function=filler<br>Average Rating=%{x}<br>count=%{y}<extra></extra>',
              'legendgroup': 'filler',
              'marker': {'color': 'rgb(225, 124, 5)', 'pattern': {'shape': ''}},
              'name': 'filler',
              'nbinsx': 50,
              'offsetgroup

In [57]:
# Create the histogram for 'total_votes' from the filtered dataframe
fig = px.histogram(df, 
                   x='total_votes',  # Data for the histogram
                   color='Function',  # Color by 'Function'
                   nbins=100,  # Number of bins (you can adjust this)
                   labels={'total_votes': 'Total Votes', 'count': 'Count'},  # Axis labels
                   color_discrete_map=function_color_map  # Apply custom color map
)

# Update layout
fig.update_layout(
    title='Histogram of Total Votes (Filtered)',
    xaxis_title='Total Votes',  # x-axis title
    yaxis_title='Count',  # y-axis title (count of votes in each bin)
    height=600,  # Set the figure height
)

# Show the figure
fig.show()


61: canon
227-228: canon
279-283: filler
291-292: filler
303: filler
430-456: canon
457-458: filler
492: filler
542: filler
590: filler
897-906: canon
908-917: canon

#### Votes vs. Episode Rating

In [44]:
# Definining set of points for median line
median_df = df.groupby('average_rating')['total_votes'].median().reset_index()

In [None]:
fig = go.Figure()

# Scatterplot trace for canon episodes (blue dots)
fig.add_trace(go.Scatter(
    x=df[df['Function'] == 'canon']['average_rating'],
    y=df[df['Function'] == 'canon']['total_votes'],
    mode='markers',
    marker=dict(
        color=function_color_map['canon'],
        size=8,
        opacity=0.6
    ),
    hovertemplate=(
        '<b>Episode %{customdata[1]} (%{customdata[4]}):'
        '<br>"%{customdata[0]}"</b><br><br>'
        '   - <i>Release Date</i>: %{y}<br>'
        '   - <i>Average Rating</i>: %{x}<br>'
        '   - <i>Saga</i>: %{customdata[3]}<br>'
        '         - <i>Arc</i>: %{customdata[2]}<br> '
        "<extra></extra>"
    ),
    text=df[df['Function'] == 'canon']['episode'].astype(str),
    name='Canon',
    customdata=df[df['Function'] == 'canon'][['name', 'episode', 'Arc', 'Saga', 'Function']].values
))

# Scatterplot trace for filler episodes (orange dots)
fig.add_trace(go.Scatter(
    x=df[df['Function'] == 'filler']['average_rating'],
    y=df[df['Function'] == 'filler']['total_votes'],
    mode='markers',
    marker=dict(
        color=function_color_map['filler'],
        size=8,
        opacity=0.6
    ),
    hovertemplate=(
        '<b>Episode %{customdata[1]} (%{customdata[4]}):'
        '<br>"%{customdata[0]}"</b><br><br>'
        '   - <i>Release Date</i>: %{y}<br>'
        '   - <i>Average Rating</i>: %{x}<br>'
        '   - <i>Saga</i>: %{customdata[3]}<br>'
        '         - <i>Arc</i>: %{customdata[2]}<br> '
        "<extra></extra>"
    ),
    text=df[df['Function'] == 'filler']['episode'].astype(str),
    name='Filler',
    customdata=df[df['Function'] == 'filler'][['name', 'episode', 'Arc', 'Saga', 'Function']].values
))

# Add the trace to the figure
fig.add_trace(go.Scatter(
    y=median_df['total_votes'],
    x=median_df['average_rating'],
    line=dict(color='grey'),
    mode='lines',
    name='Median votes per discrete rating',
    hoverinfo='none' 
))

# Layout, Titles, and Formatting
fig.update_layout(
    title='Popularity of Canon and Filler Episodes',
    xaxis_title='Average Rating',
    yaxis_title='Number of Votes',
    hovermode='closest',
    yaxis=dict(range=[0, 750]),
    height=900,
    showlegend=True
)

# Legend settings
fig.update_layout(
    legend=dict(
        orientation='h',
        x=0.6,
        y=1.07,
        traceorder="normal",
        bgcolor="rgba(255, 255, 255, 0.7)",
        bordercolor="Black",
        borderwidth=1,
        xanchor="center",
        yanchor="middle"
    )
)

fig.show()


### Data by `Saga` category.

#### Color map

In [28]:
# Define the color mapping for each saga
saga_color_map = {
    'East Blue': px.colors.qualitative.Prism[1],
    'Alabasta': px.colors.qualitative.Dark24[6],
    'Skypeia': px.colors.qualitative.Dark24[14],
    'Water Seven': px.colors.qualitative.Dark24[22],
    'Thriller Bark': px.colors.qualitative.Dark24[17],
    'Marineford': px.colors.qualitative.Dark24[23],
    'Fishman Island': px.colors.qualitative.Dark24[10],
    'Punk Hazard': px.colors.qualitative.Dark24[15],
    'Dressrosa': px.colors.qualitative.Prism[6],
    'Whole Cake Island': px.colors.qualitative.Dark24[1],
    'Wano': px.colors.qualitative.Dark24[18]
}

#### Episode counts

In [30]:
# Group by 'Saga' and 'Arc', and count the number of unique episodes
arc_count = df.groupby(['Saga', 'Arc'])['episode'].nunique().reset_index()

# Sort the rows in narrative order
arc_count = arc_count.sort_values(['Saga', 'Arc'], ascending=[False, True])

# Initialize figure
fig = go.Figure()

# Create a separate bar trace for each saga using a `for` loop
for saga in arc_count['Saga'].unique():
    saga_data = arc_count[arc_count['Saga'] == saga]
    
    # Trace info here
    fig.add_trace(go.Bar(
        y=saga_data['Saga'],
        x=saga_data['episode'],
        name=saga,
        orientation='h',
        marker=dict(
            color=saga_color_map[saga]
        ),
        hovertemplate=(  # Hover info for the arc
            'Arc: %{customdata[0]}<br>'
            'Saga: %{y}<br>'
            'Number of Episodes: %{x}<br>'
            '<extra></extra>'
        ),
        customdata=saga_data[['Arc']].values,
        showlegend=False,
    ))

# Layout
fig.update_layout(
    title='Number of Episodes by Saga (Stacked by Arc)',
    xaxis_title='Number of Episodes',
    yaxis_title='Saga',
    barmode='stack',  # Stack the bars of the same saga
    hovermode='closest',
    yaxis=dict(
        categoryorder='array',
        categoryarray=arc_count['Saga'].unique()
    )
)

# Show figure
fig.show()






#### Average Rating

In [31]:
# Calculate the average rating per saga
avg_rating_by_saga = df.groupby('Saga')['average_rating'].mean().reset_index()

# Create the figure for the bar plot
fig = go.Figure()

# Add a bar trace
fig.add_trace(go.Bar(
    x=avg_rating_by_saga['Saga'],  # Saga on x-axis
    y=avg_rating_by_saga['average_rating'].round(1),  # Average rating on y-axis
    marker=dict(
        color=[saga_color_map[saga] for saga in avg_rating_by_saga['Saga']]  # Apply custom colors
    ),
    hovertemplate=(
        'Saga: %{x}<br>'  # Hover info for 'Saga'
        'Average Rating: %{y}<br>'  # Show the average rating for the bar
        '<extra></extra>'  # Remove extra trace information
    )
))

# Update layout to add title and axis labels
fig.update_layout(
    title='Average Rating by Saga',
    xaxis_title='Saga',
    yaxis_title='Average Rating',
    hovermode='closest',  # Ensure hover shows data for the closest point
    yaxis=dict(range=[5.5, 8.5])  # Set y-axis range
)

# Show the plot
fig.show()





In [64]:
# Calculate the average rating per saga
avg_rating_by_saga = df.groupby('Saga')['average_rating'].mean().reset_index()

# Create the figure for the boxplot
fig = go.Figure()

# Add a boxplot trace for each saga
for saga in avg_rating_by_saga['Saga']:
    saga_data = df[df['Saga'] == saga]  # Filter the DataFrame for the specific saga
    
    fig.add_trace(go.Box(
        x=saga_data['Saga'],  # Saga on x-axis
        y=saga_data['average_rating'],  # Ratings for the y-axis
        name=saga,  # Set the name for the trace (for legend)
        marker=dict(
            color=saga_color_map.get(saga, '#000000'),  # Apply custom colors based on saga
        ),
        boxmean='sd',
        customdata=saga_data[['episode', 'name', 'Arc', 'Function']].values,
        hovertemplate=(  # This hover info applies to outliers only
            '<b>Episode %{customdata[0]} (%{customdata[3]}):<br>'
            '"%{customdata[1]}"</b><br><br>'
            '   - <i>Average Rating: %{y}<br>'
            '   - <i>Saga: %{x}<br>'
            '      - <i>Arc: %{customdata[2]}<br> '
            '<extra></extra>'
        )
    ))

# Layout
fig.update_layout(
    title='Average Rating Distribution by Saga',  # Title of the plot
    xaxis_title='Saga',  # x-axis title
    yaxis_title='Average Rating',  # y-axis title
    hovermode='closest',  # Ensure hover shows data for the closest point
    yaxis=dict(range=[4, 10]),  # Set y-axis range (optional)
    showlegend=False  # Show legend for different sagas
)

# Show the plot
fig.show()





#### Saga bubble chart.

In [34]:
# Group by 'Saga' and aggregate the necessary data
saga_grouped = df.groupby('Saga').agg(
    total_votes=('total_votes', 'mean'),  # Get the average total_votes per Saga
    average_rating=('average_rating', 'mean'),  # Get the average rating per Saga
    episode_count=('episode', 'count')  # Count of episodes in each Saga
).reset_index()

# Create the figure for the bubble scatter plot
fig = go.Figure()

# Add a bubble scatter trace for each Saga
for _, row in saga_grouped.iterrows():
    # Assign the color based on the Saga
    color = saga_color_map.get(row['Saga'], 'gray')  # Default to 'gray' if Saga is not in the map
    
    # Add the trace for this particular Saga
    fig.add_trace(go.Scatter(
        x=[row['total_votes']],  # x-axis is average total votes per Saga
        y=[row['average_rating']],  # y-axis is average rating per Saga
        mode='markers',  # Using markers for the scatter plot
        marker=dict(
            color=color,  # Color by Saga
            size=(row['episode_count'] * 0.75)+6,  # Bubble size based on the episode count, scaled
            opacity=0.6  # Opacity of the bubbles
        ),
        hovertemplate=(
            'Saga: ' + str(row['Saga']) + '<br>'  # Convert 'Saga' to string safely
            'Average Rating: %{y}<br>'  # Show the average rating for the point
            'Total Votes: %{x}<br>'  # Show the average total votes for the point
            'Episode Count: ' + str(row['episode_count']) + '<br>'  # Show episode count for the point
            '<extra></extra>'  # Remove extra trace information
        ),
        text=[str(row['Saga'])],  # Pass the Saga as text to hover template (convert to string)
        name=row['Saga']  # Legend label for the current Saga
    ))



# Update the layout
fig.update_layout(
    title='Total Votes vs Average Rating by Saga (Bubble Scatter Plot)',
    xaxis_title='Total Votes (Mean per Saga)',
    yaxis_title='Average Rating (Mean per Saga)',
    hovermode='closest',  # Ensure hover shows data for the closest point
    xaxis=dict(range=[0, 400]),  # Set x-axis limits
    height=600,  # Set the figure height in pixels
    showlegend=True  # Enable the legend
)

# Update the layout with the horizontal legend
fig.update_layout(
    legend=dict(
        x=0.9,  # Centered horizontally
        y=0.5,  # Positioned slightly above the plot
        traceorder="normal",
        bgcolor="rgba(255, 255, 255, 0.7)",
        bordercolor="Black",
        borderwidth=1,
        xanchor="center",  # Anchor the legend box to the center horizontally
        yanchor="middle"  # Anchor the legend box to the middle vertically
    )
)

# Show the figure
fig.show()





In [104]:
# Group by 'Saga' and aggregate the necessary data 
saga_grouped = df.groupby('Saga').agg(
    total_votes=('total_votes', 'mean'),
    average_rating=('average_rating', 'mean'),
    episode_count=('episode', 'count')
).reset_index()

# Create the figure for the bubble scatter plot
fig = go.Figure()

# Add a bubble scatter trace for each Saga
for _, row in saga_grouped.iterrows():
    # Assign the color based on the Saga
    color = saga_color_map.get(row['Saga'], 'gray')  # Default to 'gray' if Saga is not in the map
    
    # Add the trace for this particular Saga
    fig.add_trace(go.Scatter(
        x=[row['average_rating']],
        y=[round(row['total_votes'])],
        mode='markers',
        marker=dict(
            color=color,
            size=(row['episode_count'] * 0.75)+6,
            opacity=0.6
        ),
        hovertemplate=(
            '<b>' + str(row['Saga']) + ' Saga</b><br>'
            ' - <i>Average Rating: %{customdata}</i><br>'
            ' - <i>Total Votes: %{y}</i><br>'
            ' - <i>Episode Count: ' + str(row['episode_count']) + '</i><br>'
            '<extra></extra>'
        ),
        text=[str(row['Saga'])],
        name=row['Saga'],
        customdata=[[round(row['average_rating'], 2)]]
    ))

# Layout
fig.update_layout(
    title='Average Rating vs Total Votes by Saga (Bubble Scatter Plot)',
    xaxis_title='Average Rating (Mean per Saga)',
    yaxis_title='Total Votes (Mean per Saga)',
    hovermode='closest',
    height=600,
    legend=dict(
        x=0.1,
        y=0.9,
        traceorder="normal",
        bgcolor="rgba(255, 255, 255, 0.7)",
        bordercolor="Black",
        borderwidth=1,
        xanchor="center",
        yanchor="top"
    )
)

# Show the figure
fig.show()






#### Arc bubble chart.

In [70]:
# Group by 'Arc' and aggregate necessary data
arc_grouped = df.groupby('Arc').agg(
    saga=('Saga', 'first'),
    total_votes=('total_votes', 'mean'),
    average_rating=('average_rating', 'mean'),
    episode_count=('episode', 'count')
).reset_index()

# Create the figure for the bubble scatter plot
fig = go.Figure()

# Add a single trace per Saga (one for each color)
for saga_name, saga_color in saga_color_map.items():
    saga_data = arc_grouped[arc_grouped['saga'] == saga_name]
    
    # Add all arcs of the same saga to a single trace
    fig.add_trace(go.Scatter(
        x=saga_data['average_rating'].round(1),
        y=saga_data['total_votes'].round(1),
        mode='markers',
        marker=dict(
            color=saga_color,
            size=(saga_data['episode_count'] * 0.75) + 6,
            opacity=0.6
        ),
        hovertemplate=(
            '<b>%{text}</b><br>'
            '   <i>Part of the ' + saga_name + ' story saga</i><br><br>'
            ' - <i>Average Rating</i>: %{x}<br>'
            ' - <i>Total Votes</i>: %{y}<br>'
            ' - <i>Episode Count</i>: ' + saga_data['episode_count'].astype(str) + '<br>'
            "<extra></extra>"
        ),
        text=saga_data['Arc'],
        name=saga_name
    ))

# Add the trace to the figure
fig.add_trace(go.Scatter(
    y=median_df['total_votes'],
    x=median_df['average_rating'],
    line=dict(color='grey'),
    mode='lines',
    name='Median Total Votes',
    hoverinfo='none' 
))

# Update layout to include the legend and improve the plot appearance
fig.update_layout(
    title='Average Rating vs Total Votes by Arc (Bubble Scatter Plot)',
    xaxis_title='Average Rating (Mean per Arc)',
    yaxis_title='Total Votes (Mean per Arc)',
    hovermode='closest',
    yaxis=dict(range=[0, 600]),
    height=900,
    showlegend=True,
    legend=dict(
        title="Saga",
        orientation='v',
        yanchor="top",
        y=0.9,
        xanchor="center",
        x=0.15
    )
)

# Show figure
fig.show()






In [37]:
# Create the figure for the bubble scatter plot
fig = go.Figure()

# Add a single trace per Saga (one for each color)
for saga_name, saga_color in saga_color_map.items():
    saga_data = df[df['Saga'] == saga_name]
    
    # Add all arcs of the same saga to a single trace
    fig.add_trace(go.Scatter(
        x=saga_data['average_rating'].round(1),
        y=saga_data['total_votes'].round(1),
        mode='markers',
        marker=dict(
            color=saga_color
        ),
        hovertemplate=(
            '<b>Episode %{customdata[1]} (%{customdata[4]}):'
            '<br>"%{customdata[0]}"</b><br><br>'
            '   - <i>Release Date</i>: %{x}<br>'
            '   - <i>Average Rating</i>: %{y}<br>'
            '   - <i>Saga</i>: %{customdata[3]}<br>'
            '         - <i>Arc</i>: %{customdata[2]}<br> '
            "<extra></extra>"
        ),
        text=saga_data['Arc'],
        name=saga_name,
        customdata=saga_data[['name', 'episode', 'Arc', 'Saga', 'Function']].values
    ))

# Add the trace to the figure
fig.add_trace(go.Scatter(
    y=median_df['total_votes'],
    x=median_df['average_rating'],
    line=dict(color='grey'),
    mode='lines',
    name='Median votes per discrete rating',
    hoverinfo='none' 
))

# Custom formatting
fig.update_layout(
    title='Episode Popularity, Colored by Story Saga',
    xaxis_title='Average Rating',
    yaxis_title='Total Votes',
    hovermode='closest',
    xaxis=dict(range=[5.5, 10]),
    yaxis=dict(range=[0, 600]),
    height=900,
    showlegend=True,
    legend=dict(  # legend settings
        title=None,
        orientation='v',
        yanchor="top",
        y=0.9,
        xanchor="center",
        x=0.15
    )
)

# Show figure
fig.show()
