# In [None]:
import pandas as pd
import re
from datetime import datetime
import requests
import plotly.express as px
import plotly.io as pio

def fetch_html_content(url, timeout=30):
    """Fetch the content of an HTML file from a URL.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so a stalled server
            would hang the whole script.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    # Turn HTTP error statuses into exceptions instead of parsing an error page.
    response.raise_for_status()
    return response.text

def extract_data_json(content):
    """Return the text of the nested array assigned to ``var datajson``.

    The array is located with a non-greedy, multi-line regular expression
    and returned as raw text (brackets included, trailing ``;`` excluded).

    Raises:
        ValueError: If *content* holds no ``var datajson = [[...]];``
            assignment.
    """
    found = re.search(
        r'var datajson =\s*(\[\s*\[.*?\]\s*\]);',
        content,
        flags=re.DOTALL,  # the array literal may span several lines
    )
    if found is None:
        raise ValueError("No data found in the HTML file.")
    return found.group(1)

def convert_js_dates_and_nulls(data_json):
    """Rewrite a JavaScript array literal so it reads as a Python expression.

    Two rewrites are applied:

    * ``new Date(y, m, d)`` becomes ``datetime(y,m+1,d)``. JavaScript months
      are zero-based, hence the ``+1``. Whitespace around the arguments is
      tolerated, and each number is normalised through ``int`` so
      zero-padded values such as ``05`` do not become invalid Python
      integer literals.
    * Bare ``null`` tokens become ``None``. Quoted string literals are
      matched first and copied through untouched, so text such as
      ``"null island"`` or ``'nullable'`` is not corrupted.

    Args:
        data_json: Text of the JavaScript array literal.

    Returns:
        The rewritten text.
    """
    data_json = re.sub(
        r'new\s+Date\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)',
        lambda m: (
            f'datetime({int(m.group(1))},'
            f'{int(m.group(2)) + 1},'
            f'{int(m.group(3))})'
        ),
        data_json,
    )

    # Alternation order matters: a string literal is consumed as a whole
    # before the bare-null branch can see the characters inside it.
    string_or_null = (
        r'''"(?:\\.|[^"\\])*"'''   # double-quoted string literal
        r"""|'(?:\\.|[^'\\])*'"""  # single-quoted string literal
        r"""|\bnull\b"""           # standalone null token
    )
    return re.sub(
        string_or_null,
        lambda m: 'None' if m.group(0) == 'null' else m.group(0),
        data_json,
    )

def evaluate_data_json(data_json):
    """Evaluate the converted array text and return the resulting Python list.

    Args:
        data_json: Python-expression text, as produced by rewriting the
            JavaScript array (``datetime(...)`` calls, ``None``, literals).

    Returns:
        Whatever the expression evaluates to (a list of rows for the
        expected input).
    """
    # SECURITY: eval() runs arbitrary expressions, and this text comes from
    # a remote page. Blanking __builtins__ keeps names such as __import__
    # and open out of reach, but it is NOT a real sandbox. Only use this on
    # sources you trust, or replace it with a dedicated parser.
    restricted_globals = {
        "__builtins__": {},
        "datetime": datetime,
        "None": None,
    }
    return eval(data_json, restricted_globals)

def infer_columns(data):
    """Build column names sized to the first row of *data*.

    The first two names are always ``'ID'`` and ``'Date'``; every further
    column is named ``Feature_<n>``, where *n* is its 1-based position
    (so the third column is ``Feature_3``).
    """
    width = len(data[0])
    feature_names = [f'Feature_{position}' for position in range(3, width + 1)]
    return ['ID', 'Date', *feature_names]

def create_dataframe(data):
    """Wrap the row list *data* in a DataFrame.

    Column names come from ``infer_columns``.
    """
    return pd.DataFrame(data, columns=infer_columns(data))

def main(url, to_csv):
    """Download the page at *url* and return its embedded data as a DataFrame.

    The steps are: fetch the HTML, pull out the ``datajson`` array text,
    rewrite it into Python syntax, evaluate it, and load the rows into a
    DataFrame.

    Args:
        url: Address of the HTML page to process.
        to_csv: When truthy, also write the frame to
            ``./extracted_data.csv`` (without the index column).

    Returns:
        The resulting DataFrame.
    """
    raw_array = extract_data_json(fetch_html_content(url))
    rows = evaluate_data_json(convert_js_dates_and_nulls(raw_array))
    frame = create_dataframe(rows)

    if to_csv:
        frame.to_csv('./extracted_data.csv', index=False)

    return frame

# Page whose embedded `datajson` array is extracted
url = 'https://users.nber.org/~dlchen/divMotionChartrollMean2.html'
# Run the extraction; to_csv=False keeps the result in memory only
# (pass to_csv=True to also write ./extracted_data.csv)
df = main(url=url, to_csv=False)
# Replace the generic inferred names with the real column names
df.columns = [
    "ID",
    "Date",
    "SW120DeathSentences",
    "SW120execution",
    "SW120absence",
    "DivType",
    "SW120ExecutionRate",
    "SW120Casualties",
]

print(df.shape)
df.head()

# Make sure Date holds pandas datetimes
df['Date'] = pd.to_datetime(df['Date'])

# Put rows in chronological order
df.sort_values('Date', inplace=True)

# Keep only the first row for each (ID, Date) pair
df.drop_duplicates(subset=['ID', 'Date'], inplace=True)

# Remove leading/trailing whitespace from the DivType labels
df['DivType'] = df['DivType'].str.strip()

# List the distinct labels after cleaning
unique_div_types = df['DivType'].unique()
print(unique_div_types)

# Count how many rows carry the 'territorial' label
territorial_data = df.loc[df['DivType'].eq('territorial')]
print(f"Territorial data points: {len(territorial_data)}")

def makeChart(df, x='SW120DeathSentences', y='SW120execution', size='SW120Casualties'):
    """Build, save and display an animated scatter chart of *y* against *x*.

    Only the columns *x*, *y*, *size*, ``Date``, ``DivType`` and ``ID`` are
    used; rows with a missing value in any of them are dropped. The row
    counts before and after dropping are printed. Each ``Date`` value is
    one animation frame, points are tracked across frames by ``ID``, and
    markers are coloured by ``DivType`` and sized by *size*.

    The figure is written to ``plotly_animation.html`` and then shown.

    Args:
        df: Frame holding the columns listed above.
        x: Column plotted on the horizontal axis.
        y: Column plotted on the vertical axis.
        size: Column that drives the marker size.
    """
    plot_df = df[[x, y, size, "Date", "DivType", "ID"]]
    print("orig dimensions:", plot_df.shape)
    plot_df = plot_df.dropna()
    print("dimensions after dropping NAs:", plot_df.shape)

    # Fixed colour per division type
    palette = dict(regular='blue', new='green', territorial='red')
    # Alphabetical ordering of the division types present in the data
    div_type_order = sorted(plot_df['DivType'].unique())

    # One frame per Date, markers followed across frames by ID
    fig = px.scatter(
        plot_df,
        x=x,
        y=y,
        size=size,
        color='DivType',
        color_discrete_map=palette,
        category_orders={"DivType": div_type_order},
        animation_frame='Date',
        animation_group='ID',
        hover_name='ID',
        log_x=False,
    )

    # Legend title and sizing
    fig.update_layout(
        legend_title_text='Division Type',
        legend={
            'itemsizing': 'constant',
            'title_font_size': 16,
            'font_size': 14,
        },
    )

    def _strip_name_prefix(trace):
        # Keep only the text after the last '=' in the trace name
        trace.update(name=trace.name.split('=')[-1])

    fig.for_each_trace(_strip_name_prefix)

    # Write the standalone HTML file, then display the figure
    pio.write_html(fig, 'plotly_animation.html')
    fig.show()

# Chart executions against death sentences, with markers sized by casualties
makeChart(df, 'SW120DeathSentences', 'SW120execution', 'SW120Casualties')