In [1]:
import pandas as pd
import re
from datetime import datetime
import requests
import plotly.express as px

def fetch_html_content(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait on the server before giving up. ``requests``
            applies no timeout unless one is passed, so without this a
            stalled server would block the notebook kernel indefinitely.

    Returns:
        The response body as decoded by ``requests`` (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Turn HTTP error statuses into exceptions instead of parsing an error page.
    response.raise_for_status()
    return response.text

def extract_data_json(content):
    """Return the text of the ``datajson`` array embedded in a page.

    Searches ``content`` for the first ``var datajson = [[...]];`` assignment
    and returns the array literal (outer brackets included, trailing
    semicolon excluded). The array may span several lines.

    Raises:
        ValueError: If ``content`` holds no such assignment.
    """
    found = re.search(
        r'var datajson =\s*(\[\s*\[.*?\]\s*\]);',
        content,
        flags=re.DOTALL,  # let the lazy `.*?` run across line breaks
    )
    if found is None:
        raise ValueError("No data found in the HTML file.")
    return found.group(1)

def convert_js_dates_and_nulls(data_json):
    """Rewrite a JavaScript array literal into text Python can evaluate.

    Two substitutions are applied:

    * ``new Date(y, m, d)`` becomes ``datetime(y,m+1,d)``. JavaScript months
      are zero-based while ``datetime`` months are one-based. Optional
      whitespace around the arguments is accepted.
    * The bare token ``null`` becomes ``None``. Text inside quoted string
      literals and longer identifiers that merely contain ``null`` are left
      untouched.

    Args:
        data_json: Text of the JavaScript array.

    Returns:
        The rewritten text.
    """
    date_call = re.compile(r'new\s+Date\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)')

    def _as_datetime_call(match):
        # int() normalises every part: a zero-padded number such as "08" is
        # not a valid Python integer literal and would break evaluation.
        year, js_month, day = (int(part) for part in match.groups())
        return f'datetime({year},{js_month + 1},{day})'

    data_json = date_call.sub(_as_datetime_call, data_json)

    # Match whole string literals first so that a "null" inside them is
    # consumed as part of the string and never rewritten.
    string_or_null = re.compile(
        r'"(?:\\.|[^"\\])*"'      # double-quoted string literal
        r"|'(?:\\.|[^'\\])*'"     # single-quoted string literal
        r'|\bnull\b',             # the bare null token
        re.DOTALL,
    )

    def _null_to_none(match):
        token = match.group(0)
        return 'None' if token == 'null' else token

    return string_or_null.sub(_null_to_none, data_json)

def evaluate_data_json(data_json):
    """Evaluate the converted array text and return the resulting Python object.

    Args:
        data_json: Python-evaluable text, e.g. the output of
            ``convert_js_dates_and_nulls``; it may call ``datetime(...)``.

    Returns:
        Whatever the expression evaluates to (a list of rows for the
        ``[[...], ...]`` text this notebook extracts).
    """
    # SECURITY: eval() executes the expression it is given, and here that text
    # comes from a downloaded page. Only run this against sources you trust.
    # Emptying __builtins__ keeps names such as __import__ and open out of
    # reach, but it is a mitigation, not a sandbox.
    namespace = {"__builtins__": {}, "datetime": datetime, "None": None}
    return eval(data_json, namespace)

def infer_columns(data):
    """Build placeholder column names sized to the first row of ``data``.

    The first two columns are named ``'ID'`` and ``'Date'``; every further
    column is named ``'Feature_<n>'`` where ``n`` is its 1-based position
    (so the third column is ``'Feature_3'``).

    Args:
        data: Sequence of rows; only the length of the first row is used.

    Returns:
        A list with exactly one name per column of the first row. Empty
        ``data`` yields an empty list.
    """
    if len(data) == 0:
        return []
    num_columns = len(data[0])
    names = ['ID', 'Date'] + [f'Feature_{pos}' for pos in range(3, num_columns + 1)]
    # Trim so rows narrower than two columns do not get more names than columns.
    return names[:num_columns]

def create_dataframe(data):
    """Wrap the parsed rows in a pandas DataFrame.

    Column labels come from ``infer_columns(data)``.
    """
    return pd.DataFrame(data, columns=infer_columns(data))

def main(url, to_csv=False, csv_path='./extracted_data.csv'):
    """Download a page and turn its embedded ``datajson`` array into a DataFrame.

    Runs the whole pipeline: fetch the HTML, cut out the array literal,
    rewrite JavaScript dates/nulls, evaluate the text and build the frame.

    Args:
        url: Address of the HTML page holding the ``datajson`` array.
        to_csv: When true, also write the frame to ``csv_path``.
        csv_path: Destination of the CSV export; only used when ``to_csv``
            is true. Defaults to the previously hard-coded location.

    Returns:
        The DataFrame built by ``create_dataframe``.
    """
    content = fetch_html_content(url)
    data_json = extract_data_json(content)
    data_json = convert_js_dates_and_nulls(data_json)
    data = evaluate_data_json(data_json)
    df = create_dataframe(data)

    if to_csv:
        # index=False: the positional row index carries no information.
        df.to_csv(csv_path, index=False)

    return df


In [2]:

# URL of the HTML page whose embedded `datajson` array holds the chart data
url = 'https://users.nber.org/~dlchen/divMotionChartrollMean2.html'
#file_path = "./divMotionChartrollMean2.html"
# Download and parse the page into a DataFrame
df = main(url = url, to_csv=False) # pass to_csv=True to also write ./extracted_data.csv


In [3]:
# Replace the placeholder labels (ID, Date, Feature_3, ...) with descriptive names.
# Names are assigned by position, so the list needs exactly one entry per column.
df.columns = ["ID", "Date", "SW120DeathSentences", "SW120execution", "SW120absence","DivType","SW120ExecutionRate", "SW120Casualties"]

print(df.shape)
df.head()

(2482, 8)


Unnamed: 0,ID,Date,SW120DeathSentences,SW120execution,SW120absence,DivType,SW120ExecutionRate,SW120Casualties
0,1,1914-08-26,,,,regular,,
1,1,1914-09-25,,,,regular,,
2,1,1914-10-25,,,,regular,,
3,1,1914-11-24,0.041667,0.0,0.0,regular,0.0,5.525
4,1,1914-12-24,0.041667,0.0,0.0,regular,0.0,11.833333


In [11]:
def makeChart(df, x='SW120DeathSentences', y='SW120execution',size='SW120Casualties'):
    """Show an animated bubble ("motion") chart with one frame per date.

    Each ``ID`` is one bubble, coloured by ``DivType`` and animated over the
    values of ``Date``. Rows with a missing value in any plotted column are
    dropped first, and the row counts before and after are printed.

    Args:
        df: Frame containing the columns named by ``x``, ``y`` and ``size``
            plus ``'Date'``, ``'DivType'`` and ``'ID'``. It is not modified.
        x: Column plotted on the horizontal axis.
        y: Column plotted on the vertical axis.
        size: Column that sets the bubble size.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # dict.fromkeys drops repeats while keeping order: if the same column is
    # passed twice (e.g. x == size) the selection would otherwise carry
    # duplicate labels, and a label lookup would return a frame, not a column.
    cols2keep = list(dict.fromkeys([x, y, size, "Date", "DivType", "ID"]))
    plot_df = df.loc[:, cols2keep]
    print("orig dimensions:", plot_df.shape)
    plot_df = plot_df.dropna()
    print("dimensions after dropping NAs:", plot_df.shape)

    # Plotly Express orders animation frames by first appearance in the data.
    # Sort by date so the animation plays chronologically; mergesort is stable
    # and keeps the original row order within each date.
    plot_df = plot_df.sort_values("Date", kind="mergesort")

    # Create the motion chart
    # Optional tuning: size_max=55, range_x=[0, 0.1], range_y=[0, 0.1]
    fig = px.scatter(
        plot_df, x=x, y=y, animation_frame='Date', animation_group='ID',
        size=size, color='DivType', hover_name='ID', log_x=False)

    # Show the chart
    fig.show()

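First chart:
- x-axis: death sentences (`SW120DeathSentences`)
- y-axis: executions (`SW120execution`; the execution rate is the separate `SW120ExecutionRate` column)
- size: casualties (`SW120Casualties`)
- color: division type (`DivType`)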

In [12]:
# Draw the first chart; the column choices are spelled out but equal makeChart's defaults
makeChart(df, x='SW120DeathSentences', y='SW120execution',size='SW120Casualties')

orig dimensions: (2482, 6)
dimensions after dropping NAs: (2341, 6)


## Updated version (single self-contained cell; also saves the chart to `plotly_animation.html`)

In [10]:
import pandas as pd
import re
from datetime import datetime
import requests
import plotly.express as px
import plotly.io as pio

def fetch_html_content(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait on the server before giving up. ``requests``
            applies no timeout unless one is passed, so without this a
            stalled server would block the notebook kernel indefinitely.

    Returns:
        The response body as decoded by ``requests`` (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Turn HTTP error statuses into exceptions instead of parsing an error page.
    response.raise_for_status()
    return response.text

def extract_data_json(content):
    """Return the text of the ``datajson`` array embedded in a page.

    Searches ``content`` for the first ``var datajson = [[...]];`` assignment
    and returns the array literal (outer brackets included, trailing
    semicolon excluded). The array may span several lines.

    Raises:
        ValueError: If ``content`` holds no such assignment.
    """
    found = re.search(
        r'var datajson =\s*(\[\s*\[.*?\]\s*\]);',
        content,
        flags=re.DOTALL,  # let the lazy `.*?` run across line breaks
    )
    if found is None:
        raise ValueError("No data found in the HTML file.")
    return found.group(1)

def convert_js_dates_and_nulls(data_json):
    """Rewrite a JavaScript array literal into text Python can evaluate.

    Two substitutions are applied:

    * ``new Date(y, m, d)`` becomes ``datetime(y,m+1,d)``. JavaScript months
      are zero-based while ``datetime`` months are one-based. Optional
      whitespace around the arguments is accepted.
    * The bare token ``null`` becomes ``None``. Text inside quoted string
      literals and longer identifiers that merely contain ``null`` are left
      untouched.

    Args:
        data_json: Text of the JavaScript array.

    Returns:
        The rewritten text.
    """
    date_call = re.compile(r'new\s+Date\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)')

    def _as_datetime_call(match):
        # int() normalises every part: a zero-padded number such as "08" is
        # not a valid Python integer literal and would break evaluation.
        year, js_month, day = (int(part) for part in match.groups())
        return f'datetime({year},{js_month + 1},{day})'

    data_json = date_call.sub(_as_datetime_call, data_json)

    # Match whole string literals first so that a "null" inside them is
    # consumed as part of the string and never rewritten.
    string_or_null = re.compile(
        r'"(?:\\.|[^"\\])*"'      # double-quoted string literal
        r"|'(?:\\.|[^'\\])*'"     # single-quoted string literal
        r'|\bnull\b',             # the bare null token
        re.DOTALL,
    )

    def _null_to_none(match):
        token = match.group(0)
        return 'None' if token == 'null' else token

    return string_or_null.sub(_null_to_none, data_json)

def evaluate_data_json(data_json):
    """Evaluate the converted array text and return the resulting Python object.

    Args:
        data_json: Python-evaluable text, e.g. the output of
            ``convert_js_dates_and_nulls``; it may call ``datetime(...)``.

    Returns:
        Whatever the expression evaluates to (a list of rows for the
        ``[[...], ...]`` text this notebook extracts).
    """
    # SECURITY: eval() executes the expression it is given, and here that text
    # comes from a downloaded page. Only run this against sources you trust.
    # Emptying __builtins__ keeps names such as __import__ and open out of
    # reach, but it is a mitigation, not a sandbox.
    namespace = {"__builtins__": {}, "datetime": datetime, "None": None}
    return eval(data_json, namespace)

def infer_columns(data):
    """Build placeholder column names sized to the first row of ``data``.

    The first two columns are named ``'ID'`` and ``'Date'``; every further
    column is named ``'Feature_<n>'`` where ``n`` is its 1-based position
    (so the third column is ``'Feature_3'``).

    Args:
        data: Sequence of rows; only the length of the first row is used.

    Returns:
        A list with exactly one name per column of the first row. Empty
        ``data`` yields an empty list.
    """
    if len(data) == 0:
        return []
    num_columns = len(data[0])
    names = ['ID', 'Date'] + [f'Feature_{pos}' for pos in range(3, num_columns + 1)]
    # Trim so rows narrower than two columns do not get more names than columns.
    return names[:num_columns]

def create_dataframe(data):
    """Wrap the parsed rows in a pandas DataFrame.

    Column labels come from ``infer_columns(data)``.
    """
    return pd.DataFrame(data, columns=infer_columns(data))

def main(url, to_csv=False, csv_path='./extracted_data.csv'):
    """Download a page and turn its embedded ``datajson`` array into a DataFrame.

    Runs the whole pipeline: fetch the HTML, cut out the array literal,
    rewrite JavaScript dates/nulls, evaluate the text and build the frame.

    Args:
        url: Address of the HTML page holding the ``datajson`` array.
        to_csv: When true, also write the frame to ``csv_path``.
        csv_path: Destination of the CSV export; only used when ``to_csv``
            is true. Defaults to the previously hard-coded location.

    Returns:
        The DataFrame built by ``create_dataframe``.
    """
    content = fetch_html_content(url)
    data_json = extract_data_json(content)
    data_json = convert_js_dates_and_nulls(data_json)
    data = evaluate_data_json(data_json)
    df = create_dataframe(data)

    if to_csv:
        # index=False: the positional row index carries no information.
        df.to_csv(csv_path, index=False)

    return df

# URL of the HTML page whose embedded `datajson` array holds the chart data
url = 'https://users.nber.org/~dlchen/divMotionChartrollMean2.html'
#file_path = "./divMotionChartrollMean2.html"
# Download and parse the page into a DataFrame
df = main(url = url, to_csv=False) # pass to_csv=True to also write ./extracted_data.csv
# Replace the placeholder labels (ID, Date, Feature_3, ...) with descriptive names.
# Names are assigned by position, so the list needs exactly one entry per column.
df.columns = ["ID", "Date", "SW120DeathSentences", "SW120execution", "SW120absence","DivType","SW120ExecutionRate", "SW120Casualties"]

print(df.shape)
# NOTE(review): this is not the last expression of the cell, so the head()
# preview is never displayed (the recorded output shows only the shape).
df.head()

def makeChart(df, x='SW120DeathSentences', y='SW120execution', size='SW120Casualties',
              html_path='plotly_animation.html'):
    """Build an animated bubble ("motion") chart, save it as HTML and show it.

    Each ``ID`` is one bubble, coloured by ``DivType`` and animated over the
    values of ``Date``. Rows with a missing value in any plotted column are
    dropped first, and the row counts before and after are printed.

    Args:
        df: Frame containing the columns named by ``x``, ``y`` and ``size``
            plus ``'Date'``, ``'DivType'`` and ``'ID'``. It is not modified.
        x: Column plotted on the horizontal axis.
        y: Column plotted on the vertical axis.
        size: Column that sets the bubble size.
        html_path: File the figure is written to as standalone HTML. Defaults
            to the previously hard-coded name; pass ``None`` to skip saving.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # dict.fromkeys drops repeats while keeping order: if the same column is
    # passed twice (e.g. x == size) the selection would otherwise carry
    # duplicate labels, and a label lookup would return a frame, not a column.
    cols2keep = list(dict.fromkeys([x, y, size, "Date", "DivType", "ID"]))
    plot_df = df.loc[:, cols2keep]
    print("orig dimensions:", plot_df.shape)
    plot_df = plot_df.dropna()
    print("dimensions after dropping NAs:", plot_df.shape)

    # Plotly Express orders animation frames by first appearance in the data.
    # Sort by date so the animation plays chronologically; mergesort is stable
    # and keeps the original row order within each date.
    plot_df = plot_df.sort_values("Date", kind="mergesort")

    # Create the motion chart
    # Optional tuning: size_max=55, range_x=[0, 0.1], range_y=[0, 0.1]
    fig = px.scatter(
        plot_df, x=x, y=y, animation_frame='Date', animation_group='ID',
        size=size, color='DivType', hover_name='ID', log_x=False)

    # Save the figure as an HTML file
    if html_path is not None:
        pio.write_html(fig, html_path)

    # Show the chart
    fig.show()

# Build, save and show the chart; the column choices are spelled out but equal makeChart's defaults
makeChart(df, x='SW120DeathSentences', y='SW120execution', size='SW120Casualties')

(2482, 8)
orig dimensions: (2482, 6)
dimensions after dropping NAs: (2341, 6)
