# In [13]:
import pandas as pd
import re
from datetime import datetime

def read_html_file(file_path, encoding='utf-8'):
    """Read the content of an HTML file.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale encoding.

    Returns:
        The whole file content as a single string.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

def extract_data_json(content):
    """Return the nested-array literal assigned to ``var datajson``.

    Args:
        content: Text of the HTML page to search.

    Returns:
        The matched ``[[...]]`` text, without the trailing semicolon.

    Raises:
        ValueError: If no ``var datajson = [[...]];`` assignment is found.
    """
    # DOTALL lets the lazy ``.*?`` run across the line breaks between rows.
    found = re.search(
        r'var datajson =\s*(\[\s*\[.*?\]\s*\]);',
        content,
        flags=re.DOTALL,
    )
    if found is None:
        raise ValueError("No data found in the HTML file.")
    return found.group(1)

# One alternation scanned left to right: quoted strings are matched first so
# that nothing inside them is rewritten.
_JS_TOKEN_RE = re.compile(
    r'(?P<string>"(?:\\.|[^"\\])*"|\'(?:\\.|[^\'\\])*\')'
    r'|new Date\(\s*(?P<year>\d+)\s*,\s*(?P<month>\d+)\s*,\s*(?P<day>\d+)\s*\)'
    r'|(?P<null>\bnull\b)'
)


def _translate_js_token(match):
    """Return the Python source text replacing one matched JavaScript token."""
    if match.group('string') is not None:
        # String literals pass through untouched.
        return match.group(0)
    if match.group('null') is not None:
        return 'None'
    # int() strips zero padding such as "01", which is not a valid Python 3
    # integer literal; JavaScript months are zero-based, hence the +1.
    year = int(match.group('year'))
    month = int(match.group('month')) + 1
    day = int(match.group('day'))
    return f'datetime({year},{month},{day})'


def convert_js_dates_and_nulls(data_json):
    """Convert JavaScript dates to Python datetime and replace nulls with None.

    ``new Date(y,m,d)`` becomes ``datetime(y,m+1,d)``; whitespace around the
    arguments and zero-padded numbers are accepted. Only the standalone token
    ``null`` becomes ``None``: text inside quoted strings and longer words
    that merely contain "null" are left unchanged.

    Args:
        data_json: JavaScript array literal as text.

    Returns:
        The text rewritten as a Python expression.
    """
    return _JS_TOKEN_RE.sub(_translate_js_token, data_json)

def evaluate_data_json(data_json):
    """Evaluate the converted data array text into a Python list.

    Args:
        data_json: Python expression text, e.g. ``[[1, datetime(2010,1,1), None]]``.

    Returns:
        The object produced by evaluating the expression.

    Raises:
        NameError: If the expression references any name other than ``datetime``.
        SyntaxError: If the text is not a valid Python expression.
    """
    # SECURITY: eval() executes whatever expression the file contained. An
    # empty __builtins__ stops eval from injecting the real builtins (so
    # __import__, open, etc. are not reachable by name), but this is NOT a
    # sandbox -- only run this on files from a trusted source.
    # ``None`` needs no entry: it is a keyword, not a name lookup.
    return eval(data_json, {"__builtins__": {}, "datetime": datetime})

def infer_columns(data):
    """Build column names sized to the first row of ``data``.

    The names always start with 'ID' and 'Date'; each further position in
    the first row is named 'Feature_<n>', where n is its 1-based position
    (so the third column is 'Feature_3').

    Args:
        data: Sequence of rows; only the length of ``data[0]`` is used.

    Returns:
        List of column-name strings.
    """
    width = len(data[0])
    names = ['ID', 'Date']
    # Positions 3..width are the columns after the two fixed ones.
    names.extend(f'Feature_{position}' for position in range(3, width + 1))
    return names

def create_dataframe(data):
    """Wrap the extracted rows in a pandas DataFrame.

    Column names come from ``infer_columns(data)``.

    Args:
        data: Sequence of rows to load.

    Returns:
        A ``pd.DataFrame`` holding ``data`` under the inferred column names.
    """
    frame = pd.DataFrame(data, columns=infer_columns(data))
    return frame

def main(file_path, to_csv, output_path='./extracted_data.csv'):
    """Extract the ``datajson`` table from an HTML file into a DataFrame.

    Runs the full pipeline: read the file, pull out the data array text,
    convert it to a Python expression, evaluate it, and build the DataFrame.

    Args:
        file_path: Path of the HTML file to read.
        to_csv: When truthy, also write the DataFrame to ``output_path``.
        output_path: Destination of the CSV file; defaults to the previously
            hard-coded './extracted_data.csv'.

    Returns:
        The resulting ``pd.DataFrame``.
    """
    content = read_html_file(file_path)
    data_json = extract_data_json(content)
    data_json = convert_js_dates_and_nulls(data_json)
    data = evaluate_data_json(data_json)
    df = create_dataframe(data)

    if to_csv:
        df.to_csv(output_path, index=False)

    return df

# Location of the HTML file to parse
file_path = '/content/divMotionChartrollMean2.html'

# Run the extraction; to_csv=True also saves the result as a CSV file
df = main(file_path, to_csv=True)