
# Formatting the Data




In [None]:
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

# Layout of a watch-history timestamp after whitespace has been normalized and
# the trailing time zone abbreviation removed, e.g. "Jan 5, 2024, 3:45:12 PM".
TIMESTAMP_FORMAT = "%b %d, %Y, %I:%M:%S %p"


def parse_watch_timestamp(raw_timestamp):
    """Parse a watch-history timestamp string into a naive ``datetime``.

    Accepts strings such as ``"Jan 5, 2024, 3:45:12 PM EST"``. Two things are
    normalized before parsing:

    * Every run of Unicode whitespace (including no-break spaces) is collapsed
      to a single ASCII space, so such characters cannot break the match.
    * A trailing token that is not ``AM``/``PM`` is treated as a time zone
      abbreviation and dropped. ``strptime``'s ``%Z`` only recognizes ``UTC``,
      ``GMT`` and the local machine's zone names, so other abbreviations would
      otherwise be rejected. The zone never influenced the extracted wall-clock
      values, so dropping it does not change the result.

    Args:
        raw_timestamp: Timestamp text taken from a content cell.

    Returns:
        A naive ``datetime`` holding the wall-clock time as written.

    Raises:
        ValueError: If the text does not match the expected layout.
    """
    normalized = " ".join(raw_timestamp.split())
    head, _, tail = normalized.rpartition(" ")
    if head and tail.upper() not in ("AM", "PM"):
        normalized = head
    return datetime.strptime(normalized, TIMESTAMP_FORMAT)


def extract_watch_record(cell, index):
    """Build one output row from a single "content-cell" element.

    Args:
        cell: Parsed element (BeautifulSoup tag) for one history entry.
        index: 1-based position of the cell, used only in progress messages.

    Returns:
        A dict with the keys ``Title``, ``URL``, ``Channel``, ``Date``,
        ``Day of Week`` and ``Time``. Fields that cannot be determined are
        ``None``.
    """
    # One search serves both lookups: first link is the video, second the channel.
    links = cell.find_all("a")

    if links:
        title = links[0].text
        # .get() so a link without an href yields None instead of raising
        # and discarding the whole record.
        url = links[0].get("href")
    else:
        title = None
        url = None
    print(f"[{index}] Title: {title if title else 'No title found'}")

    channel = links[1].text if len(links) > 1 else None
    print(f"[{index}] Channel: {channel if channel else 'No channel found'}")

    # The timestamp is the last piece of text in the cell. get_text() does not
    # insert a newline for <br> tags, so splitting its result on "\n" only
    # works when the markup happens to contain literal newlines; taking the
    # last non-empty text node works in both cases.
    text_nodes = list(cell.stripped_strings)
    raw_timestamp = text_nodes[-1] if text_nodes else ""

    try:
        timestamp = parse_watch_timestamp(raw_timestamp)
    except ValueError:
        print(f"[{index}] Error parsing timestamp: {raw_timestamp}")
        date = None
        day_of_week = None
        time = None
    else:
        date = timestamp.date()
        day_of_week = timestamp.strftime("%A")  # Full name of the day
        time = timestamp.strftime("%I:%M %p")  # Hour:Minute AM/PM

    return {
        "Title": title,
        "URL": url,
        "Channel": channel,
        "Date": date,
        "Day of Week": day_of_week,
        "Time": time,
    }


# The pipeline is deliberately kept at module scope (not wrapped in a function)
# so that names such as `soup`, `watch_data` and `df` remain module-level
# variables for any code that runs after this block.
if __name__ == "__main__":
    # Load the HTML file
    html_file = "history/watch-history.html"
    if not html_file:
        print("Error: No file path provided.")
        # SystemExit is raised directly rather than via exit(), which is only
        # available when the `site` module has injected it.
        raise SystemExit(1)

    try:
        with open(html_file, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")
    except FileNotFoundError:
        print(f"Error: File not found at {html_file}")
        raise SystemExit(1)

    # Find all "content-cell" divs which contain the necessary data
    content_cells = soup.find_all(
        "div",
        class_="content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1",
    )

    # Check if content cells were found
    if not content_cells:
        print("Error: No relevant content cells found in the HTML.")
        raise SystemExit(1)
    print(f"Found {len(content_cells)} content cells.")

    # Extract one record per cell; a failure in one cell is reported and
    # skipped so that it does not abort the whole run.
    watch_data = []
    for i, cell in enumerate(content_cells, 1):
        try:
            watch_data.append(extract_watch_record(cell, i))
        except Exception as e:
            print(f"Error processing content cell [{i}]: {e}")

    # Convert the list to a DataFrame
    try:
        df = pd.DataFrame(watch_data)
    except Exception as e:
        print(f"Error converting to DataFrame: {e}")
        raise SystemExit(1)

    if df.empty:
        print("Error: No data extracted, DataFrame is empty.")
        raise SystemExit(1)
    print("Data successfully extracted and converted to DataFrame.")

    # Save the DataFrame to CSV
    output_file = "youtube_watch_history_parsed.csv"
    try:
        df.to_csv(output_file, index=False, encoding="utf-8")
        print(f"Data has been saved to '{output_file}'")
    except Exception as e:
        print(f"Error saving to CSV: {e}")


KeyboardInterrupt: 