In [1]:
import json
import os
import re
import time
from datetime import datetime
from itertools import islice

import pandas as pd
from googleapiclient.discovery import build

### Parsing the Video-IDs of my Watch History into an Array

In [None]:
# Read the exported watch history (a JSON array of history entries).
with open('watch-history.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

watch_hist_ids = []    # One video ID per qualifying history entry, in file order
watched_time_map = {}  # Video ID -> watched timestamp

for entry in data:
    # Entries without a link cannot be resolved to a video.
    if 'titleUrl' not in entry:
        continue

    title_url = entry['titleUrl']
    if 'v=' not in title_url:
        continue

    # The ID is the text after the first 'v=' up to the next '&' (if any).
    video_id = title_url.split('v=')[1].partition('&')[0]
    timestamp = entry.get('time')
    if not (video_id and timestamp):
        continue

    watch_hist_ids.append(video_id)
    # NOTE: assigning by key means an ID that occurs more than once keeps only
    # the timestamp of its last occurrence in the file.
    watched_time_map[video_id] = timestamp

print(f"Extracted {len(watch_hist_ids)} video IDs.")

Extracted 32525 video IDs.


### Retrieving Meta-Data of the Videos Watched:
*Category ID*, 
*Watch Date*, 
*Duration Time*

In [None]:

# Initialize the YouTube API client.
# The API key is read from the YOUTUBE_API_KEY environment variable instead of
# being embedded in the notebook, so sharing or committing the notebook does
# not leak the credential.
# NOTE(review): a key was previously hard-coded in this cell; treat it as
# compromised and revoke/rotate it.
API_KEY = os.environ.get('YOUTUBE_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable before running this cell."
    )
youtube = build('youtube', 'v3', developerKey=API_KEY)
def parse_duration(duration_str):
    """Convert an ISO 8601 duration string into a total number of seconds.

    Accepts the ``P#DT#H#M#S`` shape, where every component is optional, so
    both ``PT4M13S`` and day-long values such as ``P1DT2H`` or ``P0D`` are
    handled.

    Args:
        duration_str: Duration text, e.g. ``"PT1H2M3S"``.

    Returns:
        The duration as an integer number of seconds.

    Raises:
        ValueError: If ``duration_str`` does not have the expected shape.
    """
    # Days precede the 'T' separator; hours/minutes/seconds follow it.
    pattern = r"P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?"
    match = re.fullmatch(pattern, duration_str)
    if match is None:
        raise ValueError(f"Unrecognized ISO 8601 duration: {duration_str!r}")

    # Components that are absent come back as None and count as zero.
    days, hours, minutes, seconds = (int(part or 0) for part in match.groups())

    total_seconds = ((days * 24 + hours) * 60 + minutes) * 60 + seconds
    return total_seconds


def fetch_video_details(ids, map, max_retries=3):
    """Fetch category and duration metadata for videos in batches.

    Uses the module-level ``youtube`` API client and ``parse_duration``.

    Args:
        ids: Sequence of video ID strings.
        map: Mapping of video ID to its watched timestamp; IDs missing from
            it are recorded with ``'Unknown'``. (The name shadows the builtin
            but is kept so existing callers continue to work.)
        max_retries: Number of attempts made per batch before it is skipped.

    Returns:
        A list of dicts with the keys ``'categoryId'``, ``'watched_at'`` and
        ``'durationv'`` (duration in seconds), one per video the API returned
        with complete metadata.
    """
    all_hist_data = []  # List to store metadata
    batch_size = 50  # Maximum batch size allowed by API
    total_batches = (len(ids) + batch_size - 1) // batch_size  # Total number of batches

    for batch_no, start in enumerate(range(0, len(ids), batch_size), start=1):
        batch = ids[start:start + batch_size]

        # Try the request a bounded number of times so that one transient
        # failure does not silently drop up to 50 videos.
        response = None
        for attempt in range(1, max_retries + 1):
            try:
                request = youtube.videos().list(
                    part='snippet,contentDetails', id=','.join(batch)
                )
                response = request.execute()
                break
            except Exception as e:
                print(f"Error fetching batch {batch_no} (attempt {attempt}/{max_retries}): {e}")
                time.sleep(5)  # Pause before retrying

        if response is None:
            print(f"Batch {batch_no}/{total_batches} skipped after {max_retries} failed attempts.")
            continue

        # Extract relevant metadata; a malformed item is skipped on its own
        # instead of aborting the remainder of the batch.
        for item in response.get('items', []):
            try:
                video_id = item['id']
                video_data = {
                    'categoryId': item['snippet']['categoryId'],
                    'watched_at': map.get(video_id, 'Unknown'),  # Add watched time from map
                    'durationv': parse_duration(item['contentDetails']['duration']),
                }
            except (KeyError, TypeError, ValueError, AttributeError) as e:
                print(f"Skipping video {item.get('id', '<unknown>')} with incomplete metadata: {e!r}")
                continue
            all_hist_data.append(video_data)

        print(f"Batch {batch_no}/{total_batches} fetched successfully.")
        time.sleep(1)  # Pause to avoid hitting rate limits

    return all_hist_data

#### Putting the Fetched Data into a JSON File for Further Analysis

In [None]:

# Fetch metadata for every extracted video ID, then persist the records as
# indented UTF-8 JSON so later cells can reload them without calling the API.
video_metadata = fetch_video_details(watch_hist_ids, watched_time_map)

with open('video_metadata.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(video_metadata, ensure_ascii=False, indent=4))

print(f"Fetched data for {len(video_metadata)} videos.")

### Creating One JSON File per Year with Monthly Totals per Category for Easier Analysis

In [None]:
def parse_date(value):
    """Parse a watched-at timestamp string into a naive ``datetime``.

    Defined in this cell so the notebook runs on a fresh kernel.

    Args:
        value: Timestamp text such as ``"2024-01-15T10:30:00.123Z"`` or
            ``"2024-01-15T10:30:00Z"``.
            NOTE(review): assumes the export uses these 'Z'-suffixed UTC
            forms; confirm against the actual watch-history file.

    Returns:
        A ``datetime`` on success, or ``None`` for non-strings and for text
        in any other format (for example the ``'Unknown'`` placeholder).
    """
    if not isinstance(value, str):
        return None
    for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ"):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None


# Load the JSON data (written as UTF-8 by the export step)
with open("video_metadata.json", "r", encoding="utf-8") as file:
    data = json.load(file)

if not data:
    raise ValueError("video_metadata.json contains no video records to summarize.")

# Parse data into a pandas DataFrame
df = pd.DataFrame(data)

# Convert 'watched_at' to datetime; unparseable values become NaT
df['watched_at'] = pd.to_datetime(df['watched_at'].apply(parse_date), errors='coerce')

# Extract year and month for grouping
df['year'] = df['watched_at'].dt.year
df['month'] = df['watched_at'].dt.month
df['count'] = 1

# Group by year, month, and category to calculate total duration
grouped = (
    df.groupby(['year', 'month', 'categoryId'])
    .agg(
        total_duration=('durationv', 'sum'),  # Sum of video durations
        category_count=('count', 'sum')  # Count of videos per category
    )
    .reset_index()
)

# Convert total duration to minutes
grouped['total_duration'] = grouped['total_duration'] / 60.0

# Prepare the yearly data: one file per year, one entry per month with data
for year in df['year'].dropna().unique():
    year_rows = grouped[grouped['year'] == year]
    yearly_data = []

    # groupby yields only the months that have rows, in ascending order
    for month, month_data in year_rows.groupby('month'):
        categories_count = dict(zip(month_data['categoryId'], month_data['category_count']))
        categories_duration = dict(zip(month_data['categoryId'], month_data['total_duration']))

        # Cast to built-in types so json can serialize the values
        month_summary = {
            "month": int(month),
            "total_duration_minutes": float(month_data['total_duration'].sum()),  # Total duration in minutes
            "categories_count": {str(k): int(v) for k, v in categories_count.items()},  # Category counts
            "categories_duration_minutes": {str(k): float(v) for k, v in categories_duration.items()}  # Duration per category in minutes
        }
        yearly_data.append(month_summary)

    # Write to a yearly JSON file
    with open(f"video_metadata{int(year)}.json", "w", encoding="utf-8") as year_file:
        json.dump(yearly_data, year_file, indent=4)