In [None]:
# Install necessary libraries
!pip install pandas google-api-python-client isodate tqdm seaborn matplotlib scipy numpy

In [None]:
import pandas as pd
import json

FILE_NAME = 'watch-history.json'
WATCH_URL_MARKER = 'youtube.com/watch?v='
TITLE_PREFIX = 'Watched '
# Fixed column order so the dataframe has these columns even when no record is found.
TAKEOUT_COLUMNS = ['video_ID', 'watch_timestamp', 'video_title']


def extract_watch_records(history):
    """Turn raw watch-history entries into flat records.

    Parameters
    ----------
    history : iterable
        Entries as loaded from the JSON file. Only dict entries that carry a
        string 'titleUrl' containing 'youtube.com/watch?v=', a string 'title'
        and a 'time' key are kept; everything else is skipped.

    Returns
    -------
    list of dict
        One dict per kept entry with the keys 'video_ID', 'watch_timestamp'
        and 'video_title'.
    """
    records = []
    for item in history:
        if not isinstance(item, dict) or 'time' not in item:
            continue

        url = item.get('titleUrl')
        title = item.get('title')
        if not isinstance(url, str) or not isinstance(title, str):
            continue
        if WATCH_URL_MARKER not in url:
            continue

        # Take the value of the `v` parameter that directly follows "watch?".
        # Splitting on a bare 'v=' would pick up later parameters such as '&pv=1'.
        video_id = url.partition(WATCH_URL_MARKER)[2].split('&', 1)[0].split('#', 1)[0]
        if not video_id:
            continue

        # Strip the prefix only once and only at the start, so a title that
        # itself contains "Watched " is left intact.
        if title.startswith(TITLE_PREFIX):
            title = title[len(TITLE_PREFIX):]

        records.append({
            'video_ID': video_id,
            'watch_timestamp': item['time'],
            'video_title': title.strip()
        })
    return records


try:
    with open(FILE_NAME, 'r', encoding='utf-8') as f:
        history = json.load(f)
except (OSError, ValueError) as e:
    # ValueError covers both malformed JSON and undecodable bytes.
    # Keep going with an empty history, but say why so the empty result is not a mystery.
    print(f"Could not load {FILE_NAME} ({type(e).__name__}: {e}). Continuing with an empty history.")
    history = []

if not isinstance(history, list):
    print(f"Unexpected top-level JSON type in {FILE_NAME}: {type(history).__name__}. Expected a list.")
    history = []

video_data = extract_watch_records(history)

# Passing `columns` guarantees 'video_ID' etc. exist even for an empty result,
# so later cells can index them without a KeyError.
df_takeout = pd.DataFrame(video_data, columns=TAKEOUT_COLUMNS)
# Output only the size of the dataframe
print(len(df_takeout))
# Output the first 5 records
print(df_takeout.head())

In [None]:
import pandas as pd
from googleapiclient.discovery import build
import time
from tqdm.notebook import tqdm
import os # Import os module to access environment variables

# --- API KEY HANDLING: Securely retrieve the key from environment variables ---
API_KEY = os.environ.get("YOUTUBE_API_KEY")

# Columns produced by the API fetch. Declared once so that both branches below
# (and an empty API result) yield dataframes with the same schema.
API_COLUMNS = ['video_ID', 'categoryName', 'viewCount', 'likeCount', 'duration_raw']

if not API_KEY:
    # If the key is not set in the environment, skip API calls.
    df_api = pd.DataFrame(columns=API_COLUMNS)
    # Create df_merged with the API columns present but empty (NaN), so later
    # cells that reference e.g. 'categoryName' do not raise a KeyError.
    df_merged = df_takeout.reindex(columns=[*df_takeout.columns, *API_COLUMNS[1:]])
    print("API_KEY not found in environment variables (YOUTUBE_API_KEY). Skipping API data fetch.")
    print(len(df_merged))
    print(df_merged.head())

else:
    # YouTube API client initialization
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    unique_video_ids = df_takeout['video_ID'].unique()
    print(len(unique_video_ids))

    api_results = []
    # NOTE(review): presumably the per-request ID limit of videos.list — confirm against the API docs.
    chunk_size = 50

    # Query IDs in chunks and display progress
    for i in tqdm(range(0, len(unique_video_ids), chunk_size), desc="API Query Progress"):
        chunk = unique_video_ids[i:i + chunk_size]

        try:
            request = youtube.videos().list(
                part="snippet,contentDetails,statistics",
                id=','.join(chunk)
            )
            response = request.execute()

            for item in response.get('items', []):
                stats = item.get('statistics', {})
                content = item.get('contentDetails', {})
                snippet = item.get('snippet', {})

                api_results.append({
                    'video_ID': item['id'],
                    # Holds the raw category ID here; mapped to a readable name below.
                    'categoryName': snippet.get('categoryId'),
                    # Missing counters are stored as 0.
                    'viewCount': int(stats.get('viewCount', 0)),
                    'likeCount': int(stats.get('likeCount', 0)),
                    'duration_raw': content.get('duration')
                })

        except Exception as e:
            # Stop fetching on the first failure, but report it: the results
            # gathered so far are kept and the remaining chunks are NOT queried.
            print(f"API request failed for the chunk starting at index {i} "
                  f"({type(e).__name__}: {e}). "
                  f"Stopping early with {len(api_results)} video records fetched.")
            break

        # Small pause between requests.
        time.sleep(0.1)

    # Explicit columns/dtypes keep the schema stable even when nothing was fetched.
    df_api = pd.DataFrame(api_results, columns=API_COLUMNS).astype(
        {'viewCount': 'int64', 'likeCount': 'int64'}
    )

    # Map YouTube category IDs to descriptive names
    category_map = {
        "1": "Film & Animation", "2": "Autos & Vehicles", "10": "Music", "15": "Pets & Animals",
        "17": "Sports", "19": "Travel & Events", "20": "Gaming", "22": "People & Blogs",
        "23": "Comedy", "24": "Entertainment", "25": "News & Politics", "26": "Howto & Style",
        "27": "Education", "28": "Science & Technology", "29": "Nonprofits & Activism",
        "30": "Movies", "43": "Shows"
    }
    # IDs that are missing or not in the map end up as 'Other'.
    df_api['categoryName'] = df_api['categoryName'].astype(str).map(category_map).fillna('Other')

    # Merge the two dataframes. The inner join drops watch records whose video
    # has no row in df_api (nothing returned for it, or fetching stopped early).
    df_merged = pd.merge(df_takeout, df_api, on='video_ID', how='inner')

    # Output only the final record count and the first 5 records
    print(len(df_merged))
    print(df_merged.head())

In [None]:
import numpy as np
import isodate
import pandas as pd


# Time zone used to derive hour/day features. The timestamps are normalized to
# UTC below, so with the default 'UTC' the hour and weekday are UTC values.
# Set this to your own IANA zone (e.g. 'Europe/Berlin') for local-time patterns.
LOCAL_TIMEZONE = 'UTC'

# Convert watch timestamp to timezone-aware datetimes in UTC.
# format='mixed' lets pandas infer the format per element.
df_merged['watch_timestamp'] = pd.to_datetime(df_merged['watch_timestamp'], utc=True, format='mixed')

# The stored column stays in UTC; only the derived features use the configured zone.
local_watch_time = df_merged['watch_timestamp'].dt.tz_convert(LOCAL_TIMEZONE)

# Extract time-based features
df_merged['hour_of_day'] = local_watch_time.dt.hour
day_names = local_watch_time.dt.day_name()
df_merged['is_weekend'] = day_names.isin(['Saturday', 'Sunday']).astype(int)

# Store days as an ordered categorical type (useful for plotting)
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df_merged['day_of_week'] = pd.Categorical(day_names, categories=day_order, ordered=True)

def convert_iso_to_seconds(iso_duration):
    """Convert an ISO 8601 duration string to a number of seconds.

    Parameters
    ----------
    iso_duration : Any
        Expected to be a duration string; any other value (None, NaN, numbers)
        is treated as "no duration".

    Returns
    -------
    float or int
        The duration in seconds, or 0 when the input is not a string or
        cannot be parsed.
    """
    if not isinstance(iso_duration, str):
        return 0
    try:
        return isodate.parse_duration(iso_duration).total_seconds()
    except Exception:
        # Only ordinary errors fall back to 0; KeyboardInterrupt and SystemExit
        # still propagate, unlike with a bare `except:`.
        return 0

# Convert video durations to seconds. Add safety check for missing API columns.
if 'duration_raw' in df_merged.columns:
    df_merged['duration_seconds'] = df_merged['duration_raw'].apply(convert_iso_to_seconds)
else:
    df_merged['duration_seconds'] = 0

# Apply log transformation (log(1 + x)) to view and like counts.
# Each column is checked on its own so a missing one yields NaN instead of a KeyError.
for count_column in ('viewCount', 'likeCount'):
    log_column = f'log_{count_column}'
    if count_column in df_merged.columns:
        df_merged[log_column] = np.log1p(df_merged[count_column])
    else:
        df_merged[log_column] = np.nan

# dropna(subset=...) raises a KeyError for a column that does not exist, which
# is the case for 'categoryName' when the API fetch was skipped. Add it as
# all-missing so those rows are dropped instead of the cell crashing.
if 'categoryName' not in df_merged.columns:
    df_merged['categoryName'] = np.nan

# Create final clean dataframe by dropping rows with missing essential data
df_final = df_merged.dropna(subset=['log_viewCount', 'categoryName', 'hour_of_day']).copy()

# Output only the descriptive statistics.
# describe() summarizes numeric columns by default, so the categorical
# 'day_of_week' does not appear in the printed table.
print(df_final[['hour_of_day', 'day_of_week', 'log_viewCount', 'duration_seconds']].describe())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

sns.set(style="whitegrid", font_scale=1.1)

if not df_final.empty:
    # Bar Plot: Total Watch Count by Category
    plt.figure(figsize=(12, 6))
    # value_counts() already returns the counts in descending order.
    category_counts_plot = df_final['categoryName'].value_counts()
    # The x variable is also passed as `hue` (with the legend disabled) so that
    # `palette` is applied without seaborn's "palette without hue" deprecation
    # warning; this matches how the boxplot later in the notebook is called.
    sns.barplot(
        x=category_counts_plot.index,
        y=category_counts_plot.values,
        hue=category_counts_plot.index,
        palette="viridis",
        legend=False,
    )
    plt.title('Total Watch Count by Video Category')
    plt.xlabel('Video Category')
    plt.ylabel('Watch Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
else:
    print("Skipping Category Count Visualization: Dataframe is empty.")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

sns.set(style="whitegrid", font_scale=1.1)

if df_final.empty:
    print("Skipping Hour Count Visualization: Dataframe is empty.")
else:
    # Number of watch records per hour, ordered by hour.
    watches_per_hour = df_final['hour_of_day'].value_counts().sort_index()

    # Bar plot of viewing frequency by hour, drawn on an explicit Axes.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(
        x=watches_per_hour.index,
        y=watches_per_hour.values,
        color='skyblue',
        ax=ax,
    )
    ax.set_title('Viewing Frequency by Hour of Day')
    ax.set_xlabel('Hour of Day (24h)')
    ax.set_ylabel('Watch Count')
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, f_oneway, pearsonr
import pandas as pd

sns.set(style="whitegrid", font_scale=1.1)

ALPHA = 0.05                # significance threshold quoted in the conclusion messages
TOP_N_CATEGORIES = 7        # number of categories shown in the box plot
MIN_CATEGORY_WATCHES = 10   # minimum watch records for a category to enter the tests


def report_p_value(p_value, significant_msg, not_significant_msg):
    """Print a test's p-value followed by the matching conclusion line.

    A NaN p-value (e.g. from constant input) is reported as inconclusive
    rather than being read as "not significant".
    """
    if pd.isna(p_value):
        print("P-Value: undefined (NaN)")
        print("Conclusion: Test inconclusive (the statistic could not be computed).")
        return
    print(f"P-Value: {p_value:.10f}")
    print("Conclusion: " + (significant_msg if p_value < ALPHA else not_significant_msg))


# Check if df_final is empty (which happens if API fetch was skipped and dropped NaN rows)
if df_final.empty:
    print("Warning: df_final is empty (likely due to missing API data). Skipping visualizations and hypothesis tests.")

else:
    # Heatmap: Viewing Density (Personal Consumption Pattern)
    plt.figure(figsize=(12, 8))
    # observed=False keeps all seven weekdays as rows even if some have no records.
    heatmap_data = df_final.groupby(['day_of_week', 'hour_of_day'], observed=False).size().unstack(fill_value=0)
    # Likewise show all 24 hours as columns, so the axis does not depend on
    # which hours happen to occur in the data.
    heatmap_data = heatmap_data.reindex(columns=range(24), fill_value=0)
    sns.heatmap(heatmap_data, cmap="YlGnBu", linewidths=.5, cbar_kws={'label': 'Watch Count'})
    plt.title('Viewing Density by Day of Week and Hour (Personal Patterns)')
    plt.xlabel('Hour of Day')
    plt.ylabel('Day of Week')
    plt.show()

    # Box Plot: View Count Distribution for Top Categories
    top_categories = df_final['categoryName'].value_counts().head(TOP_N_CATEGORIES).index
    df_top = df_final[df_final['categoryName'].isin(top_categories)]

    plt.figure(figsize=(12, 6))
    # The 'y' variable is passed as 'hue' to suppress the deprecation warning without changing the plot logic
    sns.boxplot(x='log_viewCount', y='categoryName', data=df_top, palette="Pastel1", hue='categoryName', legend=False)
    plt.title('Distribution of Log(View Count) in Top Categories')
    plt.xlabel('Log(1 + View Count)')
    plt.ylabel('Video Category')
    plt.show()

    # Keep only categories with at least MIN_CATEGORY_WATCHES watch records
    # so the hypothesis tests are not driven by tiny groups.
    category_counts = df_final['categoryName'].value_counts()
    valid_categories = category_counts[category_counts >= MIN_CATEGORY_WATCHES].index
    df_test = df_final[df_final['categoryName'].isin(valid_categories)].copy()

    # --- HYPOTHESIS TEST BLOCK 1: Chi-Square (Time vs Category) ---
    print("\n--- HYPOTHESIS TEST 1: Time vs Category ---")
    contingency_table = pd.crosstab(df_test['day_of_week'], df_test['categoryName'])

    # Need to ensure the table isn't trivial before Chi-Square
    if contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
        chi2, p_chi2, dof, expected = chi2_contingency(contingency_table)
        report_p_value(p_chi2, "Relationship exists (p < 0.05).", "No relationship found (p >= 0.05).")
    else:
        print("Test Skipped: Insufficient data for Chi-Square test.")

    # --- HYPOTHESIS TEST BLOCK 2: ANOVA (Category Popularity Difference) ---
    print("\n--- HYPOTHESIS TEST 2: Category Popularity ---")
    # One array of log view counts per valid category; every group is
    # non-empty because df_test was filtered to exactly these categories.
    anova_groups = [
        df_test.loc[df_test['categoryName'] == cat, 'log_viewCount'].to_numpy()
        for cat in valid_categories
    ]

    # Need to ensure we have at least two groups for ANOVA
    if len(anova_groups) >= 2:
        f_stat, p_anova = f_oneway(*anova_groups)
        report_p_value(p_anova, "Significant difference exists (p < 0.05).",
                       "No significant difference found (p >= 0.05).")
    else:
        print("Test Skipped: Less than two valid categories for ANOVA test.")

    # --- HYPOTHESIS TEST BLOCK 3: Pearson Correlation (Time vs Popularity) ---
    print("\n--- HYPOTHESIS TEST 3: Time vs Popularity ---")
    # NOTE(review): hour_of_day is cyclic (23 is adjacent to 0); a linear
    # correlation treats it as a plain number, so interpret r with care.
    # Check for sufficient data points
    if len(df_test) > 1:
        corr, p_corr = pearsonr(df_test['hour_of_day'], df_test['log_viewCount'])
        print(f"Correlation Coefficient (r): {corr:.4f}")
        report_p_value(p_corr, "Relationship exists (p < 0.05).", "No relationship found (p >= 0.05).")
    else:
        print("Test Skipped: Insufficient data for Pearson correlation test.")