# Dependencies

In [1]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import requests
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from PIL import Image

In [2]:
# Pull configuration from the environment (load_dotenv() also reads a local .env file).
load_dotenv()
data_path = os.getenv('DATA_PATH')
# API key handed to the YouTube client when channel thumbnails are fetched.
developer_key = os.getenv('YOUTUBE_API')

# Fail early with an actionable message instead of handing None to pandas,
# which would raise a far less helpful "invalid file path" error.
if not data_path:
    raise ValueError(
        "DATA_PATH is not set; point it at the watch-history JSON file "
        "(for example in a .env file next to this notebook)."
    )

watch_history = pd.read_json(data_path)

# Preprocessing

In [3]:
# Columns that are not used by the analysis.
unused_columns = ['description', 'products', 'details', 'activityControls', 'header']

watch_history = (
    watch_history
    # Remove deleted/unavailable videos (rows without a subtitles entry)
    .loc[lambda frame: frame['subtitles'].notna()]
    # Drop titles with 'Visited YouTube Music'
    .loc[lambda frame: ~frame['title'].str.contains('Visited YouTube Music')]
    # Drop titles with 'Answered survey question'
    .loc[lambda frame: ~frame['title'].str.contains('Answered survey question')]
    # Drop description, products, details, activityControls and header in one go
    .drop(columns=unused_columns)
)


In [4]:
def get_creator_name(lst: list[dict]):
    """Gets the creator name from a ``subtitles`` entry, if it exists.

    lst: Value of the ``subtitles`` field; expected to be a list whose first
        element is a dict carrying a ``'name'`` key.
    RETURNS: The ``'name'`` of the first element, or None when ``lst`` is not a
        list, is empty, or its first element has no usable ``'name'``.
    """
    if not isinstance(lst, list):
        return None
    try:
        return lst[0]['name']
    except (IndexError, KeyError, TypeError):
        # Malformed entry: behave like get_youtube_link and report "missing"
        # instead of aborting the whole .apply() over the column.
        return None
    
def get_youtube_link(lst: list[dict]):
    """Gets the channel link from a ``subtitles`` entry, if it exists.

    lst: Value of the ``subtitles`` field; expected to be a list whose first
        element is a dict carrying a ``'url'`` key.
    RETURNS: The ``'url'`` of the first element, or None when ``lst`` is not a
        list, is empty, or its first element has no usable ``'url'``.
    """
    if not isinstance(lst, list):
        return None
    try:
        return lst[0]['url']
    except (IndexError, KeyError, TypeError):
        # Only the lookup failures a malformed entry can cause are treated as
        # "no link"; anything else (e.g. KeyboardInterrupt) must propagate.
        return None

In [5]:
# Flatten the nested subtitles field into two plain columns, then discard it.
watch_history = watch_history.assign(
    creator=watch_history['subtitles'].apply(get_creator_name),
    creatorLink=watch_history['subtitles'].apply(get_youtube_link),
).drop(columns=['subtitles'])

In [6]:
# Convert time column to datetime.
# format='mixed' makes pandas infer the format of each value on its own
# (presumably because the export mixes timestamp layouts — TODO confirm).
watch_history['time'] = pd.to_datetime(watch_history['time'], format='mixed')

In [7]:
# Preview the first rows of the cleaned history to sanity-check the derived columns.
watch_history.head()

Unnamed: 0,title,titleUrl,time,creator,creatorLink
0,Watched aren't you a bit YOUNG to be a rollerc...,https://www.youtube.com/watch?v=Ko8uUXcd6L0,2023-07-21 17:56:57.518000+00:00,Ddog,https://www.youtube.com/channel/UC-J3ILdSPjlj8...
1,Watched Almost NOBODY knows about THIS feature...,https://www.youtube.com/watch?v=Mc4vc8vRw8I,2023-07-21 17:56:43.705000+00:00,Cinecom.net,https://www.youtube.com/channel/UCpLfM1_MIcIQ3...
2,Watched Emiru finally confronts Asmongold over...,https://www.youtube.com/watch?v=9e8te8VDJ_E,2023-07-21 17:53:48.096000+00:00,Asmongold Clips,https://www.youtube.com/channel/UCMwJJL5FJFuTR...
3,Watched $1M Invested to Start a Laundromat (Wa...,https://www.youtube.com/watch?v=0aEcWTxnLUI,2023-07-21 17:52:36.349000+00:00,UpFlip,https://www.youtube.com/channel/UChFahjDeMBV67...
4,Watched Redneck Engineering my own CPU Water C...,https://www.youtube.com/watch?v=RGZFb2PlPlo,2023-07-21 17:51:13.197000+00:00,Linus Tech Tips,https://www.youtube.com/channel/UCXuqSBlHAE6Xw...


# Functions

In [8]:
# Ranking of creators by number of watched videos
def get_most_watched_creator(df):
    """Ranks creators by how many history rows they have.

    df: Watch history of user that includes the columns ``creator`` and ``time``
    RETURNS: Dataframe with the columns ``creator`` and ``count`` (non-null
        ``time`` entries per creator), sorted by ``count`` descending, with a
        fresh 0..n-1 index.
    """
    per_creator = df.groupby(['creator'])['time'].count()
    ranking = per_creator.reset_index(name='count')
    ranking = ranking.sort_values(by=['count'], ascending=False)
    return ranking.reset_index(drop=True)


def top_watched_creator_all_time(df, number):
    """Gets the ``number`` most watched creators over the whole history.

    df: Watch history of user that includes creator and time
    number: How many rows of the ranking to keep
    RETURNS: The first ``number`` rows of get_most_watched_creator(df)
    """
    return get_most_watched_creator(df).head(number)


def top_watched_creator_between(df, start_date, end_date):
    """Ranks creators using only the rows watched inside a time window.

    df: Watch history of user that includes creator and time
    start_date: Lower bound for ``time`` (inclusive)
    end_date: Upper bound for ``time`` (inclusive)
    RETURNS: The full ranking from get_most_watched_creator for that window
    """
    inside_window = df['time'].between(start_date, end_date)
    return get_most_watched_creator(df[inside_window])


def starting_watch_hours(df):
    """Gets the hours of the day the user starts watching videos.

    df: Watch history with a ``time`` column (datetimes, or values that
        pd.to_datetime can parse)
    RETURNS: Dataframe with the columns ``hour`` and ``count``, one row per
        hour that occurs in the data, ordered by hour. Hours are read in
        whatever timezone the timestamps carry; no conversion happens here.
        The input frame is left untouched.
    """
    # Build a fresh frame instead of assigning columns onto a `df[['time']]`
    # slice, which is chained assignment (SettingWithCopyWarning).
    times = pd.to_datetime(df['time'])
    hourly = pd.DataFrame({'time': times, 'hour': times.dt.hour})
    counts = hourly.groupby(['hour']).count().reset_index()
    counts = counts.rename(columns={'time': 'count'})
    return counts.reset_index(drop=True)


def starting_watch_hours_between(df, start_date, end_date):
    """Gets the per-hour watch counts for rows inside a time window.

    df: Watch history with a ``time`` column
    start_date: Lower bound for ``time`` (inclusive)
    end_date: Upper bound for ``time`` (inclusive)
    RETURNS: Result of starting_watch_hours for the rows in the window
    """
    inside_window = df['time'].between(start_date, end_date)
    return starting_watch_hours(df[inside_window])


def starting_watch_days(df):
    """Gets the days of the week the user starts watching videos.

    df: Watch history with a ``time`` column (datetimes, or values that
        pd.to_datetime can parse)
    RETURNS: Dataframe with the columns ``day`` (ordered categorical,
        Monday..Sunday) and ``count``, one row per weekday that occurs in the
        data, ordered Monday first. The input frame is left untouched.
    """
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    # Build a fresh two-column frame. Copying the whole input and then slicing
    # it still left the later column assignment operating on a slice.
    times = pd.to_datetime(df['time'])
    daily = pd.DataFrame({'time': times, 'day': times.dt.day_name()})
    counts = daily.groupby(['day']).count().reset_index()
    counts = counts.rename(columns={'time': 'count'})

    # Alphabetical group order is useless for weekdays; sort by calendar order.
    counts['day'] = pd.Categorical(counts['day'], categories=day_order, ordered=True)
    return counts.sort_values(by='day').reset_index(drop=True)


def starting_watch_days_between(df: pd.DataFrame, start_date, end_date):
    """Gets the per-weekday watch counts for rows inside a time window.

    df: Watch history with a ``time`` column
    start_date: Lower bound for ``time`` (inclusive)
    end_date: Upper bound for ``time`` (inclusive)
    RETURNS: Result of starting_watch_days for the rows in the window
    """
    inside_window = df['time'].between(start_date, end_date)
    return starting_watch_days(df[inside_window])

def get_channel_id(channel_link: str):
    """Gets the channel id from a channel link.

    channel_link: Channel URL; the id is taken to be its last path segment.
    RETURNS: The text after the final '/' (a trailing '/' is ignored), or None
        when ``channel_link`` is not a string (e.g. None/NaN for rows where no
        link could be extracted).
    """
    if not isinstance(channel_link, str):
        # get_youtube_link yields None for entries without a url.
        return None
    return channel_link.rstrip('/').rsplit('/', 1)[-1]

def unique_creators(df: pd.DataFrame):
    """Gets the distinct creators the user has watched, with their channel ids.

    df: Watch history with the columns ``creator`` and ``creatorLink``
    RETURNS: Dataframe with the columns ``creator`` and ``channelId``, one row
        per distinct (creator, creatorLink) pair and a fresh 0..n-1 index. A
        creator name that occurs with several links therefore yields several
        rows.
    """
    pairs = df[['creator', 'creatorLink']].drop_duplicates()
    # .assign builds a new frame, so nothing is written onto a slice of `df`.
    return (
        pairs
        .assign(channelId=pairs['creatorLink'].apply(get_channel_id))
        .drop(columns=['creatorLink'])
        .reset_index(drop=True)
    )

def get_channel_profile_picture(channel_id: str):
    """Gets the channel profile picture from the channel id

    channel_id: The id of the channel
    RETURNS: The url of the channel's ``default`` thumbnail, or None when the
        request fails with an HttpError or the response contains no items.
    """
    youtube = build('youtube', 'v3', developerKey=developer_key)
    try:
        channel = youtube.channels().list(
            part="snippet",
            id=channel_id
        ).execute()
    except HttpError as e:
        print(f"An HTTP error {e.resp.status} occurred:\n{e.content}")
        return None

    # A response without items would make `['items'][0]` raise; report it as
    # "no picture" so the caller can decide what to do.
    items = channel.get('items') or []
    if not items:
        return None
    return items[0]['snippet']['thumbnails']['default']['url']

def save_channel_image(channel_id: str):
    """Downloads the channel image and saves it as channel_images/<channel_id>.jpg

    channel_id: The id of the channel
    RAISES: ValueError when no picture url is available for the channel;
        requests.HTTPError when the download answers with an error status.
    """
    url = get_channel_profile_picture(channel_id)
    if not url:
        # Without this check requests.get(None) fails with a confusing
        # "Invalid URL" error.
        raise ValueError(f"No profile picture URL available for channel {channel_id!r}")

    # A timeout keeps a stalled download from hanging the notebook.
    response = requests.get(url, timeout=10)
    # Do not write an error page to disk under a .jpg name.
    response.raise_for_status()

    os.makedirs("channel_images", exist_ok=True)
    with open(f"channel_images/{channel_id}.jpg", "wb") as file:
        file.write(response.content)

def channel_image_saved(channel_id: str):
    """Tells whether channel_images/<channel_id>.jpg already exists

    channel_id: The id of the channel
    RETURNS: True if that path exists, False otherwise"""
    image_path = f"channel_images/{channel_id}.jpg"
    already_there = os.path.exists(image_path)
    return already_there


def save_all_channel_images(df: pd.DataFrame):
    """Downloads the image of every channel in the dataframe that is not on disk yet

    df: Dataframe with a ``channelId`` column"""
    for channel_id in df['channelId']:
        if channel_image_saved(channel_id):
            continue
        save_channel_image(channel_id)

def get_dominant_color(image: "Image.Image"):
    """Gets a single representative color for the image

    The image is shrunk to one pixel and that pixel's color is returned. With
    an averaging resampling filter this is a blend of the image's colors
    rather than the literally most frequent one.

    Args:
        image: The PIL image to get the color from (any mode that can be
            converted to RGB)
    Returns: hex code of the color, e.g. "#ff0010"
    """
    # Resize first (as before), then normalize to RGB: grayscale/palette
    # images return a bare int from getpixel(), which cannot be unpacked
    # into three channels, and CMYK values are not RGB at all.
    pixel = image.resize((1, 1)).convert('RGB').getpixel((0, 0))
    return "#{:02x}{:02x}{:02x}".format(*pixel)  # Convert RGB to hex format

def get_channel_images(channel_ids):
    """Loads the saved image of every channel id

    channel_ids: Iterable of channel ids; channel_images/<id>.jpg is read for each
    RETURNS: List of the loaded images, in the order of ``channel_ids``"""
    return [
        mpimg.imread(f"channel_images/{one_id}.jpg")
        for one_id in channel_ids
    ]

In [9]:
# Lookup table of creator name -> channelId, built once and reused by later cells.
creators = unique_creators(watch_history)

In [10]:
# Attach each row's channel id straight from that row's own creatorLink.
# Merging `creators` on the creator name duplicates history rows whenever one
# name maps to more than one channel id (inflating every later count), and
# re-running the merge adds channelId_x/channelId_y columns.
watch_history = watch_history.assign(
    channelId=watch_history['creatorLink'].apply(get_channel_id)
).reset_index(drop=True)  # same fresh 0..n-1 index the merge used to produce

In [26]:
top_10_all_time = top_watched_creator_all_time(watch_history, 10)

# Merge channel ids from creators into top_10_all_time. `creators` can hold
# several rows for one creator name (one per distinct link); keep a single row
# per name so the left merge cannot turn ten creators into more than ten bars.
top_10_all_time = pd.merge(
    top_10_all_time,
    creators.drop_duplicates(subset='creator'),
    on='creator',
    how='left',
)

# Reverse Top 10 so the most watched creator is drawn last (right-most bar)
top_10_all_time = top_10_all_time.iloc[::-1]

In [27]:
# Download the profile pictures of the top creators; images already on disk are skipped.
save_all_channel_images(top_10_all_time)

In [34]:
def create_graph_with_images(df, title='Top 10 Creators of All Time by Clicked Videos'):
    """Creates a bar chart with one bar per channel, labelled by the channel image

    df: Dataframe with the columns ``channelId`` and ``count``; bars are drawn
        left to right in row order
    title: Figure title. Defaults to the all-time title so existing calls keep
        their output; pass a different one for e.g. date-window charts.
    """
    # Make sure every image is on disk before loading them.
    save_all_channel_images(df)
    channel_images = get_channel_images(df['channelId'])

    fig, ax = plt.subplots()
    # Create image-based tick labels
    for i, (image, count) in enumerate(zip(channel_images, df['count'])):
        imagebox = OffsetImage(image, zoom=0.3)
        # Anchor the picture at (bar position, 0); box_alignment puts the box's
        # top edge just under that point so it sits where a tick label would.
        ab = AnnotationBbox(imagebox, (i, 0), frameon=False, box_alignment=(0.5, 1.05))
        ax.add_artist(ab)
        # Color each bar after its channel image.
        color = get_dominant_color(Image.fromarray(image))
        ax.bar(i, count, color=color)

    # One tick per bar, without text: the images act as the labels.
    ax.set_xticks(range(len(df)))
    ax.set_xticklabels([])  # Hide default tick labels

    ax.set_ylabel('Videos clicked')
    ax.set_title(title)
    plt.show()

In [None]:
# Render the all-time top-10 bar chart.
create_graph_with_images(top_10_all_time)

In [38]:
def top_watched_between_graph(df, start_date, end_date, number_of_creators=10):
    """Creates a top watched graph in-between two dates and a number of creators

    df: Watch history of user that includes creator and time
    start_date: Lower bound for ``time`` (inclusive)
    end_date: Upper bound for ``time`` (inclusive)
    number_of_creators: How many of the most watched creators to plot

    Channel ids come from the notebook-level ``creators`` lookup.
    """
    ranking = top_watched_creator_between(df, start_date, end_date)
    ranking = ranking.head(number_of_creators)

    # Keep one lookup row per creator name so the left merge cannot add rows.
    # Previously a second head() ran after the reversal, so any extra merged
    # row pushed the most watched creators out of the plot.
    lookup = creators.drop_duplicates(subset='creator')
    ranking = pd.merge(ranking, lookup, on='creator', how='left')

    # Reverse so the most watched creator is drawn last (right-most bar).
    create_graph_with_images(ranking.iloc[::-1])

In [None]:
# Top creators for calendar year 2023.
# NOTE(review): both bounds are compared inclusively against the `time` column; if
# '2023-12-31' is interpreted as midnight, videos watched later on Dec 31 fall
# outside the window — confirm whether the end bound should be '2024-01-01'.
top_watched_between_graph(watch_history, '2023-01-01', '2023-12-31')