In [None]:
from datetime import timedelta, datetime
import numpy as np 
import pandas as pd # >= 1.2.4
import seaborn as sns
import matplotlib.pylab as plt 
from matplotlib.dates import MonthLocator 
from matplotlib.dates import DateFormatter

In [None]:
from netflix_tools import Entries

# Load the Netflix viewing-activity export; all later cells work on this frame.
e = Entries("private/ViewingActivity.csv")
df = e.df

# Calendar features derived from each entry's 'start' timestamp.
df['startdate'] = [ts.date() for ts in df['start']]
df['starttime'] = [ts.hour for ts in df['start']]  # hour component only, despite the name
df['year'] = [ts.year for ts in df['start']]
df['month'] = [(ts.year, ts.month) for ts in df['start']]  # (year, month) tuple
df['month_only'] = [ts.month for ts in df['start']]
df['weekday'] = [ts.weekday() for ts in df['start']]

# The same duration expressed in coarser units, each derived from the previous one.
df['duration_in_minutes'] = df['duration_in_seconds'] / 60
df['duration_in_hours'] = df['duration_in_minutes'] / 60
df['duration_in_days'] = df['duration_in_hours'] / 24


# Distinct values of the 'user' column, in order of first appearance.
USERS = df['user'].unique()

In [None]:
def plot_grouped(group='user', value='duration_in_days'):
    """
    Show a bar chart of ``value`` summed per ``group``.

    group: column of the global ``df`` to group by (this notebook uses
        'user', 'weekday', 'year' and 'month_only').
    value: numeric column of ``df`` that is summed within each group.

    The figure is drawn and shown; nothing is returned.
    """
    totals = df.groupby(group, sort=True)[value].sum()
    if group == 'weekday':
        # The 'weekday' column holds datetime-style weekday numbers (0 = Monday).
        # Reindexing guarantees exactly seven bars, so the positional day names
        # set below stay aligned even when some weekday has no entries.
        totals = totals.reindex(range(7), fill_value=0)

    # One colour per bar: a fixed-size palette would repeat colours as soon as
    # there are more bars than palette entries (e.g. twelve months).
    ax = totals.plot.bar(color=sns.color_palette("Spectral_r", len(totals)))
    for container in ax.containers:
        ax.bar_label(container, fmt='%.1f')
    if group == 'weekday':
        plt.xticks(np.arange(7), ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])

    # Name the unit that is actually plotted instead of always claiming "days".
    unit_prefix = 'duration_in_'
    if value.startswith(unit_prefix):
        title = f"Busiest {group}, watch time in {value[len(unit_prefix):]}"
    else:
        title = f"Busiest {group}, total {value}"
    ax.set_title(title)
    ax.set_xlabel(group)
    ax.set_ylabel(value)
    plt.show()

In [None]:
# Total watch time per user.
plot_grouped(group='user')

In [None]:
# Watch-time totals per weekday, per calendar year and per month of the year.
plot_grouped(group='weekday')
plot_grouped(group='year')
plot_grouped(group='month_only')

In [None]:
def barplot(xaxis='startdate', yaxis='duration_in_hours', datetime=datetime(2020,1,1), size=(16,5)):
    """
    Plot a stacked bar chart of watch time per Netflix user.

    xaxis: column of the global ``df`` that defines the bars, one of
        'startdate', 'starttime', 'year', 'month' or 'weekday'.
    yaxis: numeric column of ``df`` summed per (xaxis, user) pair.
    datetime: only entries whose 'start' is later than this are plotted.
        For a long time series pick a recent cut-off to keep the plot
        readable. Note that inside this function the parameter shadows
        the ``datetime`` class.
    size: figure size, passed to ``plt.subplots(figsize=...)``.

    Raises ValueError when no entry starts after ``datetime``.
    The figure is drawn and shown; nothing is returned.
    """
    # Select rows and columns in one .loc call so no chained-assignment
    # warning can be triggered further down.
    recent = df.loc[df['start'] > datetime, ['user', xaxis, yaxis]]
    if recent.empty:
        raise ValueError(f"no viewing activity after {datetime}")

    # Rows are the x-axis values, columns the users, cells the summed yaxis.
    # reindex keeps every user in USERS as a column (all-NaN when that user
    # has nothing after the cut-off) instead of failing on a missing label,
    # and keeps the colour assigned to each user stable between plots.
    pivot_df = (
        recent.groupby([xaxis, 'user'])[yaxis].sum()
        .unstack('user')
        .reindex(columns=USERS)
        .rename_axis(columns='user')
    )

    fig, ax = plt.subplots(figsize=size)
    pivot_df.plot(kind='bar', stacked=True, ax=ax, width=0.95,
                  color=sns.color_palette("Spectral_r", len(USERS)))
    # pandas bar plots put the bars at integer positions, so matplotlib's
    # date tick helpers (MonthLocator / DateFormatter) cannot be used here.
    ax.set_ylabel(yaxis)
    ax.set_xlabel(xaxis)
    plt.xticks(rotation=70)
    plt.tight_layout()
    plt.show()
# Stacked per-user totals for every (year, month) after 1 January of start_year.
start_year = 2020
barplot(xaxis='month', datetime=datetime(year=start_year, month=1, day=1), size=(14, 8))

In [None]:
# Stacked per-user totals for every year after 1 January of start_year.
start_year = 2015
barplot(xaxis='year', datetime=datetime(year=start_year, month=1, day=1), size=(8, 5))

In [None]:
def get_show(s):
    """Return the part of title ``s`` before its first ':' (all of ``s`` if it has none)."""
    show_name, _, _ = s.partition(':')
    return show_name
def print_watch_hours(selected_user=None, topk=10, since=datetime(2014, 1, 1), data=None):
    """
    Return the most watched shows, ranked by total watch time in hours.

    selected_user: only count entries of this user; None counts every user.
    topk: number of rows to return (the result is sliced with ``[:topk]``).
    since: only entries whose 'start' is later than this are counted.
    data: frame with 'title', 'user', 'start' and 'duration_in_hours'
        columns; defaults to the notebook-wide ``df``.

    Returns a DataFrame with columns 'real_title' (the title up to its first
    ':') and 'watched_show' (summed hours), sorted by 'watched_show' in
    descending order. Entries without a title are ignored.
    """
    frame = df if data is None else data

    keep = frame['start'] > since
    if selected_user is not None:
        keep &= frame['user'] == selected_user
    # A single .loc selection followed by assign() builds a new frame, so no
    # column is ever written onto a slice of the source data.
    recent = frame.loc[keep, ['title', 'duration_in_hours']]

    ranking = (
        recent
        # Show name = text before the first ':'; missing titles stay missing
        # and are then dropped by groupby.
        .assign(real_title=recent['title'].str.split(':', n=1).str[0])
        .groupby('real_title', as_index=False)['duration_in_hours'].sum()
        .rename(columns={'duration_in_hours': 'watched_show'})
        .sort_values(by='watched_show', ascending=False)
    )

    audience = "all users" if selected_user is None else selected_user
    print("Printing most watched shows for", audience, "in hours")
    return ranking.iloc[:topk]
 
# Show the 20 most watched shows of the first user listed in USERS.
print_watch_hours(selected_user=USERS[0], topk=20)