In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np

In [None]:
# Load the data for my Netflix activity
pwd = %pwd
netflix_raw = pd.read_csv(pwd + '/datasets/own/netflix_viewing_activity.csv')

In [None]:
# Check raw data: preview the first rows of the loaded frame
netflix_raw.head()

In [None]:
# Check the data types: info() lists every column with its dtype and
# its number of non-null values
netflix_raw.info()

In [None]:
# Parse the start timestamps, then keep only their monthly period
# (year + month) in a new 'Year Month' column
start_times = pd.to_datetime(netflix_raw['Start Time'])
netflix_raw['Year Month'] = start_times.dt.to_period('M')

In [None]:
# Create column for duration in minutes.
# 'Duration' holds 'hours:minutes:seconds' strings; pd.to_timedelta parses the
# whole column in one vectorised call instead of splitting and converting
# every row inside apply().
# Note: a missing duration now yields NaN here rather than raising an error.
netflix_raw['Duration (min)'] = (
    pd.to_timedelta(netflix_raw['Duration']).dt.total_seconds() / 60.0
)

In [None]:
# Calendar year of every row, taken from its 'Year Month' period
activity_year = netflix_raw['Year Month'].dt.year

# Keep only the rows from 2020
netflix_2020 = netflix_raw[activity_year == 2020]

# Keep only the rows from 2019
netflix_2019 = netflix_raw[activity_year == 2019]

In [None]:
# Sum duration per month for 2020.
# numeric_only=True limits the aggregation to numeric columns: since pandas 2.0
# a plain .sum() no longer drops text columns and would try to sum them too.
netflix_2020_sum = netflix_2020.groupby(['Year Month']).sum(numeric_only=True)

# Convert the monthly totals from minutes to hours and from hours to days
netflix_2020_sum['Duration (hours)'] = netflix_2020_sum['Duration (min)'] / 60.0
netflix_2020_sum['Duration (days)'] = netflix_2020_sum['Duration (hours)'] / 24.0

In [None]:
# Sum duration per month for 2019.
# numeric_only=True limits the aggregation to numeric columns: since pandas 2.0
# a plain .sum() no longer drops text columns and would try to sum them too.
netflix_2019_sum = netflix_2019.groupby(['Year Month']).sum(numeric_only=True)

# Convert the monthly totals from minutes to hours and from hours to days
netflix_2019_sum['Duration (hours)'] = netflix_2019_sum['Duration (min)'] / 60.0
netflix_2019_sum['Duration (days)'] = netflix_2019_sum['Duration (hours)'] / 24.0

In [None]:
# Add missing months with 0 for Duration for 2019.
# Only gaps between the first and the last month present in the data are
# filled; every column of an inserted month is set to 0.
first_month = min(netflix_2019_sum.index)
last_month = max(netflix_2019_sum.index)
idx = pd.period_range(first_month, last_month)
netflix_2019_sum = netflix_2019_sum.reindex(idx, fill_value=0)

In [None]:
# Plot for 2020: one bar per month with the hours watched
fig, ax = plt.subplots()
fig.set_size_inches((16, 6))

hours_2020 = netflix_2020_sum['Duration (hours)']
# Computed once; used for both the average line and its label
avg_hours_2020 = hours_2020.mean()

# Create palette for the barplot based on y-values:
# argsort().argsort() turns the hours into ranks (0 = smallest month), and each
# rank picks that bar's colour from the reversed palette
pal = sns.color_palette("Greens_d", len(netflix_2020_sum))
rank = hours_2020.argsort().argsort()

# Use seaborn to draw a barplot
sns.barplot(x=netflix_2020_sum.index, y='Duration (hours)', data=netflix_2020_sum, ax=ax,
            palette=np.array(pal[::-1])[rank])

# Set the title
ax.set_title('Netflix Activity 2020', fontsize=24)

# Set the y limit: keep the fixed 0-80 hours range whenever the data fits, but
# grow it (with 10% headroom) if a month exceeds 80 hours so no bar is cut off
y_max_2020 = max(80, 1.1 * hours_2020.max())
ax.set_ylim([0, y_max_2020])

# Add a horizontal line with the duration avg in hours
ax.axhline(avg_hours_2020, ls='--', color='red')
# The label sits at 75% of the axis height (y = 60 on the 0-80 range);
# int() truncates the average to whole hours
ax.text(0.3, 0.75 * y_max_2020, 'AVG = ' + str(int(avg_hours_2020)) + ' (hours)',
        fontsize=18, color='red')
plt.show()

In [None]:
# Plot for 2019: one bar per month with the hours watched
fig, ax = plt.subplots()
fig.set_size_inches((16, 6))

hours_2019 = netflix_2019_sum['Duration (hours)']
# Computed once; used for both the average line and its label
avg_hours_2019 = hours_2019.mean()

# Create palette for the barplot based on y-values:
# argsort().argsort() turns the hours into ranks (0 = smallest month), and each
# rank picks that bar's colour from the reversed palette
pal = sns.color_palette("Greens_d", len(netflix_2019_sum))
rank = hours_2019.argsort().argsort()

# Use seaborn to draw a barplot
sns.barplot(x=netflix_2019_sum.index, y='Duration (hours)', data=netflix_2019_sum, ax=ax,
            palette=np.array(pal[::-1])[rank])

# Set the title
ax.set_title('Netflix Activity 2019', fontsize=24)

# Set the y limit: keep the fixed 0-80 hours range whenever the data fits, but
# grow it (with 10% headroom) if a month exceeds 80 hours so no bar is cut off
y_max_2019 = max(80, 1.1 * hours_2019.max())
ax.set_ylim([0, y_max_2019])

# Add a horizontal line with the duration avg in hours
ax.axhline(avg_hours_2019, ls='--', color='red')
# The label sits at 75% of the axis height (y = 60 on the 0-80 range);
# int() truncates the average to whole hours
ax.text(0.3, 0.75 * y_max_2019, 'AVG = ' + str(int(avg_hours_2019)) + ' (hours)',
        fontsize=18, color='red')
plt.show()