In [None]:
import pandas as pd
import json 
import numpy as np
import os
import matplotlib.pyplot as plt

In [None]:
# Correspondence between categories and amenities.
# Kept under its own name: `categories` is assigned again further down in this
# notebook as a plain list of category labels, so reusing that name here would
# make the variable's type depend on which cell ran last.
# JSON text is read explicitly as UTF-8 instead of the platform's default encoding.
with open('categories-amenities.json', encoding='utf-8') as f:
    category_amenities = json.load(f)

category_amenities

In [None]:
# Original dataframe, loaded from disk with its column types normalised.
df = pd.read_csv('../nearest_hundred_final.csv')

# Both interval endpoints are parsed into datetimes.
for ts_col in ('start', 'end'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Labels are cast to int before being turned into text
# (presumably so a value such as 3.0 is rendered as '3' -- TODO confirm).
df['label'] = df['label'].map(int).map(str)

# Identifier-like columns are handled as plain strings.
for id_col in ('user', 'category'):
    df[id_col] = df[id_col].map(str)

df.head()

In [None]:
# Length of every record in hours:
# timedelta -> float seconds -> minutes -> hours.
df['duration'] = (df['end'] - df['start']) / np.timedelta64(1, 's') / 60 / 60
df.head()

In [None]:
# Keep only the columns needed for the per-day analysis; the start timestamp
# is renamed to 'date' and reduced to its calendar date.
new_df = (
    df[["user", "start", "category", "duration"]]
    .rename(columns={"start": "date"})
    .assign(date=lambda frame: frame["date"].dt.date)
)
new_df

In [None]:
# Day of the week for every record, as returned by date.weekday()
# (Monday is 0, Sunday is 6).
new_df['dayofweek'] = new_df['date'].map(lambda day: day.weekday())
new_df

In [None]:
# Weekend dataframe: records whose weekday index is 5 (Saturday) or 6 (Sunday).
weekend_df = new_df[new_df['dayofweek'].isin([5, 6])]
weekend_df

In [None]:
# Weekday dataframe: records whose weekday index is 0-4 (Monday to Friday).
weekday_df = new_df[new_df['dayofweek'].isin(list(range(5)))]
weekday_df

### Total time per day by user dataframe

In [None]:
# Total hours per (user, date, category).
df1 = (
    new_df
    .groupby(by=['user', 'date', 'category'])['duration']
    .sum()
)
df1

### Average daily time per user dataframe

In [None]:
# Average of the daily totals for each (user, category) pair, grouping on the
# index levels of the per-day series.
df2 = df1.groupby(level=['user', 'category']).mean()
df2

In [None]:
# Distinct category labels, in order of first appearance in the original dataframe.
categories = df['category'].drop_duplicates().tolist()
categories

### Distribution of users for each category throughout the month

In [None]:
def distribution(df, title):
    """Plot, per category, a histogram of the users' average daily hours.

    Durations are summed per (user, date, category) and then averaged over
    dates for each (user, category). One histogram of those per-user averages
    is drawn for every entry of the notebook-level ``categories`` list. The
    figure is saved as ``<title>.png`` in the working directory and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'user', 'date', 'category' and 'duration' columns.
    title : str
        Prefix of every subplot title and stem of the output file name.
    """
    colCnt = 3
    # The layout used to be a fixed 3x3 grid, so a tenth category made
    # plt.subplot raise. Three rows stay the minimum (same picture as before
    # for up to nine categories); extra rows are added only when needed.
    rowCnt = max(3, -(-len(categories) // colCnt))  # ceiling division
    fig = plt.figure(figsize=(25, 20))
    init_df = (
        df.groupby(['user', 'date', 'category'])['duration'].sum()
        .groupby(['user', 'category']).mean()
        .reset_index()
    )
    for subCnt, cat in enumerate(categories, start=1):
        data = init_df.loc[init_df['category'] == cat]
        plt.subplot(rowCnt, colCnt, subCnt)
        # Fixed bin edges at 0, 10, 20, 30 and 40 hours; values outside that
        # range are not drawn.
        plt.hist(data['duration'].to_list(), bins=range(0, 50, 10), histtype='bar')
        plt.title(title + ' ' + str(cat))
        plt.legend([cat])
    plt.savefig(title + '.png')
    plt.show()
    # Release the figure explicitly so repeated calls do not accumulate open
    # figures on backends that keep them alive after show().
    plt.close(fig)

In [None]:
# Histograms over every record of the month.
distribution(df=new_df, title='Monthly distribution')

In [None]:
# Histograms restricted to weekend records.
distribution(df=weekend_df, title='Monthly distribution - Weekends')

In [None]:
# Histograms restricted to weekday records.
distribution(df=weekday_df, title='Monthly distribution - Weekdays')

### Barplots: average daily time per user

In [None]:
def barplots(df, folder):
    """Save and show one bar chart per user of average daily hours per category.

    Durations are summed per (user, date, category) and averaged over dates
    for each (user, category). For every user present in ``df`` a bar chart
    of those averages is written to ``<folder>/<user>.png`` and displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'user', 'date', 'category' and 'duration' columns.
    folder : str
        Output directory; created if it does not exist yet.
    """
    # exist_ok removes the gap between a separate existence check and the
    # creation of the directory.
    os.makedirs(folder, exist_ok=True)
    init_df = (
        df.groupby(['user', 'date', 'category'])['duration'].sum()
        .groupby(['user', 'category']).mean()
        .reset_index()
    )
    users = df['user'].unique().tolist()
    for u in users:
        data = init_df.loc[init_df['user'] == u]
        fig = plt.figure(figsize=(15, 10))
        plt.title('Average daily time per category for user ' + str(u))
        plt.xlabel('Category')
        plt.ylabel('Hours/day')
        plt.bar(data['category'], data['duration'])
        plt.xticks(rotation=90)
        plt.savefig(os.path.join(folder, str(u) + '.png'))
        plt.show()
        # One figure is created per user; close it so a long user list does
        # not leave every figure open at once.
        plt.close(fig)

In [None]:
# Per-user bar charts over every record of the month.
barplots(df=new_df, folder='Barplots_month')

In [None]:
# Per-user bar charts restricted to weekend records.
barplots(df=weekend_df, folder='Barplots_weekend')

In [None]:
# Per-user bar charts restricted to weekday records.
barplots(df=weekday_df, folder='Barplots_weekday')

### Boxplots: average daily time per user

In [None]:
def boxplots(df, folder):
    """Save and show one box plot per user of daily duration values per category.

    Durations are averaged per (user, date, category); for every user present
    in ``df`` the distribution of those daily values is drawn as one box per
    category, written to ``<folder>/<user>.png`` and displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'user', 'date', 'category' and 'duration' columns.
    folder : str
        Output directory; created if it does not exist yet.
    """
    # exist_ok removes the gap between a separate existence check and the
    # creation of the directory.
    os.makedirs(folder, exist_ok=True)
    # NOTE(review): this takes the mean of the records within a day, whereas
    # barplots() sums them; with the 'Hours/day' axis label a daily sum may
    # have been intended -- confirm before changing.
    init_df = df.groupby(['user', 'date', 'category'])['duration'].mean()
    init_df = init_df.reset_index()
    users = df['user'].unique().tolist()
    for u in users:
        data = init_df.loc[init_df['user'] == u]
        data.boxplot(column='duration', by='category', figsize=(15, 10), grid=False)
        # DataFrame.boxplot opens its own figure; grab it so it can be closed.
        fig = plt.gcf()
        plt.title('Boxplot for user ' + str(u))
        # Blank out the automatic "grouped by" super-title added by pandas.
        plt.suptitle('')
        plt.xlabel('Category')
        plt.ylabel('Hours/day')
        plt.savefig(os.path.join(folder, str(u) + '.png'))
        plt.show()
        # One figure is created per user; close it so a long user list does
        # not leave every figure open at once.
        plt.close(fig)

In [None]:
# Per-user box plots over every record of the month.
boxplots(df=new_df, folder='Boxplots_month')

In [None]:
# Per-user box plots restricted to weekend records.
boxplots(df=weekend_df, folder='Boxplots_weekend')

In [None]:
# Per-user box plots restricted to weekday records.
boxplots(df=weekday_df, folder='Boxplots_weekday')

### Average barplot for all users

In [None]:
def average_barplot(df, title):
    """Draw a bar chart of the mean daily hours per category across all users.

    Durations are summed per (user, date, category); the mean of those daily
    totals is then taken per category. The resulting table is printed, and
    the chart is saved as ``<title>.png`` and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'user', 'date', 'category' and 'duration' columns.
    title : str
        Chart title and stem of the output file name.
    """
    per_category = (
        df.groupby(['user', 'date', 'category'])['duration'].sum()
        .groupby(['category']).mean()
        .reset_index()
    )
    print(per_category)

    plt.figure(figsize=(15, 10))
    plt.bar(per_category['category'], per_category['duration'])
    plt.title(title)
    plt.xlabel('Category')
    plt.ylabel('Hours/day')
    # Category names are drawn vertically.
    plt.xticks(rotation=90)
    plt.savefig(title + '.png')
    plt.show()

In [None]:
# All records of the month.
average_barplot(df=new_df, title='Monthly - average time spent per category')

In [None]:
# Weekend dataframe.
average_barplot(df=weekend_df, title='Monthly - Weekends - average time spent per category')

In [None]:
# Weekday dataframe.
average_barplot(df=weekday_df, title='Monthly - Weekdays - average time spent per category')

### Average boxplot for all users

In [None]:
def average_boxplot(df, title):
    """Draw one box per category of the daily duration values across users.

    Durations are averaged per (user, date, category) and those values are
    averaged again per (category, date); the distribution over dates is then
    drawn as a box plot grouped by category, saved as ``<title>.png`` and
    shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'user', 'date', 'category' and 'duration' columns.
    title : str
        Chart title and stem of the output file name.
    """
    # NOTE(review): the first aggregation is a mean of the records within a
    # day, while average_barplot() sums them; with the 'Hours/day' axis label
    # a daily sum may have been intended -- confirm.
    per_day = (
        df.groupby(['user', 'date', 'category'])['duration'].mean()
        .groupby(['category', 'date']).mean()
        .reset_index()
    )
    per_day.boxplot(column='duration', by='category', figsize=(15, 10), grid=False)
    # Blank out the automatic "grouped by" super-title added by pandas.
    plt.suptitle('')
    plt.title(title)
    plt.xlabel('Category')
    plt.ylabel('Hours/day')
    plt.savefig(title + '.png')
    plt.show()

In [None]:
# All records of the month.
average_boxplot(df=new_df, title='Monthly Boxplot')

In [None]:
# Weekend dataframe.
average_boxplot(df=weekend_df, title='Monthly Boxplot - Weekends')

In [None]:
# Weekday dataframe.
average_boxplot(df=weekday_df, title='Monthly Boxplot - Weekdays')