# Notes

- validate nearest_wo_home via checking residence category
- concatenate with daily home location
- check whether someone was visiting other people's homes
- clustering

In [None]:
import pandas as pd
import json 
import numpy as np
import os
import matplotlib.pyplot as plt

In [None]:
# The correspondence between categories and amenities.
# The encoding is passed explicitly so that decoding the JSON file does not
# depend on the platform's default locale encoding.
with open('categories-amenities.json', encoding='utf-8') as f:
    categories = json.load(f)

# Notebook display of the loaded object.
categories

### Nearest dataframe without home locations

In [None]:
# Original dataframe, restricted to rows with dist_mt <= 20
# (the "20m radius" referred to in the plot titles).
df = pd.read_csv('../nearest_hundred_final.csv')
# .copy() detaches the filtered rows from the frame read from disk, so the
# column assignments below do not trigger SettingWithCopyWarning.
df = df.loc[df['dist_mt'] <= 20].copy()
# Parse the interval boundaries as datetimes.
for col in ('start', 'end'):
    df[col] = pd.to_datetime(df[col])
# Labels are passed through int before being stored as strings.
df['label'] = df['label'].apply(int).apply(str)
# Identifiers and categories are kept as plain strings.
for col in ('user', 'category'):
    df[col] = df[col].apply(str)
df.head()

In [None]:
# Length of every interval as a timedelta (end minus start) ...
elapsed = df['end'] - df['start']
# ... expressed in hours: timedelta -> seconds -> minutes -> hours.
df['duration'] = elapsed / np.timedelta64(1, 's') / 60 / 60
df.head()

In [None]:
# Keep only the columns needed downstream, expose the interval start as
# 'date', and reduce it to a calendar date (datetime.date, time part dropped).
new_df = (
    df.loc[:, ["user", "start", "category", "duration"]]
    .rename(columns={"start": "date"})
    .assign(date=lambda frame: frame['date'].dt.date)
)
new_df

### Home locations dataframe

In [None]:
# Daily hours spent at home, reshaped to the same four columns as new_df.
home_df = pd.read_csv('../daily_home_hours.csv')
home_df = home_df.rename(columns={"start_date": "date", "duration_h": "duration"})
# Store plain datetime.date values, the same type new_df['date'] holds (it is
# built with .dt.date). Keeping pandas Timestamps here would leave the
# concatenated 'date' column with two key types that do not compare equal.
home_df['date'] = pd.to_datetime(home_df['date']).dt.date
home_df['user'] = home_df['user'].apply(str)
# Every row of this file is tagged with the 'Residence' category.
home_df['category'] = 'Residence'
home_df = home_df[["user", "date", "category", "duration"]]
home_df

### Unite dataframes vertically

In [None]:
# Stack the visit rows and the home rows into one frame with a fresh index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
final_df = pd.concat([new_df, home_df], ignore_index=True)
final_df

In [None]:
# Day of the week of every row, as returned by the date's weekday() method
# (0 = Monday ... 6 = Sunday).
final_df['dayofweek'] = final_df['date'].map(lambda day: day.weekday())
final_df

### Weekend dataframe

In [None]:
# Weekend rows: weekday() codes 5 (Saturday) and 6 (Sunday).
is_weekend = final_df['dayofweek'] >= 5
weekend_df = final_df.loc[is_weekend]
weekend_df

### Weekday dataframe

In [None]:
# Weekday rows: weekday() codes 0 (Monday) through 4 (Friday), both inclusive.
is_weekday = final_df['dayofweek'].between(0, 4)
weekday_df = final_df.loc[is_weekday]
weekday_df

### Total time per day by user dataframe

In [None]:
# Total duration for every (user, date, category) combination.
per_user_day = final_df.groupby(by=['user', 'date', 'category'])
df1 = per_user_day['duration'].sum()
df1

### Average daily time per user dataframe

In [None]:
# Mean of the daily totals in df1 for every (user, category) pair; the two
# keys are levels of df1's index.
df2 = df1.groupby(level=['user', 'category']).mean()
df2

### Categories list

In [None]:
# Distinct categories present in df, in order of first appearance.
# NOTE: this rebinds `categories`, replacing the object loaded earlier from
# 'categories-amenities.json'.
categories = df['category'].drop_duplicates().tolist()
categories

### Distribution of users for each category throughout the month

In [None]:
def distribution(df, title, cats=None):
    """Plot, per category, a histogram over users of their average daily hours.

    Durations are summed per (user, date, category) and those daily totals are
    averaged per (user, category). One subplot is drawn per category, and the
    figure is saved as ``<title>.png`` and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'user', 'date', 'category' and 'duration'.
    title : str
        Prefix of every subplot title and stem of the output file name.
    cats : iterable, optional
        Categories to plot. Defaults to the module-level ``categories`` list,
        which is what the function always used before this parameter existed.
    """
    if cats is None:
        cats = categories
    cats = list(cats)

    col_cnt = 3
    # Keep the historical 3x3 layout for up to nine categories and add rows
    # beyond that; a fixed 3x3 grid makes plt.subplot fail on the tenth one.
    row_cnt = max(3, -(-len(cats) // col_cnt))

    fig = plt.figure(figsize=(25, 20))
    daily_totals = df.groupby(['user', 'date', 'category'])['duration'].sum()
    user_means = daily_totals.groupby(['user', 'category']).mean().reset_index()

    for position, cat in enumerate(cats, start=1):
        hours = user_means.loc[user_means['category'] == cat, 'duration'].to_list()
        plt.subplot(row_cnt, col_cnt, position)
        # Bin edges at 0, 10, 20, 30 and 40 hours.
        plt.hist(hours, bins=range(0, 50, 10), histtype='bar')
        plt.title(title + ' ' + str(cat))
        plt.legend([cat])

    plt.savefig(title + '.png')
    plt.show()
    # Release the figure once it has been saved and displayed.
    plt.close(fig)

In [None]:
# Per-category histograms of the users' average daily hours, using every row
# of final_df (weekdays and weekends together).
distribution(final_df, 'Monthly distribution - 20m radius')

### Distribution of users for each category for weekends

In [None]:
# Per-category histograms of the users' average daily hours, restricted to
# the weekend rows (weekend_df).
distribution(weekend_df, 'Monthly distribution - Weekends - 20m radius')

### Distribution of users for each category for weekdays

In [None]:
# Per-category histograms of the users' average daily hours, restricted to
# the weekday rows (weekday_df).
distribution(weekday_df, 'Monthly distribution - Weekdays - 20m radius')

### Barplots: average daily time per user

In [None]:
def barplots(df, folder):
    """Save and show one bar chart per user of average daily hours per category.

    Durations are summed per (user, date, category) and those daily totals are
    averaged per (user, category). Each user's chart is written to
    ``<folder>/<user>.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'user', 'date', 'category' and 'duration'.
    folder : str
        Output directory; it is created when it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)

    daily_totals = df.groupby(['user', 'date', 'category'])['duration'].sum()
    user_means = daily_totals.groupby(['user', 'category']).mean().reset_index()

    for u in df['user'].unique().tolist():
        data = user_means.loc[user_means['user'] == u]
        fig = plt.figure(figsize=(15, 10))
        plt.title('Average daily time per category for user ' + str(u))
        plt.xlabel('Category')
        plt.ylabel('Hours/day')
        plt.bar(data['category'], data['duration'])
        plt.xticks(rotation=90)
        plt.savefig(os.path.join(folder, str(u) + '.png'))
        plt.show()
        # One figure is created per user; close it so that figures do not
        # accumulate in memory over the loop.
        plt.close(fig)

In [None]:
# One bar chart per user over all rows of final_df; the PNG files are written
# into the folder named below.
barplots(final_df, 'Barplots_month - 20m radius')

In [None]:
# One bar chart per user, restricted to the weekend rows (weekend_df).
barplots(weekend_df, 'Barplots_weekend - 20m radius')

In [None]:
# One bar chart per user, restricted to the weekday rows (weekday_df).
barplots(weekday_df, 'Barplots_weekday - 20m radius')

### Boxplots: average daily time per user

In [None]:
def boxplots(df, folder):
    """Save and show one boxplot per user of daily hours, grouped by category.

    Each box summarises, for one category, the distribution over dates of the
    user's total hours on that date. Each user's figure is written to
    ``<folder>/<user>.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'user', 'date', 'category' and 'duration'.
    folder : str
        Output directory; it is created when it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)

    # Daily *total* per (user, date, category): the y axis is labelled
    # 'Hours/day' and the other plotting functions of this notebook aggregate
    # days with sum(); the previous mean() gave the average length of a single
    # record instead.
    daily_totals = (
        df.groupby(['user', 'date', 'category'])['duration'].sum().reset_index()
    )

    for u in df['user'].unique().tolist():
        data = daily_totals.loc[daily_totals['user'] == u]
        data.boxplot(column='duration', by='category', figsize=(15, 10), grid=False)
        # The call above draws on the current figure; keep a handle to close it.
        fig = plt.gcf()
        plt.title('Boxplot for user ' + str(u))
        # Blank out the automatic "grouped by" super-title pandas adds.
        plt.suptitle('')
        plt.xlabel('Category')
        plt.ylabel('Hours/day')
        plt.savefig(os.path.join(folder, str(u) + '.png'))
        plt.show()
        # One figure is created per user; close it so that figures do not
        # accumulate in memory over the loop.
        plt.close(fig)

In [None]:
# One boxplot figure per user (duration grouped by category) over all rows of
# final_df; the PNG files are written into the folder named below.
boxplots(final_df, 'Boxplots_month - 20m radius')

In [None]:
# One boxplot figure per user, restricted to the weekend rows (weekend_df).
boxplots(weekend_df, 'Boxplots_weekend - 20m radius')

In [None]:
# One boxplot figure per user, restricted to the weekday rows (weekday_df).
boxplots(weekday_df, 'Boxplots_weekday - 20m radius')

### Average barplot for all users

In [None]:
def average_barplot(df, title):
    """Draw a bar chart of the mean daily hours per category.

    Durations are summed per (user, date, category); the mean of those totals
    is then taken per category. The resulting table is printed, and the chart
    is saved as ``<title>.png`` and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'user', 'date', 'category' and 'duration'.
    title : str
        Chart title and stem of the output file name.
    """
    per_user_day = df.groupby(['user', 'date', 'category'])['duration'].sum()
    category_means = per_user_day.groupby(level='category').mean().reset_index()
    print(category_means)

    plt.figure(figsize=(15, 10))
    plt.title(title)
    plt.xlabel('Category')
    plt.ylabel('Hours/day')
    plt.bar(category_means['category'], category_means['duration'])
    # Category names are long; draw the tick labels vertically.
    plt.xticks(rotation=90)
    plt.savefig(title + '.png')
    plt.show()

In [None]:
# Bar chart of the mean daily hours per category, using every row of final_df.
average_barplot(final_df, 'Monthly - average time spent per category - 20m radius')

In [None]:
# Bar chart of the mean daily hours per category, restricted to the weekend
# rows (weekend_df).
average_barplot(weekend_df, 'Monthly - Weekends - average time spent per category - 20m radius')

In [None]:
# Bar chart of the mean daily hours per category, restricted to the weekday
# rows (weekday_df).
average_barplot(weekday_df, 'Monthly - Weekdays - average time spent per category - 20m radius')

### Average boxplot for all users

In [None]:
# Total duration per (user, date, category), flattened back into columns.
tmp = final_df.groupby(['user', 'date', 'category'])['duration'].sum()
tmp = tmp.reset_index()
# Displayed result: mean of those totals over users for each (date, category).
tmp.groupby(['date', 'category'])['duration'].agg('mean')

In [None]:
def average_boxplot(df, title):
    """Draw one boxplot per category of the per-date average hours.

    Durations are summed per (user, date, category) and then averaged over
    users per (date, category), so each box shows how that average varies
    across dates. The figure is saved as ``<title>.png`` and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'user', 'date', 'category' and 'duration'.
    title : str
        Plot title and stem of the output file name.
    """
    user_day_totals = df.groupby(
        ['user', 'date', 'category'], as_index=False
    )['duration'].sum()
    date_means = user_day_totals.groupby(
        ['date', 'category'], as_index=False
    )['duration'].mean()

    date_means.boxplot(by='category', column='duration', grid=False, figsize=(15, 10))
    plt.title(title)
    # Blank out the automatic "grouped by" super-title pandas adds.
    plt.suptitle('')
    plt.xlabel('Category')
    plt.ylabel('Hours/day')
    plt.savefig(title + '.png')
    plt.show()

In [None]:
# Per-category boxplot of the per-date average hours, using every row of
# final_df.
average_boxplot(final_df, 'Monthly Boxplot - 20m radius')

In [None]:
# Per-category boxplot of the per-date average hours, restricted to the
# weekend rows (weekend_df).
average_boxplot(weekend_df, 'Monthly Boxplot - Weekends - 20m radius')

In [None]:
# Per-category boxplot of the per-date average hours, restricted to the
# weekday rows (weekday_df).
average_boxplot(weekday_df, 'Monthly Boxplot - Weekdays - 20m radius')

# NOT

In [None]:
# # function
# def get_matrix(df, cat_dictionary):
#     # find duration
#     df['duration'] = df['end'] - df['start']
#     # convert duration to hours
#     df['duration'] = df['duration'] / np.timedelta64(1, 's') / 60 / 60
#     # find the total duration per user per category per day
#     new_df = df[["user", "start", "category", "duration" ]]
#     new_df = new_df.rename(columns={"start": "date"})
#     new_df['date'] = new_df['date'].dt.date
#     df_sum = new_df.groupby(['user','date', 'category']).mean()
#     df_sum.reset_index().to_csv('duration_per_category.csv', index = False)
#     # average duration per category for all users
#     df_avg = new_df.groupby(['date', 'category']).mean()
#     df_avg.reset_index().to_csv('avg_duration_per_category.csv', index = False)
#     return(df_sum, df_avg)

In [None]:
# df_sum, df_avg = get_matrix(df, categories)