In [2]:
import pandas as pd
import os
from datetime import datetime, timedelta
from tqdm import tqdm

In [None]:
def aggregate_dataFrames(df_morning, df_evening, userStats, category_to_use, cat_column_name, out_path):
    """Join each morning form with the previous day's evening form and app usage.

    For every row of ``df_morning`` the "previous day" window is the calendar
    day before the form was filled in, i.e. for a form filled at
    2023-09-11 07:06:00 the window is [2023-09-10 00:00:00, 2023-09-11 00:00:00).
    The window is half-open, so an event at exactly midnight belongs to one
    day only.

    Two kinds of data are attached to the morning row:

    * Evening form: when exactly one or two evening forms fall into the
      window, the chronologically last one is copied column by column (two
      forms happen when the form for the day before was filled in after
      midnight). With zero or more than two forms the evening columns stay
      empty.
    * App usage: ``timeVisible`` summed per category over the window, one
      output column per entry of ``category_to_use`` (0 when the category
      was not used).

    The input frames are not modified.

    Parameters
    ----------
    df_morning : pandas.DataFrame
        Morning forms; needs a ``filledTimestamp`` column formatted as
        ``%Y-%m-%d %H:%M:%S``.
    df_evening : pandas.DataFrame
        Evening forms; needs a ``filledTimestamp`` column in the same format.
        In the output this column is called ``eveningForm``.
    userStats : pandas.DataFrame
        Usage records; needs ``queryStart`` (``%Y-%m-%d %H:%M:%S.%f``),
        ``timeVisible`` and the column named by ``cat_column_name``.
    category_to_use : list of str
        Categories to aggregate; each becomes an output column.
    cat_column_name : str
        Name of the column in ``userStats`` that holds the category.
    out_path : str
        Destination of the resulting CSV file (written without the index).

    Returns
    -------
    None
        The result is only written to ``out_path``.
    """
    form_ts_format = '%Y-%m-%d %H:%M:%S'
    categories = list(category_to_use)

    # Work on private copies so the caller's frames keep their string
    # timestamps and original column names (the function can be re-run).
    morning = df_morning.copy()
    morning['filledTimestamp'] = pd.to_datetime(morning['filledTimestamp'], format=form_ts_format)

    # Rename the timestamp column and order the forms by time, so that
    # "the second form" below really is the later one even if the file is unsorted.
    evening = (
        df_evening
        .rename(columns={'filledTimestamp': 'eveningForm'})
        .assign(eveningForm=lambda frame: pd.to_datetime(frame['eveningForm'], format=form_ts_format))
        .sort_values('eveningForm', kind='stable')
    )

    usage = userStats.assign(
        queryStart=pd.to_datetime(userStats['queryStart'], format='%Y-%m-%d %H:%M:%S.%f')
    )
    # The category filter does not depend on the row, so apply it once.
    usage = usage[usage[cat_column_name].isin(categories)]

    # Create all output columns up front: categories first, then the evening
    # columns with the timestamp last (same layout as files from earlier runs).
    # NOTE(review): an evening column that shares its name with a morning
    # column overwrites the morning values (unchanged behavior) - confirm that
    # the two forms have no common columns besides 'filledTimestamp'.
    evening_columns = [col for col in evening.columns if col != 'eveningForm'] + ['eveningForm']
    for col in categories + evening_columns:
        morning[col] = None

    for index, filled_at in morning['filledTimestamp'].items():
        # Midnight that starts the day the morning form was filled in ...
        window_end = filled_at.replace(hour=0, minute=0, second=0, microsecond=0)
        # ... and midnight that starts the day before.
        window_start = window_end - timedelta(days=1)

        ######## EVENING FORMS ########
        forms = evening[(evening['eveningForm'] >= window_start) & (evening['eveningForm'] < window_end)]
        # One form: use it. Two forms: the first one is a late entry for the
        # day before, so use the second. Any other count leaves the columns empty.
        if len(forms) in (1, 2):
            chosen_form = forms.iloc[-1]
            for col in evening_columns:
                morning.at[index, col] = chosen_form[col]

        ######## APP USAGE ########
        usage_in_window = usage[(usage['queryStart'] >= window_start) & (usage['queryStart'] < window_end)]
        # Total timeVisible of all apps per category within the window.
        time_per_category = usage_in_window.groupby(cat_column_name)['timeVisible'].sum()
        for cat in categories:
            morning.at[index, cat] = time_per_category.get(cat, 0)

    morning.to_csv(out_path, index=False)

In [None]:
##########  CHANGE THE PARAMETER VALUES  ###########
column_name = "category" # name of the column with the app category type (choose "category" or "agg_category")
category_to_use = ['Tools', 'Lifestyle', 'Casual', 'Books & Reference', 'Travel & Local',
 'System', 'Productivity', 'Health & Fitness', 'Food & Drink', 'Education',
 'Puzzle', 'Music & Audio', 'Medical', 'Business', 'Weather', 'Strategy',
 'Sports', 'Shopping', 'Finance', 'Simulation', 'Entertainment',
 'News & Magazines','Board', 'Communication', 'Photography', 'Parenting',
 'Role Playing', 'Video Players & Editors', 'Maps & Navigation', 'Racing',
 'Beauty', 'Social', 'Personalization', 'Action', 'Art & Design',
 'Auto & Vehicles', 'Trivia', 'Word', 'Dating', 'Events', 'Adventure', 'Card',
 'Casino', 'House & Home', 'Comics'] # list of categories to be added to the dataset
# (the list has to be chosen to match the column selected in column_name)
out_filename = "agg_minute_data_test.csv" # output file name, written inside each participant's folder

category_df = pd.read_csv("./dataset/apps_category.csv")
for iteration in range(1, 8):
    ##########  REPLACE /.../ WITH THE PATH TO THE PROCESSED DATA  ###########
    iter_path = f"C:/.../aggregated_data/i_0{iteration}/"
    # Only sub-directories are participants; stray files in the iteration
    # folder are skipped instead of crashing the run.
    user_names = sorted(
        entry for entry in os.listdir(iter_path)
        if os.path.isdir(os.path.join(iter_path, entry))
    )
    for user_name in tqdm(user_names, desc=f"i_0{iteration}"):
        user_path = os.path.join(iter_path, user_name)

        df_morning = pd.read_csv(os.path.join(user_path, 'morning_forms.csv'))
        df_evening = pd.read_csv(os.path.join(user_path, 'evening_forms.csv'))

        # os.path.join supplies the separator that plain string concatenation
        # left out for the usage file and for the output file.
        usageStats = pd.read_csv(os.path.join(user_path, 'usageStats.csv'))
        # Default (inner) merge: usage rows whose packageName is missing from
        # the category table are dropped.
        usageStats = usageStats.merge(category_df, on='packageName')

        aggregate_dataFrames(
            df_morning,
            df_evening,
            usageStats,
            category_to_use=category_to_use,
            cat_column_name=column_name,
            out_path=os.path.join(user_path, out_filename),
        )