In [5]:
import pandas as pd
import os
from tqdm import tqdm
from datetime import datetime

In [6]:
def filtered_dataFrames(in_df):
    """Keep, for every app and calendar day, only the latest usage entry.

    The columns ``queryStart``, ``start`` and ``end`` are parsed as
    timestamps in the ``%Y-%m-%dT%H:%M:%S.%f`` format. Rows are then grouped
    by ``packageName`` and by the calendar date of ``queryStart``; from each
    group the single row with the largest ``queryStart`` is kept.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Must contain the columns ``queryStart``, ``start``, ``end`` and
        ``packageName``. The frame is not modified; the conversion is done on
        a copy.

    Returns
    -------
    pandas.DataFrame
        The selected rows sorted by ``queryStart`` with a fresh 0..n-1 index.
        When no row is selected, an empty frame with the input's columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a timestamp does not match the expected format.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S.%f'

    # Work on a copy so the caller's frame keeps its original (string) columns.
    df = in_df.copy()
    for column in ('queryStart', 'start', 'end'):
        df[column] = pd.to_datetime(df[column], format=timestamp_format)

    # Collect the per-app results and concatenate once at the end:
    # DataFrame.append no longer exists in pandas 2.x and growing a frame
    # inside a loop copies all accumulated rows on every iteration.
    latest_per_app = []
    # sort=False keeps the apps in order of first appearance.
    for _, app_subset in df.groupby('packageName', sort=False):
        # Pick the latest entry of each day according to queryStart.
        latest = app_subset.groupby(app_subset['queryStart'].dt.date).apply(
            lambda day_rows: day_rows.nlargest(1, columns='queryStart')
        )
        latest_per_app.append(latest.reset_index(drop=True))

    if not latest_per_app:
        # Nothing to select (e.g. empty input): return an empty frame that
        # still carries the expected columns instead of failing on the sort.
        return df.iloc[0:0].reset_index(drop=True)

    final_df_latest = pd.concat(latest_per_app, ignore_index=True)
    return final_df_latest.sort_values(by='queryStart').reset_index(drop=True)

In [7]:
def time_diff_more_than_10minutes(data1, data2):
    """Tell whether two timestamps lie more than ten minutes apart.

    Both arguments are strings in the ``%Y-%m-%dT%H:%M:%S.%f`` format. The
    order of the arguments does not matter; a gap of exactly ten minutes
    yields ``False``.
    """
    first, second = (
        datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S.%f')
        for stamp in (data1, data2)
    )
    gap_seconds = abs(second - first).total_seconds()
    if gap_seconds > 10 * 60:
        return True
    return False

In [10]:
def read_aggregate_usageStats_files_for_user(in_path, out_path):
    """Aggregate one user's usageStats files into ``usageStats.csv``.

    Every entry of ``in_path`` whose name contains ``'usageStats'`` is
    listed as a folder of tab-separated files. From each file only the rows
    whose ``queryStart`` and ``queryEnd`` are more than ten minutes apart
    are used (the original comment calls these "cyclic" entries). The
    ``usageStats`` cell of such a row is evaluated into records, tagged with
    the row's ``queryStart`` and collected. The collected records are reduced
    with ``filtered_dataFrames`` and the columns ``queryStart``,
    ``timeVisible`` and ``packageName`` are written to CSV. If nothing was
    collected, a CSV containing only the header is written.

    Parameters
    ----------
    in_path : str
        Folder holding the user's data folders.
    out_path : str
        Prefix of the output file; ``'usageStats.csv'`` is appended with
        plain string concatenation, so pass a directory path that ends with
        a path separator.
    """
    output_columns = ['queryStart', 'timeVisible', 'packageName']

    # Gather the per-row frames in a list and concatenate once:
    # DataFrame.append no longer exists in pandas 2.x and appending inside
    # the loop copies all accumulated rows on every iteration.
    collected = []
    for user_folder in os.listdir(in_path):
        if 'usageStats' not in user_folder:
            continue
        user_files_path = os.path.join(in_path, user_folder)
        for user_file in os.listdir(user_files_path):
            file_path = os.path.join(user_files_path, user_file)
            data = pd.read_csv(file_path, sep='\t')
            # For every entry of the file ...
            for _, data_line in data.iterrows():
                # ... keep it only when it is a cyclic entry.
                if not time_diff_more_than_10minutes(data_line['queryStart'], data_line['queryEnd']):
                    continue
                # SECURITY NOTE(review): eval() executes whatever text is
                # stored in the file. If the payload is always a plain Python
                # literal, ast.literal_eval would be the safe replacement -
                # confirm the data format before switching.
                data_to_append = pd.DataFrame(eval(data_line['usageStats']))
                data_to_append['queryStart'] = data_line['queryStart']
                collected.append(data_to_append)

    out_df = pd.concat(collected, ignore_index=True) if collected else pd.DataFrame()

    if len(out_df) > 0:
        final_df_latest = filtered_dataFrames(out_df)

        # Save the aggregated data to file.
        final_df_latest[output_columns].to_csv(out_path+'usageStats.csv', index=False)
    else:
        pd.DataFrame(columns=output_columns).to_csv(out_path+'usageStats.csv', index=False)

In [None]:
# Process the folders i_01 .. i_07; every entry inside such a folder is
# treated as one user's data folder.
# The loop variable is deliberately not called `iter`: in a notebook it lives
# in the global namespace and would shadow the builtin for every later cell.
for iteration in range(1, 8):
    ###########  REPLACE /.../ WITH THE PATH TO THE userData DATA ###########
    iter_path = f"C:/.../i_0{iteration}/"
    # Passing the list itself (instead of enumerate) lets tqdm show a total.
    for user_name in tqdm(os.listdir(iter_path)):
        in_path = iter_path+user_name

        ########### REPLACE /.../ WITH THE PATH WHERE THE PROCESSED DATA SHOULD BE SAVED ###########
        out_path = f"C:/.../aggregated_data/i_0{iteration}/{user_name}/"

        # exist_ok avoids the separate existence check before creating.
        os.makedirs(out_path, exist_ok=True)
        read_aggregate_usageStats_files_for_user(in_path, out_path)