In [3]:
import pandas as pd

# Load user log data
user_log = pd.read_csv('user_log.csv')

# Rename columns if needed
user_log.columns = ['user_id', 'page_name', 'timestamp']

# Convert timestamp to datetime
user_log['timestamp'] = pd.to_datetime(user_log['timestamp'])

# Sort data by user_id and timestamp
user_log = user_log.sort_values(by=['user_id', 'timestamp'])

# Calculate time spent on each page
user_log['next_timestamp'] = user_log.groupby('user_id')['timestamp'].shift(-1)
user_log['time_spent'] = (user_log['next_timestamp'] - user_log['timestamp']).dt.total_seconds()

# Create a mask for NaN values in time_spent
mask = user_log['time_spent'].isna()

# Fill NaN values with 0 temporarily
user_log.loc[mask, 'time_spent'] = 0

# Calculate average time spent per page for each user excluding records where time_spent was initially NaN
average_time_spent = user_log[~mask].groupby('user_id')['time_spent'].mean()

# Replace 0 values in time_spent with average time spent per user
user_log.loc[mask, 'time_spent'] = user_log.loc[mask, 'user_id'].map(average_time_spent)

# Drop unnecessary columns
user_log = user_log.drop(columns=['next_timestamp'])

# Display processed user log data
print(user_log.head())


   user_id                  page_name           timestamp  time_spent
0        1             Cardiac_muscle 2023-07-06 08:00:00      1641.0
1        1                       Anus 2023-07-06 08:27:21       679.0
2        1         Pulmonary_alveolus 2023-07-06 08:38:40      1347.0
3        1    Loose_connective_tissue 2023-07-06 09:01:07       615.0
4        1  Peripheral_nervous_system 2023-07-06 09:11:22      1070.5


In [4]:
import pandas as pd

# Load user log data (assuming 'user_log.csv' exists in your working directory)
user_log = pd.read_csv('user_log.csv')

# Rename columns if needed (assuming they are already named correctly)
user_log.columns = ['user_id', 'page_name', 'timestamp']

# Convert timestamp to datetime
user_log['timestamp'] = pd.to_datetime(user_log['timestamp'])

# Sort data by user_id and timestamp
user_log = user_log.sort_values(by=['user_id', 'timestamp'])

# Calculate time spent on each page
user_log['next_timestamp'] = user_log.groupby('user_id')['timestamp'].shift(-1)
user_log['time_spent'] = (user_log['next_timestamp'] - user_log['timestamp']).dt.total_seconds().fillna(round(average_time_spent/2))

# Group by user_id and aggregate paths and time_spent
grouped = user_log.groupby('user_id').agg({
    'page_name': lambda x: list(x),
    'time_spent': lambda x: list(x)
}).reset_index()

# Format into the desired output format
for index, row in grouped.iterrows():
    user_id = row['user_id']
    paths = row['page_name']
    time_spents = row['time_spent']
    

counter = 0
for index, row in grouped.iterrows():
    user_id = row['user_id']
    paths = row['page_name']
    time_spents = row['time_spent']
    print(f"{user_id}, {paths}, {time_spents}")
    counter += 1
    if counter >= 5:
        break


1, ['Cardiac_muscle', 'Anus', 'Pulmonary_alveolus', 'Loose_connective_tissue', 'Peripheral_nervous_system'], [1641.0, 679.0, 1347.0, 615.0, 828.0]
2, ['Skeletal_animation', 'Merkel_cell', 'Duodenum', 'Brainstem', 'Lymph_node'], [2383.0, 1425.0, 3125.0, 2400.0, 950.0]
3, ['Mammary_gland', 'Mucous_membrane', 'Tonsil', 'Apoptosis', 'Thyroid'], [285.0, 1278.0, 3327.0, 3210.0, 813.0]
4, ['Atrium_(heart)', 'Urinary_cast', 'Urinary_retention', 'Axon_terminal', 'Protein'], [808.0, 162.0, 2276.0, 3379.0, 648.0]
5, ['Ceruminous_gland', 'Paranasal_sinuses', 'Myelin', 'Mouth', 'Atrium_(heart)'], [2331.0, 2755.0, 210.0, 2345.0, 931.0]


In [5]:
import pandas as pd

# Load user log data (assuming 'user_log.csv' exists in your working directory)
user_log = pd.read_csv('user_log.csv')

# Rename columns if needed (assuming they are already named correctly)
user_log.columns = ['user_id', 'page_name', 'timestamp']

# Convert timestamp to datetime
user_log['timestamp'] = pd.to_datetime(user_log['timestamp'])

# Sort data by user_id and timestamp
user_log = user_log.sort_values(by=['user_id', 'timestamp'])

# Calculate time spent on each page
user_log['next_timestamp'] = user_log.groupby('user_id')['timestamp'].shift(-1)
user_log['time_spent'] = (user_log['next_timestamp'] - user_log['timestamp']).dt.total_seconds().fillna(0)

# Group by user_id and aggregate paths and time_spent
grouped = user_log.groupby('user_id').agg({
    'page_name': lambda x: list(x),
    'time_spent': lambda x: list(x)
}).reset_index()

# Save aggregated data to CSV
grouped.to_csv('user_paths.csv', index=False)

print("Data saved to 'user_paths.csv'")


Data saved to 'user_paths.csv'
