In [10]:
import pandas as pd

# Load user log data
user_log = pd.read_csv('user_log.csv')

# Rename columns if needed
user_log.columns = ['user_id', 'page_name', 'timestamp']

# Convert timestamp to datetime
user_log['timestamp'] = pd.to_datetime(user_log['timestamp'])

# Sort data by user_id and timestamp
user_log = user_log.sort_values(by=['user_id', 'timestamp'])

# Calculate time spent on each page
user_log['next_timestamp'] = user_log.groupby('user_id')['timestamp'].shift(-1)
user_log['time_spent'] = (user_log['next_timestamp'] - user_log['timestamp']).dt.total_seconds()

# Create a mask for NaN values in time_spent
mask = user_log['time_spent'].isna()

# Fill NaN values with 0 temporarily
user_log.loc[mask, 'time_spent'] = 0

# Calculate average time spent per page for each user excluding records where time_spent was initially NaN
average_time_spent = user_log[~mask].groupby('user_id')['time_spent'].mean()

# Replace 0 values in time_spent with average time spent per user
user_log.loc[mask, 'time_spent'] = user_log.loc[mask, 'user_id'].map(average_time_spent)

# Drop unnecessary columns
user_log = user_log.drop(columns=['next_timestamp'])

# Display processed user log data
print(user_log.head())


     user_id                  page_name           timestamp  time_spent
191        1  Peripheral_nervous_system 2023-07-06 08:00:00     1563.00
192        1                    Trachea 2023-07-06 08:26:03     1604.00
193        1                      Aorta 2023-07-06 08:52:47     2463.00
194        1         Stratum_granulosum 2023-07-06 09:33:50     1277.00
195        1         Superior_vena_cava 2023-07-06 09:55:07     1726.75


In [12]:
import pandas as pd

# Load user log data (assuming 'user_log.csv' exists in your working directory)
user_log = pd.read_csv('user_log.csv')

# Rename columns if needed (assuming they are already named correctly)
user_log.columns = ['user_id', 'page_name', 'timestamp']

# Convert timestamp to datetime
user_log['timestamp'] = pd.to_datetime(user_log['timestamp'])

# Sort data by user_id and timestamp
user_log = user_log.sort_values(by=['user_id', 'timestamp'])

# Calculate time spent on each page
user_log['next_timestamp'] = user_log.groupby('user_id')['timestamp'].shift(-1)
user_log['time_spent'] = (user_log['next_timestamp'] - user_log['timestamp']).dt.total_seconds().fillna(round(average_time_spent/2))

# Group by user_id and aggregate paths and time_spent
grouped = user_log.groupby('user_id').agg({
    'page_name': lambda x: list(x),
    'time_spent': lambda x: list(x)
}).reset_index()

# Format into the desired output format
for index, row in grouped.iterrows():
    user_id = row['user_id']
    paths = row['page_name']
    time_spents = row['time_spent']
    

counter = 0
for index, row in grouped.iterrows():
    user_id = row['user_id']
    paths = row['page_name']
    time_spents = row['time_spent']
    print(f"{user_id}, {paths}, {time_spents}")
    counter += 1
    if counter >= 5:
        break


1, ['Peripheral_nervous_system', 'Trachea', 'Aorta', 'Stratum_granulosum', 'Superior_vena_cava'], [1563.0, 1604.0, 2463.0, 1277.0, 1451.0]
2, ['Ceruminous_gland', 'Coronary_circulation', 'Capillary', 'Lymphatic_system', 'Trachea'], [2450.0, 1103.0, 1212.0, 1752.0, 762.0]
3, ['Skeleton_in_the_closet', 'Stratum_corneum', 'Mouth', 'Axon', 'Neuron'], [372.0, 587.0, 2010.0, 366.0, 768.0]
4, ['Pancreas', 'Pulmonary_circulation', 'Peripheral_nervous_system', 'Female_reproductive_system', 'Nutrient'], [1717.0, 60.0, 3179.0, 1081.0, 799.0]
5, ['Muscle_relaxant', 'Neural_circuit', 'Subcutaneous_tissue', 'Basement_membrane', 'Salivary_gland'], [3539.0, 2263.0, 1452.0, 1760.0, 677.0]


In [13]:
import pandas as pd

# Load user log data (assuming 'user_log.csv' exists in your working directory)
user_log = pd.read_csv('user_log.csv')

# Rename columns if needed (assuming they are already named correctly)
user_log.columns = ['user_id', 'page_name', 'timestamp']

# Convert timestamp to datetime
user_log['timestamp'] = pd.to_datetime(user_log['timestamp'])

# Sort data by user_id and timestamp
user_log = user_log.sort_values(by=['user_id', 'timestamp'])

# Calculate time spent on each page
user_log['next_timestamp'] = user_log.groupby('user_id')['timestamp'].shift(-1)
user_log['time_spent'] = (user_log['next_timestamp'] - user_log['timestamp']).dt.total_seconds().fillna(0)

# Group by user_id and aggregate paths and time_spent
grouped = user_log.groupby('user_id').agg({
    'page_name': lambda x: list(x),
    'time_spent': lambda x: list(x)
}).reset_index()

# Save aggregated data to CSV
grouped.to_csv('user_paths.csv', index=False)

print("Data saved to 'user_paths.csv'")


Data saved to 'user_paths.csv'
