In [10]:
import pandas as pd

# Load user log data
user_log = pd.read_csv('user_log.csv')

# Rename columns if needed
user_log.columns = ['user_id', 'page_name', 'timestamp']

# Convert timestamp to datetime
user_log['timestamp'] = pd.to_datetime(user_log['timestamp'])

# Sort data by user_id and timestamp
user_log = user_log.sort_values(by=['user_id', 'timestamp'])

# Calculate time spent on each page
user_log['next_timestamp'] = user_log.groupby('user_id')['timestamp'].shift(-1)
user_log['time_spent'] = (user_log['next_timestamp'] - user_log['timestamp']).dt.total_seconds()

# Create a mask for NaN values in time_spent
mask = user_log['time_spent'].isna()

# Fill NaN values with 0 temporarily
user_log.loc[mask, 'time_spent'] = 0

# Calculate average time spent per page for each user excluding records where time_spent was initially NaN
average_time_spent = user_log[~mask].groupby('user_id')['time_spent'].mean()

# Replace 0 values in time_spent with average time spent per user
user_log.loc[mask, 'time_spent'] = user_log.loc[mask, 'user_id'].map(average_time_spent)

# Drop unnecessary columns
user_log = user_log.drop(columns=['next_timestamp'])

# Display processed user log data
print(user_log.head())


     user_id           page_name           timestamp  time_spent
202        1        Schwann_cell 2023-07-06 08:00:00     2406.00
203        1             Keratin 2023-07-06 08:40:06     1943.00
204        1         Merkel_cell 2023-07-06 09:12:29     2011.00
205        1  Pulmonary_alveolus 2023-07-06 09:46:00      565.00
206        1          Bronchiole 2023-07-06 09:55:25     1731.25


In [12]:
import pandas as pd

# Load user log data (assuming 'user_log.csv' exists in your working directory)
user_log = pd.read_csv('user_log.csv')

# Rename columns if needed (assuming they are already named correctly)
user_log.columns = ['user_id', 'page_name', 'timestamp']

# Convert timestamp to datetime
user_log['timestamp'] = pd.to_datetime(user_log['timestamp'])

# Sort data by user_id and timestamp
user_log = user_log.sort_values(by=['user_id', 'timestamp'])

# Calculate time spent on each page
user_log['next_timestamp'] = user_log.groupby('user_id')['timestamp'].shift(-1)
user_log['time_spent'] = (user_log['next_timestamp'] - user_log['timestamp']).dt.total_seconds().fillna(round(average_time_spent/2))

# Group by user_id and aggregate paths and time_spent
grouped = user_log.groupby('user_id').agg({
    'page_name': lambda x: list(x),
    'time_spent': lambda x: list(x)
}).reset_index()

# Format into the desired output format
for index, row in grouped.iterrows():
    user_id = row['user_id']
    paths = row['page_name']
    time_spents = row['time_spent']
    

counter = 0
for index, row in grouped.iterrows():
    user_id = row['user_id']
    paths = row['page_name']
    time_spents = row['time_spent']
    print(f"{user_id}, {paths}, {time_spents}")
    counter += 1
    if counter >= 5:
        break


1, ['Schwann_cell', 'Keratin', 'Merkel_cell', 'Pulmonary_alveolus', 'Bronchiole'], [2406.0, 1943.0, 2011.0, 565.0, 1224.0]
2, ['Dendrite', 'Spinal_nerve', 'Male_reproductive_system', 'Aorta', 'Stratified_squamous_epithelium'], [2223.0, 3318.0, 1615.0, 1408.0, 750.0]
3, ['Artery', 'Human_tooth', 'Basement_membrane', 'Prostate', 'Merkel_cell'], [3318.0, 576.0, 3458.0, 2783.0, 555.0]
4, ['Blood_vessel', 'Sensory_nerve', 'Artery', 'Mouth', 'Human_reproductive_system'], [1240.0, 1171.0, 3431.0, 1259.0, 812.0]
5, ['Anus', 'Paranasal_sinuses', 'Lacrimal_gland', 'Pituitary_gland', 'Trachea'], [3350.0, 1783.0, 1758.0, 1911.0, 896.0]


In [14]:
import pandas as pd

# Load user log data (assuming 'user_log.csv' exists in your working directory)
user_log = pd.read_csv('user_log.csv')

# Rename columns if needed (assuming they are already named correctly)
user_log.columns = ['user_id', 'page_name', 'timestamp']

# Convert timestamp to datetime
user_log['timestamp'] = pd.to_datetime(user_log['timestamp'])

# Sort data by user_id and timestamp
user_log = user_log.sort_values(by=['user_id', 'timestamp'])

# Calculate time spent on each page
user_log['next_timestamp'] = user_log.groupby('user_id')['timestamp'].shift(-1)
user_log['time_spent'] = (user_log['next_timestamp'] - user_log['timestamp']).dt.total_seconds().fillna(0)

# Group by user_id and aggregate paths and time_spent
grouped = user_log.groupby('user_id').agg({
    'page_name': lambda x: list(x),
    'time_spent': lambda x: list(x)
}).reset_index()

# Save aggregated data to CSV
grouped.to_csv('user_paths.csv', index=False)

print("Data saved to 'user_paths.csv'")


Data saved to 'user_paths.csv'
