In [36]:
import pandas as pd

# Load user log data
user_log = pd.read_csv('user_log.csv')

# Rename columns if needed
user_log.columns = ['user_id', 'page_name', 'timestamp']

# Convert timestamp to datetime
user_log['timestamp'] = pd.to_datetime(user_log['timestamp'])

# Sort data by user_id and timestamp
user_log = user_log.sort_values(by=['user_id', 'timestamp'])

# Calculate time spent on each page
user_log['next_timestamp'] = user_log.groupby('user_id')['timestamp'].shift(-1)
user_log['time_spent'] = (user_log['next_timestamp'] - user_log['timestamp']).dt.total_seconds()

# Create a mask for NaN values in time_spent
mask = user_log['time_spent'].isna()

# Fill NaN values with 0 temporarily
user_log.loc[mask, 'time_spent'] = 0

# Calculate average time spent per page for each user excluding records where time_spent was initially NaN
average_time_spent = user_log[~mask].groupby('user_id')['time_spent'].mean()

# Replace 0 values in time_spent with average time spent per user
user_log.loc[mask, 'time_spent'] = user_log.loc[mask, 'user_id'].map(average_time_spent)

# Drop unnecessary columns
user_log = user_log.drop(columns=['next_timestamp'])

# Display processed user log data
print(user_log.head())


   user_id                   page_name           timestamp  time_spent
0        1  Sympathetic_nervous_system 2023-07-06 08:00:00      3596.0
1        1                       Blood 2023-07-06 08:59:56      3219.0
2        1                      Neuron 2023-07-06 09:53:35      2186.0
3        1      Central_nervous_system 2023-07-06 10:30:01      3107.0
4        1               Great_vessels 2023-07-06 11:21:48      3027.0


In [37]:
import pandas as pd

# Load user log data (assuming 'user_log.csv' exists in your working directory)
user_log = pd.read_csv('user_log.csv')

# Rename columns if needed (assuming they are already named correctly)
user_log.columns = ['user_id', 'page_name', 'timestamp']

# Convert timestamp to datetime
user_log['timestamp'] = pd.to_datetime(user_log['timestamp'])

# Sort data by user_id and timestamp
user_log = user_log.sort_values(by=['user_id', 'timestamp'])

# Calculate time spent on each page
user_log['next_timestamp'] = user_log.groupby('user_id')['timestamp'].shift(-1)
user_log['time_spent'] = (user_log['next_timestamp'] - user_log['timestamp']).dt.total_seconds().fillna(round(average_time_spent/2))

# Group by user_id and aggregate paths and time_spent
grouped = user_log.groupby('user_id').agg({
    'page_name': lambda x: list(x),
    'time_spent': lambda x: list(x)
}).reset_index()

# Format into the desired output format
for index, row in grouped.iterrows():
    user_id = row['user_id']
    paths = row['page_name']
    time_spents = row['time_spent']
    

counter = 0
for index, row in grouped.iterrows():
    user_id = row['user_id']
    paths = row['page_name']
    time_spents = row['time_spent']
    print(f"{user_id}, {paths}, {time_spents}")
    counter += 1
    if counter >= 5:
        break


1, ['Sympathetic_nervous_system', 'Blood', 'Neuron', 'Central_nervous_system', 'Great_vessels'], [3596.0, 3219.0, 2186.0, 3107.0, 815.0]
2, ['Suspensory_muscle_of_duodenum', 'Bone_marrow', 'Human_digestive_system', 'Aorta', 'Artery'], [226.0, 1993.0, 1848.0, 2598.0, 918.0]
3, ['Gallbladder', 'Aorta', 'Lymphocyte', 'Small_intestine', 'Trachea'], [1023.0, 3527.0, 3278.0, 1666.0, 1148.0]
4, ['Brain', 'Aorta', 'Node_of_Ranvier', 'Artery', 'Great_vessels'], [3148.0, 603.0, 1924.0, 848.0, 1219.0]
5, ['Blood', 'Skin', 'Root_of_the_lung', 'Spinal_nerve', 'Paranasal_sinuses'], [2143.0, 1733.0, 1624.0, 2060.0, 925.0]


In [38]:
import pandas as pd

# Load user log data (assuming 'user_log.csv' exists in your working directory)
user_log = pd.read_csv('user_log.csv')

# Rename columns if needed (assuming they are already named correctly)
user_log.columns = ['user_id', 'page_name', 'timestamp']

# Convert timestamp to datetime
user_log['timestamp'] = pd.to_datetime(user_log['timestamp'])

# Sort data by user_id and timestamp
user_log = user_log.sort_values(by=['user_id', 'timestamp'])

# Calculate time spent on each page
user_log['next_timestamp'] = user_log.groupby('user_id')['timestamp'].shift(-1)
user_log['time_spent'] = (user_log['next_timestamp'] - user_log['timestamp']).dt.total_seconds().fillna(0)

# Group by user_id and aggregate paths and time_spent
grouped = user_log.groupby('user_id').agg({
    'page_name': lambda x: list(x),
    'time_spent': lambda x: list(x)
}).reset_index()

# Save aggregated data to CSV
grouped.to_csv('user_paths.csv', index=False)

print("Data saved to 'user_paths.csv'")


Data saved to 'user_paths.csv'
