In [4]:
import os
import random
import csv
from datetime import datetime, timedelta

# All possible page names from a directory structure
def get_page_names(directory):
    """Return the page names found under *directory*.

    The tree rooted at *directory* is walked recursively with ``os.walk``.
    Every file whose name ends in ``.txt`` contributes one page name: the
    file name with its extension stripped. Names are returned in walk order
    and are not de-duplicated, so the same name may appear more than once
    when identically named files live in different sub-directories.

    Args:
        directory: Path of the root directory to scan.

    Returns:
        A list of page-name strings (empty if nothing matches).
    """
    collected = []
    for _root, _subdirs, filenames in os.walk(directory):
        # Only text files represent pages; everything else is ignored.
        collected.extend(
            os.path.splitext(filename)[0]
            for filename in filenames
            if filename.endswith(".txt")
        )
    return collected

# Generating random timestamps
def generate_random_timestamp(start, end):
    """Return a random datetime between *start* and *end*, inclusive.

    The result is *start* plus a whole number of seconds drawn uniformly
    with ``random.randint`` from ``0`` to the span between the two
    datetimes (the span is truncated to whole seconds).

    Args:
        start: Lower bound of the window (``datetime``).
        end: Upper bound of the window (``datetime``).

    Returns:
        A ``datetime`` offset from *start* by a whole number of seconds.
    """
    span_seconds = int((end - start).total_seconds())
    offset = timedelta(seconds=random.randint(0, span_seconds))
    return start + offset

# Generating log data with weighted common pages
def generate_log_data(user_count, log_length, common_pages, weight, page_names,
                      start_time=None, end_time=None):
    """Generate synthetic page-visit log entries.

    Two groups of entries are produced, in this order:

    1. One entry per name in *page_names*, assigned to a random user at a
       random time inside the window, so every page is logged at least once.
    2. For each user ``1..user_count``, up to *log_length* entries. Each
       user's sequence starts at *start_time* and advances by a random
       60-3600 seconds per entry; it stops early once the clock passes
       *end_time*. Pages are drawn from a pool in which every common page
       appears *weight* times in addition to the plain *page_names*.

    Args:
        user_count: Number of users; user ids run from 1 to this value.
        log_length: Maximum number of sequential entries per user.
        common_pages: List of page names to over-sample.
        weight: How many times each common page is repeated in the pool.
        page_names: List of all page names.
        start_time: Start of the time window. Defaults to
            ``datetime(2023, 7, 6, 8, 0, 0)``.
        end_time: End of the time window. Defaults to
            ``datetime(2024, 7, 6, 23, 0, 0)``.

    Returns:
        A list of dicts with keys ``"user_id"``, ``"page_name"`` and
        ``"timestamp"`` (formatted as ``%Y-%m-%d %H:%M:%S``).

    Raises:
        ValueError: If *end_time* is earlier than *start_time*, or (from
            ``random.randint``) if *user_count* is below 1 while
            *page_names* is non-empty.
    """
    if start_time is None:
        start_time = datetime(2023, 7, 6, 8, 0, 0)
    if end_time is None:
        end_time = datetime(2024, 7, 6, 23, 0, 0)
    if end_time < start_time:
        raise ValueError("end_time must not be earlier than start_time")

    timestamp_format = "%Y-%m-%d %H:%M:%S"

    # Repeating the common pages makes random.choice pick them more often.
    weighted_page_names = common_pages * weight + page_names

    logs = []

    # Ensure each page has at least one log entry.
    for page in page_names:
        user_id = random.randint(1, user_count)
        timestamp = generate_random_timestamp(start_time, end_time)
        logs.append({
            "user_id": user_id,
            "page_name": page,
            "timestamp": timestamp.strftime(timestamp_format),
        })

    # With nothing to sample from, random.choice would raise IndexError;
    # there are simply no per-user entries to generate.
    if not weighted_page_names:
        return logs

    for user_id in range(1, user_count + 1):
        current_time = start_time
        for _ in range(log_length):
            logs.append({
                "user_id": user_id,
                "page_name": random.choice(weighted_page_names),
                "timestamp": current_time.strftime(timestamp_format),
            })
            current_time += timedelta(seconds=random.randint(60, 3600))
            # Stop this user's sequence once it runs past the window.
            if current_time > end_time:
                break

    return logs

# 20,000 users with up to 5 logs each (up to 100,000 logs), plus one guaranteed log per page
# Simulation size and sampling weight for the common pages
user_count = 20000
log_length = 5
weight = 5

# Input directory and the pages that should be visited more often
directory = "Dataset"
common_pages = ["Aorta", "Blood", "Artery"]

# Output location
csv_file = 'user_log.csv'

# Page names come from the files found under the directory
page_names = get_page_names(directory)

# Build every log row in memory
log_data = generate_log_data(user_count, log_length, common_pages, weight, page_names)

# Dump the rows to CSV: header first, then all log entries in one call
with open(csv_file, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["user_id", "page_name", "timestamp"])
    writer.writeheader()
    writer.writerows(log_data)

print(f"Log data written to {csv_file}")


Log data written to user_log.csv
