In [1]:
import os

In [2]:
%pwd

'd:\\recommendation-engine\\research'

In [3]:
# Step up from the notebook folder to the project root so that the `src.*`
# imports and the relative config/artifact paths used below resolve.
# Guarded on the presence of the `src` package directory so that re-running
# this cell does not climb one level higher each time.
if not os.path.isdir("src"):
    os.chdir("../")

In [4]:
%pwd

'd:\\recommendation-engine'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataGenerationConfig:
    """Immutable settings for the data-generation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory created for this stage before any data is generated.
    root_dir: Path
    # Directory the generated CSV files are written into.
    data_dir: Path


In [6]:
from src.hybrid_recommender.constants import *
from src.hybrid_recommender.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project configuration and build per-stage config objects.

    The configuration is read once, on construction, from ``CONFIG_FILE_PATH``
    (provided by the project's constants module) via ``read_yaml``.
    """

    def __init__(self):
        """Read the YAML configuration and create the artifacts root folder."""
        self.config = read_yaml(CONFIG_FILE_PATH)

        # Every stage writes below this folder, so create it up front.
        create_directories([self.config.artifacts_root])

    def get_data_generation_config(self) -> DataGenerationConfig:
        """Build the configuration for the data-generation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataGenerationConfig`` whose fields are ``Path`` objects.
        """
        section = self.config.data_generation

        create_directories([section.root_dir])

        # The values loaded from the config file are plain strings, while
        # DataGenerationConfig declares its fields as Path; convert here so the
        # object matches its declared types.
        return DataGenerationConfig(
            root_dir=Path(section.root_dir),
            data_dir=Path(section.data_dir),
        )

In [8]:
import os
import urllib.request as request
import zipfile
from src.hybrid_recommender import logger
from src.hybrid_recommender.utils.common import get_size
from kaggle.api.kaggle_api_extended import KaggleApi
import pandas as pd
import uuid
import numpy as np
import uuid
from pathlib import Path
from typing import List, Dict, Any

In [None]:
class DataGeneration:
    """Write synthetic booking, comment and like interaction tables to CSV.

    Every table pairs randomly drawn user, event and organizer IDs (sampled
    with replacement from shared ID pools) with a unique row ID.

    NOTE(review): organizer IDs are sampled independently of event IDs, so the
    same event can appear with different organizers across rows. Confirm this
    is acceptable for whatever consumes these files.
    """

    # Number of distinct IDs created for each pool.
    _POOL_SIZES = {'user_ids': 5000, 'event_ids': 10000, 'organizer_ids': 1000}

    # (output file name, row-ID column, prefix of the event/organizer columns)
    _DATASETS = (
        ('bookings.csv', 'booking_id', 'booked'),
        ('comments.csv', 'comment_id', 'commented'),
        ('likes.csv', 'like_id', 'liked'),
    )

    def __init__(self, config: "DataGenerationConfig"):
        """Store the stage configuration; only ``config.data_dir`` is read here."""
        self.config = config

    @staticmethod
    def shuffle_ids(ids: List[str], rng=None) -> np.ndarray:
        """Return a shuffled copy of ``ids``.

        Args:
            ids: IDs to shuffle; the input sequence is not modified.
            rng: Optional ``numpy.random.Generator`` (or ``RandomState``) to
                draw from. When omitted, NumPy's global random state is used.

        Returns:
            A NumPy array holding the IDs in random order.
        """
        source = np.random if rng is None else rng
        return source.permutation(ids)

    @staticmethod
    def _random_uuids(rng, count: int) -> List[str]:
        """Return ``count`` version-4 UUID strings derived from ``rng``.

        ``uuid.uuid4()`` ignores NumPy seeding, so the UUIDs are built from
        bytes drawn from ``rng`` to make the output repeatable for a seed.
        """
        raw = rng.bytes(16 * count)
        return [
            str(uuid.UUID(bytes=raw[start:start + 16], version=4))
            for start in range(0, 16 * count, 16)
        ]

    def _generate_dataset(self, n_rows: int, id_pools: Dict[str, List[str]],
                          rng=None) -> Dict[str, Any]:
        """Draw the user/event/organizer columns shared by every table.

        Args:
            n_rows: Number of rows to generate.
            id_pools: Pools to sample from, keyed by ``'user_ids'``,
                ``'event_ids'`` and ``'organizer_ids'``.
            rng: Optional ``numpy.random.Generator`` (or ``RandomState``).
                When omitted, NumPy's global random state is used.

        Returns:
            Dict with keys ``'user_id'``, ``'event_id'`` and ``'organizer_id'``,
            each an array of ``n_rows`` IDs sampled with replacement.
        """
        source = np.random if rng is None else rng
        columns = (
            ('user_id', 'user_ids'),
            ('event_id', 'event_ids'),
            ('organizer_id', 'organizer_ids'),
        )
        return {
            column: source.choice(self.shuffle_ids(id_pools[pool], rng), n_rows)
            for column, pool in columns
        }

    def generate_files(self, n_rows: int = 100_000, seed: int = 42) -> None:
        """Generate bookings.csv, comments.csv and likes.csv in ``config.data_dir``.

        Nothing is written when all three files are already present. If any of
        them is missing (including the case of an existing but empty folder),
        all three are regenerated so they share the same ID pools.

        Args:
            n_rows: Number of rows written to each file.
            seed: Seed of the local random generator. The same seed and
                ``n_rows`` give the same files; NumPy's global random state is
                left untouched.
        """
        data_dir = self.config.data_dir
        targets = [os.path.join(data_dir, name) for name, _, _ in self._DATASETS]

        # Check the files themselves: the folder existing says nothing about
        # whether a previous run actually finished writing into it.
        if all(os.path.isfile(path) for path in targets):
            logger.info(f"Files already exist in: {data_dir}")
            return

        os.makedirs(data_dir, exist_ok=True)
        logger.info(f"Generating data to: {data_dir}")

        rng = np.random.default_rng(seed)
        id_pools = {
            pool: self._random_uuids(rng, size)
            for pool, size in self._POOL_SIZES.items()
        }

        for path, (_, id_column, prefix) in zip(targets, self._DATASETS):
            data = self._generate_dataset(n_rows, id_pools, rng)
            data[id_column] = self._random_uuids(rng, n_rows)
            frame = pd.DataFrame(data).rename(columns={
                'event_id': f'{prefix}_event_id',
                'organizer_id': f'{prefix}_event_organizer_id',
            })
            frame.to_csv(path, index=False)

        logger.info(
            f"Generated bookings.csv, comments.csv, and likes.csv with {n_rows:,} rows each."
        )
 

In [10]:
# Run the data-generation stage end to end: load the configuration, build the
# stage config, then write the CSV files.
try:
    config = ConfigurationManager()
    data_generation_config = config.get_data_generation_config()
    data_generation = DataGeneration(config=data_generation_config)
    data_generation.generate_files()
except Exception:
    # Record the failure (with traceback) in the project log, then re-raise
    # with a bare `raise` so the original traceback is preserved unchanged.
    logger.exception("Data generation stage failed")
    raise

[2025-06-18 14:57:32,954: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-06-18 14:57:32,954: INFO: common: created directory at: artifacts]
[2025-06-18 14:57:32,954: INFO: common: created directory at: artifacts/data_ingestion]
[2025-06-18 14:57:32,954: INFO: 1772966207: Generating data to: artifacts/data_ingestion/generated_data]
[2025-06-18 14:57:36,200: INFO: 1772966207: Generated bookings.csv, comments.csv, and likes.csv with 100,000 rows each.]
