In [1]:
import os

In [2]:
%pwd

'd:\\recommendation-engine\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\recommendation-engine'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataGenerationConfig:
    root_dir: Path
    data_dir: Path
    target_multiple_bookings: float = 1.5  # 50% more bookings than base count
    target_multiple_comments: float = 2.0   # 2x more comments than base count
    full_interaction_rate: float = 0.3 


In [6]:
from src.hybrid_recommender.constants import *
from src.hybrid_recommender.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(self):
        config_filepath = CONFIG_FILE_PATH

        self.config = read_yaml(config_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_generation_config(self) -> DataGenerationConfig:
        config = self.config.data_generation

        create_directories([config.root_dir])

        data_generation_config = DataGenerationConfig(
            root_dir=config.root_dir,
            data_dir=config.data_dir,
            
        )

        return data_generation_config

In [8]:
import os
from src.hybrid_recommender import logger
import pandas as pd
import uuid
import numpy as np
from pathlib import Path
from typing import List, Dict, Any, Tuple

In [9]:
class DataGeneration:
    def __init__(self, config: DataGenerationConfig):
        self.config = config
        
    def _generate_ids(self, count: int) -> np.ndarray:
        """Vectorized UUID generation"""
        return np.array([str(uuid.uuid4()) for _ in range(count)])

    def _generate_dataset(self, n_rows: int, id_pools: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
        """Generate base dataset with optimized numpy operations"""
        return {
            'user_id': np.random.choice(id_pools['user_ids'], n_rows),
            'event_id': np.random.choice(id_pools['event_ids'], n_rows),
            'organizer_id': np.random.choice(id_pools['organizer_ids'], n_rows)
        }

    def _create_additional_bookings(self, base_data: Dict[str, np.ndarray], id_pools: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
        """Create additional bookings with optimized vector operations"""
        current_count = len(base_data['user_id'])
        additional_count = int(current_count * (self.config.target_multiple_bookings - 1))
        
        # Select users who will have multiple bookings
        multi_book_users = np.random.choice(id_pools['user_ids'], 
                                          size=int(len(id_pools['user_ids']) * 0.4), 
                                          replace=False)
        
        # Get their existing bookings to duplicate
        user_bookings = pd.DataFrame(base_data)
        eligible = user_bookings[user_bookings['user_id'].isin(multi_book_users)]
        
        # Sample additional bookings (with replacement to allow multiple duplicates)
        additional = eligible.sample(n=additional_count, replace=True)
        
        # Combine with original
        combined = pd.concat([user_bookings, additional], ignore_index=True)
        return {col: combined[col].values for col in combined.columns}

    def _create_additional_comments(self, base_data: Dict[str, np.ndarray], id_pools: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
        """Create additional comments with optimized vector operations"""
        current_count = len(base_data['user_id'])
        additional_count = int(current_count * (self.config.target_multiple_comments - 1))
        
        # Select active commenters (users who comment more)
        active_commenters = np.random.choice(id_pools['user_ids'], 
                                           size=int(len(id_pools['user_ids']) * 0.6), 
                                           replace=False)
        
        # Get their existing comments to duplicate or create new ones
        user_comments = pd.DataFrame(base_data)
        eligible = user_comments[user_comments['user_id'].isin(active_commenters)]
        
        # Create additional comments (mix of duplicates and new comments)
        dup_count = int(additional_count * 0.7)  # 70% duplicates
        new_count = additional_count - dup_count
        
        # Duplicate existing comments
        duplicates = eligible.sample(n=dup_count, replace=True)
        
        # Create new comments for same users
        new_comments = pd.DataFrame({
            'user_id': np.random.choice(active_commenters, new_count),
            'event_id': np.random.choice(id_pools['event_ids'], new_count),
            'organizer_id': np.random.choice(id_pools['organizer_ids'], new_count)
        })
        
        # Combine all comments
        combined = pd.concat([user_comments, duplicates, new_comments], ignore_index=True)
        return {col: combined[col].values for col in combined.columns}

    def _create_full_interactions(self, booking_data: Dict[str, np.ndarray], 
                                comment_data: Dict[str, np.ndarray], 
                                like_data: Dict[str, np.ndarray]) -> Tuple[Dict, Dict, Dict]:
        """Create full interactions (book + comment + like) for selected events"""
        # Convert to DataFrames for easier manipulation
        df_book = pd.DataFrame(booking_data)
        df_comment = pd.DataFrame(comment_data)
        df_like = pd.DataFrame(like_data)
        
        # Select events for full interactions
        event_sample = df_book['event_id'].drop_duplicates().sample(frac=self.config.full_interaction_rate)
        full_interactions = df_book[df_book['event_id'].isin(event_sample)]
        
        # Add matching comments
        new_comments = full_interactions[['user_id', 'event_id', 'organizer_id']].copy()
        df_comment = pd.concat([df_comment, new_comments], ignore_index=True)
        
        # Add matching likes
        new_likes = full_interactions[['user_id', 'event_id', 'organizer_id']].copy()
        df_like = pd.concat([df_like, new_likes], ignore_index=True)
        
        return (
            {col: df_book[col].values for col in df_book.columns},
            {col: df_comment[col].values for col in df_comment.columns},
            {col: df_like[col].values for col in df_like.columns}
        )

    def generate_files(self) -> None:
        """Generate optimized datasets with increased comments and bookings"""
        if not os.path.exists(self.config.data_dir):
            os.makedirs(self.config.data_dir, exist_ok=True)
            logger.info(f"Generating enhanced data to: {self.config.data_dir}")
            
            np.random.seed(42)
            
            # Base size parameters
            base_size = 100_000
            id_pools = {
                'user_ids': self._generate_ids(5000),
                'event_ids': self._generate_ids(10000),
                'organizer_ids': self._generate_ids(1000)
            }
            
            # Generate base datasets
            booking_data = self._generate_dataset(base_size, id_pools)
            comment_data = self._generate_dataset(base_size, id_pools)
            like_data = self._generate_dataset(base_size, id_pools)
            
            # Enhance with additional bookings and comments
            booking_data = self._create_additional_bookings(booking_data, id_pools)
            comment_data = self._create_additional_comments(comment_data, id_pools)
            
            # Create full interactions
            booking_data, comment_data, like_data = self._create_full_interactions(
                booking_data, comment_data, like_data)
            
            # Add unique IDs using vectorized approach
            datasets = {
                'bookings': (booking_data, {'event_id': 'booked_event_id', 'organizer_id': 'booked_event_organizer_id'}),
                'comments': (comment_data, {'event_id': 'commented_event_id', 'organizer_id': 'commented_event_organizer_id'}),
                'likes': (like_data, {'event_id': 'liked_event_id', 'organizer_id': 'liked_event_organizer_id'})
            }
            
            for name, (data, col_map) in datasets.items():
                data[f'{name[:-1]}_id'] = self._generate_ids(len(data['user_id']))
                df = pd.DataFrame(data).rename(columns=col_map)
                df.to_csv(os.path.join(self.config.data_dir, f'{name}.csv'), index=False)
            
            logger.info(f"Generated enhanced datasets with:")
            logger.info(f"- Bookings: {len(booking_data['user_id'])} (Target: {base_size * self.config.target_multiple_bookings:.0f})")
            logger.info(f"- Comments: {len(comment_data['user_id'])} (Target: {base_size * self.config.target_multiple_comments:.0f})")
            logger.info(f"- Likes: {len(like_data['user_id'])}")
            logger.info(f"- Full interactions: {int(base_size * self.config.full_interaction_rate)} events")
        else:
            logger.info(f"Files already exist in: {self.config.data_dir}")

In [10]:
try:
    config = ConfigurationManager()
    data_generation_config = config.get_data_generation_config()
    data_generation = DataGeneration(config=data_generation_config)
    data_generation.generate_files()
except Exception as e:
    raise e

[2025-06-30 16:36:20,790: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-06-30 16:36:20,790: INFO: common: created directory at: artifacts]
[2025-06-30 16:36:20,790: INFO: common: created directory at: artifacts/data_ingestion]
[2025-06-30 16:36:20,798: INFO: 964962886: Generating enhanced data to: artifacts/data_ingestion/generated_data]
[2025-06-30 16:36:25,665: INFO: 964962886: Generated enhanced datasets with:]
[2025-06-30 16:36:25,665: INFO: 964962886: - Bookings: 150000 (Target: 150000)]
[2025-06-30 16:36:25,665: INFO: 964962886: - Comments: 245397 (Target: 200000)]
[2025-06-30 16:36:25,665: INFO: 964962886: - Likes: 145397]
[2025-06-30 16:36:25,665: INFO: 964962886: - Full interactions: 30000 events]
