## cluade

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime
import csv

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Define constants
TOTAL_RECORDS = 100000
RECOMMENDATIONS_PER_USER = 10

# Age distribution
AGE_DISTRIBUTION = {
    (18, 25): 0.25,
    (26, 35): 0.35,
    (36, 45): 0.25,
    (46, 60): 0.12,
    (61, 65): 0.03
}

# Gender distribution
GENDER_DISTRIBUTION = {'Male': 0.52, 'Female': 0.48}

# Geographic distribution
TIER1_CITIES = {
    'Mumbai': 'Maharashtra', 'Delhi': 'Delhi', 'Bangalore': 'Karnataka',
    'Hyderabad': 'Telangana', 'Chennai': 'Tamil Nadu', 'Pune': 'Maharashtra',
    'Kolkata': 'West Bengal', 'Ahmedabad': 'Gujarat', 'Surat': 'Gujarat',
    'Jaipur': 'Rajasthan'
}

TIER2_CITIES = {
    'Lucknow': 'Uttar Pradesh', 'Kanpur': 'Uttar Pradesh', 'Nagpur': 'Maharashtra',
    'Indore': 'Madhya Pradesh', 'Bhopal': 'Madhya Pradesh', 'Visakhapatnam': 'Andhra Pradesh',
    'Patna': 'Bihar', 'Vadodara': 'Gujarat', 'Coimbatore': 'Tamil Nadu', 'Agra': 'Uttar Pradesh'
}

TIER3_CITIES = {
    'Rajkot': 'Gujarat', 'Meerut': 'Uttar Pradesh', 'Nashik': 'Maharashtra',
    'Faridabad': 'Haryana', 'Ghaziabad': 'Uttar Pradesh', 'Jabalpur': 'Madhya Pradesh',
    'Ranchi': 'Jharkhand', 'Mysore': 'Karnataka', 'Jodhpur': 'Rajasthan', 'Kota': 'Rajasthan'
}

# Product categories and brands
PRODUCTS = {
    'Electronics': {
        'categories': ['Smartphones', 'Laptops', 'Headphones', 'Smart Watches', 'Tablets', 'Cameras'],
        'brands': ['Samsung', 'Apple', 'OnePlus', 'Xiaomi', 'Realme', 'Vivo', 'HP', 'Dell', 'Lenovo', 'Sony', 'JBL', 'Boat'],
        'price_ranges': {
            'Smartphones': (8000, 120000),
            'Laptops': (25000, 150000),
            'Headphones': (500, 25000),
            'Smart Watches': (2000, 45000),
            'Tablets': (8000, 80000),
            'Cameras': (15000, 200000)
        }
    },
    'Fashion': {
        'categories': ['T-shirts', 'Jeans', 'Kurtas', 'Sarees', 'Dresses', 'Footwear', 'Ethnic Wear'],
        'brands': ['H&M', 'Zara', 'Nike', 'Adidas', 'Puma', 'Fabindia', 'W', 'Biba', 'Allen Solly', 'Van Heusen', 'Levis', 'Pepe Jeans'],
        'price_ranges': {
            'T-shirts': (300, 3000),
            'Jeans': (800, 8000),
            'Kurtas': (500, 5000),
            'Sarees': (1000, 15000),
            'Dresses': (800, 8000),
            'Footwear': (500, 12000),
            'Ethnic Wear': (800, 10000)
        }
    },
    'Home': {
        'categories': ['Furniture', 'Kitchen Appliances', 'Home Decor', 'Bedding', 'Storage Solutions'],
        'brands': ['IKEA', 'Godrej', 'Whirlpool', 'LG', 'Prestige', 'Pigeon', 'Home Centre', 'Fabfurnish'],
        'price_ranges': {
            'Furniture': (2000, 50000),
            'Kitchen Appliances': (1000, 25000),
            'Home Decor': (300, 8000),
            'Bedding': (500, 5000),
            'Storage Solutions': (400, 4000)
        }
    },
    'Beauty': {
        'categories': ['Skincare', 'Makeup', 'Haircare', 'Perfumes', 'Grooming Products'],
        'brands': ['Lakme', 'Maybelline', 'LOreal', 'Nivea', 'Dove', 'Garnier', 'Biotique', 'Mamaearth', 'Nykaa', 'Sugar Cosmetics'],
        'price_ranges': {
            'Skincare': (200, 3000),
            'Makeup': (150, 2500),
            'Haircare': (100, 1500),
            'Perfumes': (300, 5000),
            'Grooming Products': (100, 2000)
        }
    },
    'Books': {
        'categories': ['Fiction', 'Non-fiction', 'Academic', 'Self-help', 'Childrens Books'],
        'brands': ['Penguin', 'Harper Collins', 'Classmate', 'Reynolds', 'Parker'],
        'price_ranges': {
            'Fiction': (200, 800),
            'Non-fiction': (300, 1200),
            'Academic': (400, 2000),
            'Self-help': (250, 900),
            'Childrens Books': (150, 600)
        }
    },
    'Sports': {
        'categories': ['Gym Equipment', 'Sportswear', 'Outdoor Gear', 'Yoga Accessories'],
        'brands': ['Nike', 'Adidas', 'Puma', 'Reebok', 'Decathlon', 'Nivia', 'Cosco'],
        'price_ranges': {
            'Gym Equipment': (500, 15000),
            'Sportswear': (400, 4000),
            'Outdoor Gear': (800, 12000),
            'Yoga Accessories': (200, 2000)
        }
    },
    'Grocery': {
        'categories': ['Snacks', 'Beverages', 'Organic Products', 'Staples'],
        'brands': ['Britannia', 'Parle', 'Nestle', 'Amul', 'Tata', 'ITC', 'Patanjali', 'Organic India'],
        'price_ranges': {
            'Snacks': (20, 500),
            'Beverages': (25, 300),
            'Organic Products': (100, 1000),
            'Staples': (50, 800)
        }
    }
}

# Demographic preferences
AGE_PREFERENCES = {
    (18, 25): {'Electronics': 0.30, 'Fashion': 0.40, 'Beauty': 0.20, 'Books': 0.10},
    (26, 35): {'Electronics': 0.25, 'Fashion': 0.30, 'Home': 0.20, 'Sports': 0.15, 'Beauty': 0.10},
    (36, 45): {'Home': 0.35, 'Electronics': 0.20, 'Fashion': 0.20, 'Grocery': 0.15, 'Sports': 0.10},
    (46, 60): {'Home': 0.30, 'Grocery': 0.25, 'Electronics': 0.15, 'Fashion': 0.15, 'Books': 0.15},
    (61, 65): {'Grocery': 0.40, 'Home': 0.25, 'Books': 0.20, 'Electronics': 0.10, 'Beauty': 0.05}
}

GENDER_PREFERENCES = {
    'Male': {'Electronics': 0.30, 'Sports': 0.20, 'Fashion': 0.25, 'Home': 0.15, 'Books': 0.10},
    'Female': {'Fashion': 0.35, 'Beauty': 0.25, 'Home': 0.20, 'Electronics': 0.10, 'Books': 0.10}
}

def get_age_range(age):
    for age_range in AGE_PREFERENCES.keys():
        if age_range[0] <= age <= age_range[1]:
            return age_range
    return (18, 25)  # default

def generate_demographics():
    """Generate user demographics based on specified distributions"""
    users = []
    
    # Calculate city distributions
    tier1_count = int(TOTAL_RECORDS * 0.40)
    tier2_count = int(TOTAL_RECORDS * 0.35)
    tier3_count = TOTAL_RECORDS - tier1_count - tier2_count
    
    # Generate city assignments
    cities = []
    cities.extend([('Tier1', city, state) for city, state in TIER1_CITIES.items()] * (tier1_count // len(TIER1_CITIES) + 1))
    cities.extend([('Tier2', city, state) for city, state in TIER2_CITIES.items()] * (tier2_count // len(TIER2_CITIES) + 1))
    cities.extend([('Tier3', city, state) for city, state in TIER3_CITIES.items()] * (tier3_count // len(TIER3_CITIES) + 1))
    
    # Shuffle and trim to exact count
    random.shuffle(cities)
    cities = cities[:TOTAL_RECORDS]
    
    for i in range(TOTAL_RECORDS):
        # Generate age based on distribution
        age_ranges = list(AGE_DISTRIBUTION.keys())
        age_weights = list(AGE_DISTRIBUTION.values())
        selected_range = np.random.choice(len(age_ranges), p=age_weights)
        age_min, age_max = age_ranges[selected_range]
        age = random.randint(age_min, age_max)
        
        # Generate gender
        gender = np.random.choice(['Male', 'Female'], p=[0.52, 0.48])
        
        # Get city info
        tier, city, state = cities[i]
        
        users.append({
            'user_id': f'USER_{i+1:06d}',
            'age': age,
            'gender': gender,
            'state': state,
            'city': city,
            'tier': tier
        })
    
    return users

def get_category_preferences(age, gender, tier):
    """Calculate category preferences based on demographics"""
    age_range = get_age_range(age)
    
    # Start with age preferences
    preferences = AGE_PREFERENCES.get(age_range, {}).copy()
    
    # Adjust based on gender
    gender_prefs = GENDER_PREFERENCES.get(gender, {})
    for category, weight in gender_prefs.items():
        if category in preferences:
            preferences[category] = (preferences[category] + weight) / 2
        else:
            preferences[category] = weight * 0.5
    
    # Adjust based on tier (premium vs budget)
    tier_multiplier = {'Tier1': 1.2, 'Tier2': 1.0, 'Tier3': 0.8}
    premium_categories = ['Electronics', 'Beauty', 'Fashion']
    
    for category in premium_categories:
        if category in preferences:
            preferences[category] *= tier_multiplier[tier]
    
    # Normalize preferences
    total = sum(preferences.values())
    if total > 0:
        preferences = {k: v/total for k, v in preferences.items()}
    
    return preferences

def generate_product_name(category, brand, tier):
    """Generate realistic product names"""
    product_templates = {
        'Smartphones': ['{brand} Galaxy {model}', '{brand} {model} Pro', '{brand} {model} 5G'],
        'Laptops': ['{brand} {model} Laptop', '{brand} {model} Series', '{brand} {model} Pro'],
        'Headphones': ['{brand} {model} Wireless', '{brand} {model} Pro', '{brand} {model} Studio'],
        'Smart Watches': ['{brand} Watch {model}', '{brand} {model} Smart', '{brand} Fit {model}'],
        'Tablets': ['{brand} Tab {model}', '{brand} {model} Tablet', '{brand} Pad {model}'],
        'Cameras': ['{brand} {model} DSLR', '{brand} {model} Camera', '{brand} {model} Pro'],
        'T-shirts': ['{brand} {model} Tee', '{brand} {model} T-shirt', '{brand} Cotton {model}'],
        'Jeans': ['{brand} {model} Jeans', '{brand} Slim {model}', '{brand} {model} Fit'],
        'Kurtas': ['{brand} Cotton Kurta', '{brand} {model} Kurta', '{brand} Traditional {model}'],
        'Sarees': ['{brand} Silk Saree', '{brand} {model} Saree', '{brand} Traditional {model}'],
        'Dresses': ['{brand} {model} Dress', '{brand} Summer {model}', '{brand} {model} Collection'],
        'Footwear': ['{brand} {model} Shoes', '{brand} {model} Sneakers', '{brand} {model} Collection'],
        'Ethnic Wear': ['{brand} {model} Set', '{brand} Traditional {model}', '{brand} Ethnic {model}']
    }
    
    # Default template for categories not specified
    default_template = '{brand} {model}'
    template = random.choice(product_templates.get(category, [default_template]))
    
    # Generate model names based on tier
    if tier == 'Tier1':
        models = ['Premium', 'Elite', 'Pro Max', 'Ultra', 'Platinum', 'Signature']
    elif tier == 'Tier2':
        models = ['Classic', 'Standard', 'Pro', 'Plus', 'Essential', 'Prime']
    else:
        models = ['Basic', 'Lite', 'Standard', 'Essential', 'Value', 'Core']
    
    model = random.choice(models)
    return template.format(brand=brand, model=model)

def generate_price(category, tier, base_price_range):
    """Generate realistic prices based on tier and category"""
    min_price, max_price = base_price_range
    
    # Tier-based price adjustment
    tier_multipliers = {
        'Tier1': (1.2, 1.5),  # Premium pricing
        'Tier2': (0.9, 1.2),  # Standard pricing
        'Tier3': (0.6, 0.9)   # Budget pricing
    }
    
    tier_min, tier_max = tier_multipliers[tier]
    adjusted_min = int(min_price * tier_min)
    adjusted_max = int(max_price * tier_max)
    
    # Add some randomness
    price = random.randint(adjusted_min, adjusted_max)
    
    # Round to nearest 10 for realistic pricing
    return round(price / 10) * 10

def generate_recommendations(user):
    """Generate 10 product recommendations for a user"""
    age = user['age']
    gender = user['gender']
    tier = user['tier']
    
    preferences = get_category_preferences(age, gender, tier)
    
    recommendations = []
    used_products = set()  # Avoid duplicate products
    
    # Generate recommendations based on preferences
    categories = list(PRODUCTS.keys())
    
    for i in range(RECOMMENDATIONS_PER_USER):
        # Select category based on preferences
        if preferences:
            category_weights = [preferences.get(cat, 0.01) for cat in categories]
            selected_category = np.random.choice(categories, p=np.array(category_weights)/sum(category_weights))
        else:
            selected_category = random.choice(categories)
        
        # Select subcategory and brand
        subcategories = PRODUCTS[selected_category]['categories']
        brands = PRODUCTS[selected_category]['brands']
        
        subcategory = random.choice(subcategories)
        brand = random.choice(brands)
        
        # Generate unique product name
        attempts = 0
        while attempts < 10:  # Avoid infinite loop
            product_name = generate_product_name(subcategory, brand, tier)
            if product_name not in used_products:
                used_products.add(product_name)
                break
            attempts += 1
        
        # Generate price
        price_range = PRODUCTS[selected_category]['price_ranges'].get(subcategory, (100, 1000))
        price = generate_price(subcategory, tier, price_range)
        
        # Generate recommendation score (higher for preferred categories)
        base_score = preferences.get(selected_category, 0.1)
        score = min(1.0, base_score + random.uniform(-0.1, 0.2))
        score = max(0.1, score)  # Ensure minimum score
        score = round(score, 2)
        
        recommendations.append({
            'product': product_name,
            'category': subcategory,
            'brand': brand,
            'price': price,
            'score': score
        })
    
    return recommendations

def create_dataset():
    """Create the complete dataset"""
    print("Generating user demographics...")
    users = generate_demographics()
    
    print("Generating product recommendations...")
    dataset = []
    
    for i, user in enumerate(users):
        if i % 10000 == 0:
            print(f"Processing user {i+1}/{TOTAL_RECORDS}")
        
        recommendations = generate_recommendations(user)
        
        # Create row with all data
        row = {
            'user_id': user['user_id'],
            'age': user['age'],
            'gender': user['gender'],
            'state': user['state'],
            'city': user['city'],
            'tier': user['tier']
        }
        
        # Add recommendations
        for j, rec in enumerate(recommendations, 1):
            row[f'product_{j}'] = rec['product']
            row[f'category_{j}'] = rec['category']
            row[f'brand_{j}'] = rec['brand']
            row[f'price_{j}'] = rec['price']
            row[f'recommendation_score_{j}'] = rec['score']
        
        dataset.append(row)
    
    return dataset

def save_dataset(dataset, filename='product_recommendation_dataset.csv'):
    """Save dataset to CSV file"""
    print(f"Saving dataset to {filename}...")
    
    # Define column order
    columns = ['user_id', 'age', 'gender', 'state', 'city', 'tier']
    
    # Add product columns
    for i in range(1, RECOMMENDATIONS_PER_USER + 1):
        columns.extend([
            f'product_{i}', f'category_{i}', f'brand_{i}', 
            f'price_{i}', f'recommendation_score_{i}'
        ])
    
    df = pd.DataFrame(dataset)
    df = df[columns]  # Ensure column order
    df.to_csv(filename, index=False, encoding='utf-8')
    
    print(f"Dataset saved successfully with {len(dataset)} records!")
    
    # Print sample statistics
    print(f"\nDataset Statistics:")
    print(f"Total records: {len(df)}")
    print(f"Age distribution:")
    print(df['age'].value_counts().sort_index().head(10))
    print(f"\nGender distribution:")
    print(df['gender'].value_counts())
    print(f"\nTier distribution:")
    print(df['tier'].value_counts())
    print(f"\nSample records:")
    print(df.head(3).to_string())

if __name__ == "__main__":
    # Generate the dataset
    dataset = create_dataset()
    
    # Save to CSV
    save_dataset(dataset)
    
    print("\nDataset generation completed successfully!")
    print("The dataset is ready for use in building recommendation systems.")
    print("Features included:")
    print("- Realistic demographic distributions")
    print("- Location-based preferences")
    print("- Age and gender-based product preferences")
    print("- Tier-based pricing")
    print("- 10 personalized recommendations per user")
    print("- Ready for ML model training")

Generating user demographics...
Generating product recommendations...
Processing user 1/100000
Processing user 10001/100000
Processing user 20001/100000
Processing user 30001/100000
Processing user 40001/100000
Processing user 50001/100000
Processing user 60001/100000
Processing user 70001/100000
Processing user 80001/100000
Processing user 90001/100000
Saving dataset to product_recommendation_dataset.csv...
Dataset saved successfully with 100000 records!

Dataset Statistics:
Total records: 100000
Age distribution:
age
18    3056
19    3176
20    3165
21    3211
22    3089
23    3068
24    3020
25    3174
26    3484
27    3479
Name: count, dtype: int64

Gender distribution:
gender
Male      51836
Female    48164
Name: count, dtype: int64

Tier distribution:
tier
Tier1    39995
Tier2    35005
Tier3    25000
Name: count, dtype: int64

Sample records:
       user_id  age  gender           state       city   tier              product_1     category_1    brand_1  price_1  recommendation_sco

## Grok

In [2]:
import pandas as pd
import numpy as np
import random
from uuid import uuid4

# Constants and Distributions
NUM_USERS = 100000
AGE_GROUPS = {
    "18-25": (0.25, 18, 25), "26-35": (0.35, 26, 35), "36-45": (0.25, 36, 45),
    "46-60": (0.12, 46, 60), "61-65": (0.03, 61, 65)
}
GENDER_DISTRIBUTION = {"Male": 0.52, "Female": 0.48}
TIER_DISTRIBUTION = {"Tier1": 0.4, "Tier2": 0.35, "Tier3": 0.25}

# City and State Mappings by Tier
TIER_CITIES = {
    "Tier1": [("Mumbai", "Maharashtra"), ("Delhi", "Delhi"), ("Bangalore", "Karnataka"),
              ("Hyderabad", "Telangana"), ("Chennai", "Tamil Nadu"), ("Pune", "Maharashtra"),
              ("Kolkata", "West Bengal"), ("Ahmedabad", "Gujarat"), ("Surat", "Gujarat"),
              ("Jaipur", "Rajasthan")],
    "Tier2": [("Lucknow", "Uttar Pradesh"), ("Kanpur", "Uttar Pradesh"), ("Nagpur", "Maharashtra"),
              ("Indore", "Madhya Pradesh"), ("Bhopal", "Madhya Pradesh"), ("Visakhapatnam", "Andhra Pradesh"),
              ("Patna", "Bihar"), ("Vadodara", "Gujarat"), ("Coimbatore", "Tamil Nadu"), ("Agra", "Uttar Pradesh")],
    "Tier3": [("Rajkot", "Gujarat"), ("Meerut", "Uttar Pradesh"), ("Nashik", "Maharashtra"),
              ("Faridabad", "Haryana"), ("Ghaziabad", "Uttar Pradesh"), ("Jabalpur", "Madhya Pradesh"),
              ("Ranchi", "Jharkhand"), ("Mysore", "Karnataka"), ("Jodhpur", "Rajasthan"), ("Kota", "Rajasthan")]
}

# Demographic-Based Category Preferences
AGE_GROUP_CATEGORIES = {
    "18-25": {"Electronics": 0.3, "Fashion": 0.4, "Beauty": 0.2, "Books": 0.1, "Home": 0, "Sports": 0, "Grocery": 0},
    "26-35": {"Electronics": 0.25, "Fashion": 0.3, "Home": 0.2, "Sports": 0.15, "Beauty": 0.1, "Books": 0, "Grocery": 0},
    "36-45": {"Home": 0.35, "Electronics": 0.2, "Fashion": 0.2, "Grocery": 0.15, "Sports": 0.1, "Beauty": 0, "Books": 0},
    "46-60": {"Home": 0.3, "Grocery": 0.25, "Electronics": 0.15, "Fashion": 0.15, "Books": 0.15, "Beauty": 0, "Sports": 0},
    "61-65": {"Grocery": 0.4, "Home": 0.25, "Books": 0.2, "Electronics": 0.1, "Beauty": 0.05, "Fashion": 0, "Sports": 0}
}
GENDER_CATEGORIES = {
    "Male": {"Electronics": 0.3, "Sports": 0.2, "Fashion": 0.25, "Home": 0.15, "Books": 0.1, "Beauty": 0, "Grocery": 0},
    "Female": {"Fashion": 0.35, "Beauty": 0.25, "Home": 0.2, "Electronics": 0.1, "Books": 0.1, "Sports": 0, "Grocery": 0}
}

# Tier-Based Price Range Preferences
TIER_PRICE_RANGES = {
    "Tier1": {"Budget": 0.1, "Mid-range": 0.3, "Premium": 0.4, "Luxury": 0.2},
    "Tier2": {"Budget": 0.3, "Mid-range": 0.4, "Premium": 0.2, "Luxury": 0.1},
    "Tier3": {"Budget": 0.5, "Mid-range": 0.3, "Premium": 0.15, "Luxury": 0.05}
}
PRICE_RANGE_LIMITS = {
    "Budget": (0, 1000), "Mid-range": (1001, 5000), "Premium": (5001, 15000), "Luxury": (15001, 100000)
}

# Product Categories and Details
PRODUCT_CATEGORIES = {
    "Electronics": {
        "brands": ["Samsung", "Apple", "OnePlus", "Xiaomi", "Realme", "Vivo", "HP", "Dell", "Lenovo", "Sony", "JBL", "Boat"],
        "types": ["Smartphone", "Laptop", "Headphones", "Smart Watch", "Tablet", "Camera"],
        "price_dist": {"Budget": 0.1, "Mid-range": 0.4, "Premium": 0.4, "Luxury": 0.1},
        "gender": "Both"
    },
    "Fashion": {
        "brands": ["H&M", "Zara", "Nike", "Adidas", "Puma", "Fabindia", "W", "Biba", "Allen Solly", "Van Heusen", "Levi's", "Pepe Jeans"],
        "types": ["Men's T-shirt", "Women's T-shirt", "Men's Jeans", "Women's Jeans", "Men's Kurta", "Women's Kurta", "Saree", "Dress", "Men's Footwear", "Women's Footwear"],
        "price_dist": {"Budget": 0.2, "Mid-range": 0.5, "Premium": 0.2, "Luxury": 0.1},
        "gender": "Specific"
    },
    "Home": {
        "brands": ["IKEA", "Godrej", "Whirlpool", "LG", "Prestige", "Pigeon", "Home Centre", "Fabfurnish"],
        "types": ["Furniture", "Kitchen Appliance", "Home Decor", "Bedding", "Storage Solution"],
        "price_dist": {"Budget": 0.15, "Mid-range": 0.45, "Premium": 0.3, "Luxury": 0.1},
        "gender": "Both"
    },
    "Beauty": {
        "brands": ["Lakme", "Maybelline", "L'Oreal", "Nivea", "Dove", "Garnier", "Biotique", "Mamaearth", "Nykaa", "Sugar Cosmetics"],
        "types": ["Men's Skincare", "Women's Skincare", "Men's Haircare", "Women's Haircare", "Women's Makeup", "Men's Grooming Kit", "Men's Perfume", "Women's Perfume"],
        "price_dist": {"Budget": 0.3, "Mid-range": 0.4, "Premium": 0.2, "Luxury": 0.1},
        "gender": "Specific"
    },
    "Books": {
        "brands": ["Penguin", "Harper Collins", "Classmate", "Reynolds", "Parker"],
        "types": ["Fiction", "Non-fiction", "Academic", "Self-help", "Children's Book"],
        "price_dist": {"Budget": 0.5, "Mid-range": 0.4, "Premium": 0.1, "Luxury": 0},
        "gender": "Both"
    },
    "Sports": {
        "brands": ["Nike", "Adidas", "Puma", "Reebok", "Decathlon", "Nivia", "Cosco"],
        "types": ["Gym Equipment", "Sportswear", "Outdoor Gear", "Yoga Accessories"],
        "price_dist": {"Budget": 0.2, "Mid-range": 0.5, "Premium": 0.2, "Luxury": 0.1},
        "gender": "Both"
    },
    "Grocery": {
        "brands": ["Britannia", "Parle", "Nestle", "Amul", "Tata", "ITC", "Patanjali", "Organic India"],
        "types": ["Snacks", "Beverages", "Organic Product", "Staples"],
        "price_dist": {"Budget": 0.6, "Mid-range": 0.3, "Premium": 0.1, "Luxury": 0},
        "gender": "Both"
    }
}

# Generate Product Pool
def generate_product_pool():
    products = []
    for category, details in PRODUCT_CATEGORIES.items():
        for _ in range(1000):  # Generate 1000 products per category
            brand = random.choice(details["brands"])
            product_type = random.choice(details["types"])
            price_range = np.random.choice(list(details["price_dist"].keys()), p=list(details["price_dist"].values()))
            min_price, max_price = PRICE_RANGE_LIMITS[price_range]
            price = round(random.uniform(min_price, max_price), 2)
            name = f"{brand} {product_type}"
            gender_target = "Male" if "Men's" in product_type else "Female" if "Women's" in product_type or "Saree" in product_type else "Both"
            products.append({"category": category, "brand": brand, "name": name, "price": price, "gender_target": gender_target})
    return products

PRODUCT_POOL = generate_product_pool()

# Helper Functions
def get_age_group(age):
    for group, (_, min_age, max_age) in AGE_GROUPS.items():
        if min_age <= age <= max_age:
            return group
    return None

def compute_category_probs(age_group, gender):
    age_probs = AGE_GROUP_CATEGORIES[age_group]
    gender_probs = GENDER_CATEGORIES[gender]
    categories = set(age_probs.keys()).union(gender_probs.keys())
    probs = {cat: (age_probs.get(cat, 0) + gender_probs.get(cat, 0)) / 2 for cat in categories}
    return probs

def generate_recommendations(tier, gender, category_probs):
    recs = []
    used_products = set()
    categories = list(category_probs.keys())
    probs = list(category_probs.values())
    for _ in range(10):
        category = np.random.choice(categories, p=probs)
        price_range = np.random.choice(list(TIER_PRICE_RANGES[tier].keys()), p=list(TIER_PRICE_RANGES[tier].values()))
        min_price, max_price = PRICE_RANGE_LIMITS[price_range]
        valid_products = [p for p in PRODUCT_POOL if p["category"] == category and 
                          min_price <= p["price"] <= max_price and 
                          (p["gender_target"] == gender or p["gender_target"] == "Both") and 
                          p["name"] not in used_products]
        if not valid_products:
            valid_products = [p for p in PRODUCT_POOL if p["category"] == category and 
                              (p["gender_target"] == gender or p["gender_target"] == "Both") and 
                              p["name"] not in used_products]
        product = random.choice(valid_products) if valid_products else random.choice(PRODUCT_POOL)
        used_products.add(product["name"])
        recs.append({
            "product": product["name"], "category": category, "brand": product["brand"],
            "price": product["price"], "score": round(random.uniform(0.1, 1.0), 2)
        })
    return recs

# Generate Dataset
data = []
for i in range(NUM_USERS):
    user_id = f"USER_{str(i+1).zfill(6)}"
    age_group = np.random.choice(list(AGE_GROUPS.keys()), p=[p[0] for p in AGE_GROUPS.values()])
    age = random.randint(AGE_GROUPS[age_group][1], AGE_GROUPS[age_group][2])
    gender = np.random.choice(list(GENDER_DISTRIBUTION.keys()), p=list(GENDER_DISTRIBUTION.values()))
    tier = np.random.choice(list(TIER_DISTRIBUTION.keys()), p=list(TIER_DISTRIBUTION.values()))
    city, state = random.choice(TIER_CITIES[tier])
    
    category_probs = compute_category_probs(age_group, gender)
    recs = generate_recommendations(tier, gender, category_probs)
    
    row = [user_id, age, gender, state, city, tier]
    for j in range(10):
        row.extend([recs[j]["product"], recs[j]["category"], recs[j]["brand"], recs[j]["price"], recs[j]["score"]])
    data.append(row)

# Create DataFrame
columns = ["user_id", "age", "gender", "state", "city", "tier"]
for i in range(1, 11):
    columns.extend([f"product_{i}", f"category_{i}", f"brand_{i}", f"price_{i}", f"recommendation_score_{i}"])
df = pd.DataFrame(data, columns=columns)

# Save to CSV
df.to_csv("grok_recommendation_dataset.csv", index=False, encoding="utf-8")

print("Dataset generated and saved as 'grok_recommendation_dataset.csv'")

Dataset generated and saved as 'grok_recommendation_dataset.csv'


## LLAMA

In [3]:
import pandas as pd
import numpy as np
import random

# Define constants
TOTAL_RECORDS = 100000
AGE_GROUPS = [(18, 25), (26, 35), (36, 45), (46, 60), (60, 65)]
GENDER_DISTRIBUTION = [0.52, 0.48]
TIER_DISTRIBUTION = [0.4, 0.35, 0.25]
CATEGORIES = ['Electronics', 'Fashion & Apparel', 'Home & Kitchen', 'Beauty & Personal Care', 'Books & Stationery', 'Sports & Fitness', 'Grocery & Food']
BRANDS = {
    'Electronics': ['Samsung', 'Apple', 'OnePlus', 'Xiaomi', 'Realme', 'Vivo', 'HP', 'Dell', 'Lenovo', 'Sony', 'JBL', 'Boat'],
    'Fashion & Apparel': ['H&M', 'Zara', 'Nike', 'Adidas', 'Puma', 'Fabindia', 'W', 'Biba', 'Allen Solly', 'Van Heusen', 'Levi\'s', 'Pepe Jeans'],
    'Home & Kitchen': ['IKEA', 'Godrej', 'Whirlpool', 'LG', 'Prestige', 'Pigeon', 'Home Centre', 'Fabfurnish'],
    'Beauty & Personal Care': ['Lakme', 'Maybelline', 'L\'Oreal', 'Nivea', 'Dove', 'Garnier', 'Biotique', 'Mamaearth', 'Nykaa', 'Sugar Cosmetics'],
    'Books & Stationery': ['Penguin', 'Harper Collins', 'Classmate', 'Reynolds', 'Parker'],
    'Sports & Fitness': ['Nike', 'Adidas', 'Puma', 'Reebok', 'Decathlon', 'Nivia', 'Cosco'],
    'Grocery & Food': ['Britannia', 'Parle', 'Nestle', 'Amul', 'Tata', 'ITC', 'Patanjali', 'Organic India']
}
PRICE_RANGES = [(0, 1000), (1001, 5000), (5001, 15000), (15001, float('inf'))]
PRICE_DISTRIBUTION = [0.3, 0.4, 0.2, 0.1]

# Define city and state names
states = ['Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chhattisgarh', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura', 'Uttarakhand', 'Uttar Pradesh', 'West Bengal']
cities = {
    'Tier 1': ['Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Chennai', 'Pune', 'Kolkata', 'Ahmedabad', 'Surat', 'Jaipur'],
    'Tier 2': ['Lucknow', 'Kanpur', 'Nagpur', 'Indore', 'Bhopal', 'Visakhapatnam', 'Patna', 'Vadodara', 'Coimbatore', 'Agra'],
    'Tier 3': ['Rajkot', 'Meerut', 'Nashik', 'Faridabad', 'Ghaziabad', 'Jabalpur', 'Ranchi', 'Mysore', 'Jodhpur', 'Kota']
}

# Function to generate user demographics
def generate_user_demographics():
    user_id = f'USER_{str(random.randint(1, TOTAL_RECORDS)).zfill(6)}'
    age_group = random.choices(AGE_GROUPS, weights=[0.25, 0.35, 0.25, 0.12, 0.03])[0]
    age = random.randint(age_group[0], age_group[1])
    gender = 'Male' if random.random() < GENDER_DISTRIBUTION[0] else 'Female'
    state = random.choice(states)
    city_tier = random.choices(list(cities.keys()), weights=TIER_DISTRIBUTION)[0]
    city = random.choice(cities[city_tier])
    return user_id, age, gender, state, city, city_tier

# Function to generate product recommendations
def generate_product_recommendations(age, gender, city_tier):
    recommendations = []
    for _ in range(10):
        category = random.choice(CATEGORIES)
        brand = random.choice(BRANDS[category])
        price_range = random.choices(PRICE_RANGES, weights=PRICE_DISTRIBUTION)[0]
        price = round(random.uniform(price_range[0], price_range[1]), 2)
        product_name = f'{brand} {random.choice(["Product", "Item", "Device", "Accessory"])}'
        recommendation_score = round(random.uniform(0.1, 1.0), 2)
        recommendations.append({
            'product': product_name,
            'category': category,
            'brand': brand,
            'price': price,
            'recommendation_score': recommendation_score
        })
    return recommendations

# Generate dataset
data = []
for _ in range(TOTAL_RECORDS):
    user_id, age, gender, state, city, city_tier = generate_user_demographics()
    recommendations = generate_product_recommendations(age, gender, city_tier)
    row = {
        'user_id': user_id,
        'age': age,
        'gender': gender,
        'state': state,
        'city': city,
        'tier': city_tier
    }
    for i, recommendation in enumerate(recommendations):
        row[f'product_{i+1}'] = recommendation['product']
        row[f'category_{i+1}'] = recommendation['category']
        row[f'brand_{i+1}'] = recommendation['brand']
        row[f'price_{i+1}'] = recommendation['price']
        row[f'recommendation_score_{i+1}'] = recommendation['recommendation_score']
    data.append(row)

# Convert to DataFrame and save to CSV
df = pd.DataFrame(data)
df.to_csv('llama_product_recommendation_dataset.csv', index=False)


## GPT

In [7]:
!python /home/chichi/RS_ML/gpt_data_gen.py --rows 100000

...10,000 rows generated
...20,000 rows generated
...30,000 rows generated
...40,000 rows generated
...50,000 rows generated
...60,000 rows generated
...70,000 rows generated
...80,000 rows generated
...90,000 rows generated
...100,000 rows generated

✅ Finished 100,000 rows in 9.0s → 'gpt_product_recommendations.csv'


In [10]:
import pandas as pd

claude = pd.read_csv('/home/chichi/RS_ML/product_recommendation_dataset.csv')
llama = pd.read_csv('/home/chichi/RS_ML/llama_product_recommendation_dataset.csv')
gpt = pd.read_csv('/home/chichi/RS_ML/gpt_product_recommendations.csv')
grok = pd.read_csv('/home/chichi/RS_ML/grok_recommendation_dataset.csv')

In [11]:
claude.head()

Unnamed: 0,user_id,age,gender,state,city,tier,product_1,category_1,brand_1,price_1,...,product_9,category_9,brand_9,price_9,recommendation_score_9,product_10,category_10,brand_10,price_10,recommendation_score_10
0,USER_000001,34,Female,Madhya Pradesh,Indore,Tier2,H&M Essential Shoes,Footwear,H&M,6700,...,Home Centre Standard,Furniture,Home Centre,6280,0.29,Lenovo Tab Classic,Tablets,Lenovo,14140,0.23
1,USER_000002,43,Female,Gujarat,Ahmedabad,Tier1,Lenovo Platinum Smart,Smart Watches,Lenovo,30380,...,W Pro Max Kurta,Kurtas,W,3230,0.31,Pepe Jeans Traditional Signature,Kurtas,Pepe Jeans,6080,0.45
2,USER_000003,19,Male,Uttar Pradesh,Kanpur,Tier2,Mamaearth Pro,Haircare,Mamaearth,130,...,JBL Standard Studio,Headphones,JBL,5220,0.46,Apple Watch Essential,Smart Watches,Apple,43220,0.43
3,USER_000004,19,Female,Delhi,Delhi,Tier1,Biba Silk Saree,Sarees,Biba,14440,...,Penguin Ultra,Academic,Penguin,540,0.1,Pepe Jeans Traditional Pro Max,Kurtas,Pepe Jeans,6760,0.37
4,USER_000005,39,Female,Tamil Nadu,Chennai,Tier1,Pigeon Platinum,Furniture,Pigeon,42660,...,Fabindia Cotton Kurta,Kurtas,Fabindia,1220,0.36,Lakme Pro Max,Skincare,Lakme,2700,0.27


In [13]:
gpt.head()

Unnamed: 0,user_id,age,gender,state,city,tier,product_1,category_1,brand_1,price_1,...,product_9,category_9,brand_9,price_9,recommendation_score_9,product_10,category_10,brand_10,price_10,recommendation_score_10
0,USER_000001,36,Female,Karnataka,Bangalore,Tier1,Patanjali Snack,Grocery,Patanjali,184.33,...,LG Home Decor,Home,LG,12018.5,0.72,LG Kitchen Appliance,Home,LG,441.51,0.99
1,USER_000002,44,Male,Maharashtra,Mumbai,Tier1,Pigeon Furniture,Home,Pigeon,1265.69,...,Parker Academic Book,Books,Parker,10077.31,0.2,Levi's Ethnic Wear,Fashion,Levi's,1796.45,0.44
2,USER_000003,18,Female,Uttar Pradesh,Kanpur,Tier2,LG Home Decor,Home,LG,890.21,...,Reynolds Children's Book,Books,Reynolds,525.68,0.47,HP Camera,Electronics,HP,2645.0,0.88
3,USER_000004,19,Male,Uttar Pradesh,Kanpur,Tier2,Lenovo Laptop,Electronics,Lenovo,2688.1,...,Apple Tablet,Electronics,Apple,1941.58,0.21,Decathlon Sportswear,Sports,Decathlon,3477.91,0.48
4,USER_000005,34,Male,Maharashtra,Pune,Tier1,OnePlus Smart Watch,Electronics,OnePlus,8000.67,...,H&M Kurta,Fashion,H&M,794.01,0.67,Dell Camera,Electronics,Dell,9277.44,0.11


In [14]:
grok.head()

Unnamed: 0,user_id,age,gender,state,city,tier,product_1,category_1,brand_1,price_1,...,product_9,category_9,brand_9,price_9,recommendation_score_9,product_10,category_10,brand_10,price_10,recommendation_score_10
0,USER_000001,38,Male,Tamil Nadu,Chennai,Tier1,Nike Dress,Fashion,Nike,10170.52,...,Allen Solly Dress,Fashion,Allen Solly,4192.94,0.23,Fabfurnish Bedding,Home,Fabfurnish,2965.9,0.43
1,USER_000002,25,Male,Uttar Pradesh,Lucknow,Tier2,Sony Laptop,Electronics,Sony,77177.65,...,Parker Fiction,Books,Parker,4416.28,0.38,Home Centre Home Decor,Home,Home Centre,834.7,0.52
2,USER_000003,26,Male,Maharashtra,Mumbai,Tier1,Pigeon Home Decor,Home,Pigeon,13968.25,...,Decathlon Gym Equipment,Sports,Decathlon,72568.61,0.82,Nike Men's T-shirt,Fashion,Nike,1752.48,0.55
3,USER_000004,36,Female,Jharkhand,Ranchi,Tier3,IKEA Furniture,Home,IKEA,20.21,...,H&M Dress,Fashion,H&M,4318.56,0.23,Godrej Home Decor,Home,Godrej,4070.33,0.73
4,USER_000005,38,Male,Jharkhand,Ranchi,Tier3,Home Centre Bedding,Home,Home Centre,627.49,...,Penguin Academic,Books,Penguin,13507.18,0.73,IKEA Storage Solution,Home,IKEA,533.56,0.4


## Data Transformations

In [15]:
import pandas as pd
from pathlib import Path

RAW_CSV = Path("grok_recommendation_dataset.csv")
OUT_DIR  = Path("refactored")
OUT_DIR.mkdir(exist_ok=True)

# ----------------------------
# 1. Read raw data
# ----------------------------
df = pd.read_csv(RAW_CSV)

# ----------------------------
# 2. Build the USERS table
# ----------------------------
users = (
    df[["user_id", "age", "gender", "state", "city", "tier"]]
    .drop_duplicates()
    .assign(
        age_group=lambda d: pd.cut(
            d["age"],
            bins=[0, 25, 35, 45, 60, 120],
            labels=["18-25", "26-35", "36-45", "46-60", "60+"],
            right=True,
        )
    )
)
users.to_csv(OUT_DIR / "users.csv", index=False)

# ----------------------------
# 3. Melt product columns → long form
# ----------------------------
long_parts = []
for i in range(1, 11):  # the 10 recommended items per user
    long_parts.append(
        df[
            [
                "user_id",
                f"product_{i}",
                f"category_{i}",
                f"brand_{i}",
                f"price_{i}",
                f"recommendation_score_{i}",
            ]
        ].rename(
            columns={
                f"product_{i}": "product_name",
                f"category_{i}": "category",
                f"brand_{i}": "brand",
                f"price_{i}": "price",
                f"recommendation_score_{i}": "rec_score",
            }
        ).assign(ranking_position=i)
    )

long_df = pd.concat(long_parts, ignore_index=True)

# ----------------------------
# 4. Build the PRODUCTS table
# ----------------------------
products = (
    long_df[["product_name", "category", "brand", "price"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(product_id=lambda d: d.index + 1)  # simple surrogate key
)

products.to_csv(OUT_DIR / "products.csv", index=False)

# ----------------------------
# 5. Build the INTERACTIONS table
# ----------------------------
interactions = (
    long_df.merge(products, on=["product_name", "category", "brand", "price"])
    .loc[:, ["user_id", "product_id", "ranking_position", "rec_score"]]
)

# Optional: add a 'feedback_type' or 'timestamp' column here.
interactions.to_csv(OUT_DIR / "interactions.csv", index=False)

print("✓ Refactoring complete. Files saved to", OUT_DIR)


✓ Refactoring complete. Files saved to refactored
