<a href="https://colab.research.google.com/github/cbonnin88/VifStream/blob/main/VifStream_data_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faker

Collecting faker
  Downloading faker-40.1.2-py3-none-any.whl.metadata (16 kB)
Downloading faker-40.1.2-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m1.1/2.0 MB[0m [31m34.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-40.1.2


In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

# **Configuration**

In [None]:
# Dataset configuration: locale, table sizes, and random seeds.
SEED = 42  # single seed shared by every random source used in this notebook

fake = Faker(['fr_FR'])  # French locale, just to give it a French vibe
num_users = 50000  # 50k unique users
num_events = 1000000  # 1 million behavioral events
num_subs = 15000  # 15k subscription transactions

# Seed all generators so a fresh "Restart & Run All" draws the same random
# sequence. Dates expressed relative to 'today'/'now' in later cells still
# shift with the day the notebook is run.
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

print('Generating VifStream Dataset')

Generating VifStream Dataset


# **User Data**

In [None]:
# User dimension table: one row per user, with a sequential ID, a signup date
# drawn from the last two years, and independently sampled categorical
# attributes (plan, region, device, age group).
#
# Region labels are spelled correctly so that group-bys and joins against
# reference geography data are not polluted by typos.
region_names = [
    'Île-de-France',
    'Hauts-de-France',
    'Occitania',
    "Provence-Alpes-Côte d'Azur",
    'Nouvelle-Aquitaine',
    'Pays de la Loire',
    'Brittany',
    'Normandy',
    'Centre-Val de Loire',
    'Grand Est',
    'Bourgogne-Franche-Comté',
    'Auvergne-Rhône-Alpes',
]

users = pd.DataFrame({
    'user_id': np.arange(1, num_users + 1),
    'signup_date': [fake.date_between(start_date='-2y', end_date='today') for _ in range(num_users)],
    # 70% free / 20% basic / 10% premium
    'plan_type': np.random.choice(['free', 'basic', 'premium'], num_users, p=[0.7, 0.2, 0.1]),
    # Regions are sampled uniformly (no weights given)
    'region': np.random.choice(region_names, num_users),
    'device': np.random.choice(['Mobile', 'Web', 'SmartTV', 'AppleTV'], num_users, p=[0.5, 0.1, 0.2, 0.2]),
    # Age groups are sampled uniformly (no weights given)
    'age_group': np.random.choice(['18-24', '25-34', '35-44', '45+'], num_users)
})

# **Events Data**

In [None]:
# Assign every event to a user: draw num_events IDs from 1..num_users
# (randint's upper bound is exclusive, hence the + 1).
event_user_ids = np.random.randint(low=1, high=num_users + 1, size=num_events)

In [None]:
# Event catalogue paired with the sampling probability of each event type.
# Keeping name and weight side by side makes it hard to misalign the two lists.
event_types, event_p = map(list, zip(
    ('app_open', 0.2),
    ('search', 0.15),
    ('channel_tune_in', 0.5),
    ('paywall_view', 0.1),
    ('add_to_favorites', 0.05),
))

In [None]:
# Behavioral events table: one row per event, attributed to the pre-drawn
# user IDs, with an event type sampled according to event_p and a timestamp
# drawn from the last year.
# NOTE(review): timestamps are drawn independently of each user's signup_date
# (signups span two years, events one), so an event can precede the signup.
events = pd.DataFrame({
    'event_id': np.arange(1, num_events + 1),
    'user_id': event_user_ids,
    'event_name': np.random.choice(event_types, num_events, p=event_p),
    'timestamp': [fake.date_time_between(start_date='-1y', end_date='now') for _ in range(num_events)],
    # Six-digit session IDs. randint excludes the upper bound, so 1_000_000
    # is needed for 999999 to be a possible value.
    'session_id': np.random.randint(100000, 1000000, size=num_events)
})

# **Subscriptions Data**

In [None]:
# Transactions are limited to paying customers: collect the IDs of every user
# whose plan is anything other than 'free', then sample num_subs of them
# (np.random.choice samples with replacement by default, so a user may appear
# several times).
is_paying = users['plan_type'].ne('free')
paying_users = users.loc[is_paying, 'user_id'].to_numpy()
sub_user_ids = np.random.choice(paying_users, size=num_subs)

In [None]:
# Subscription transactions table: one row per payment attempt.
#
# The charged amount is derived from the paying user's plan rather than drawn
# at random, so a 'basic' user is always billed the standard price and a
# 'premium' user the premium price.
# NOTE(review): transaction_date is drawn independently of signup_date
# (signups span two years, transactions one), so a payment can precede the
# user's signup.
plan_prices = {'basic': 9.99, 'premium': 15.99}  # Standard and Premium prices
plan_by_user = users.set_index('user_id')['plan_type']

subscriptions = pd.DataFrame({
    'transaction_id': np.arange(1, num_subs + 1),
    'user_id': sub_user_ids,
    # Look up each transaction's user plan, then translate it to a price
    'amount': plan_by_user.loc[sub_user_ids].map(plan_prices).to_numpy(),
    'status': np.random.choice(['completed', 'failed', 'refunded'], num_subs, p=[0.9, 0.08, 0.02]),
    'payment_method': np.random.choice(['Visa', 'Mastercard', 'PayPal', 'Apple Pay'], num_subs),
    'transaction_date': [fake.date_time_between(start_date='-1y', end_date='now') for _ in range(num_subs)]
})

# **Saving to CSV**

In [None]:
# Write each raw table to CSV (without the index column) and report the result.
# The summary is printed from the same mapping used for writing, so the
# reported file names always match the files actually saved.
output_tables = {
    'vifstream_raw_users.csv': users,
    'vifstream_raw_events.csv': events,
    'vifstream_raw_subscriptions.csv': subscriptions,
}

for csv_name, table in output_tables.items():
    table.to_csv(csv_name, index=False)

print("✅ Success! Files saved:")
for csv_name, table in output_tables.items():
    print(f"- {csv_name} ({len(table)} rows)")

✅ Success! Files saved:
- vfstream_users.csv (50000 rows)
- vfstream_events.csv (1000000 rows)
- vfstream_subscriptions.csv (15000 rows)
