# **Generate hypothetical YouTube channel growth metrics**

In [None]:
import pandas as pd
import numpy as np 
from datetime import datetime, timedelta

# Lift pandas' display limits so DataFrames are shown with all columns and all rows
# (a value of None disables the corresponding limit).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [3]:
# Seed NumPy's global RNG so the np.random draws made later are reproducible
np.random.seed(42)

# One timestamp per day across six calendar years: 2020-01-01 through 2025-12-31, inclusive
start_date = datetime(year=2020, month=1, day=1)
end_date = datetime(year=2025, month=12, day=31)
date_range = pd.date_range(start_date, end_date, freq='D')

In [4]:
# Number of daily rows to generate (one per entry in date_range)
n_days = date_range.size

# Column dictionary for the DataFrame: the dates plus one all-zero integer
# array per metric, listed in the order the columns should appear.
data = {
    'DATE': date_range,
    **{
        metric: np.zeros(n_days, dtype=int)
        for metric in (
            'SUBSCRIBERS_GAINED',
            'SUBSCRIBERS_LOST',
            'VIEWS',
            'WATCH_HOURS',
            'LIKES',
            'SHARES',
            'COMMENTS',
        )
    },
}

In [5]:
# Build the working DataFrame from the column dictionary
df = pd.DataFrame(data=data)

In [6]:
# Linear growth helper
def generate_growth(start, end, days):
    """Return `days` evenly spaced values running from `start` to `end` (both included)."""
    growth_curve = np.linspace(start, end, num=days)
    return growth_curve

In [7]:
# Baseline linear trend for each metric over the whole period, given as
# (value on the first day, value on the last day). The variable names are the
# lower-cased column names; the noise step below looks them up by that name.
(
    subscribers_gained,
    subscribers_lost,
    views,
    watch_hours,
    likes,
    shares,
    comments,
) = (
    generate_growth(first_day, last_day, n_days)
    for first_day, last_day in (
        (1, 200),      # subscribers_gained
        (0, 50),       # subscribers_lost
        (10, 10000),   # views
        (1, 1000),     # watch_hours
        (0, 500),      # likes
        (0, 100),      # shares
        (0, 50),       # comments
    )
)

In [8]:
# Add randomness and ensure integer values.
# Each column is paired with its baseline trend explicitly instead of resolving
# the variable through eval(col.lower()). The mapping keeps the original column
# order, so the sequence of random draws (and hence the output) is unchanged.
trend_by_column = {
    'SUBSCRIBERS_GAINED': subscribers_gained,
    'SUBSCRIBERS_LOST': subscribers_lost,
    'VIEWS': views,
    'WATCH_HOURS': watch_hours,
    'LIKES': likes,
    'SHARES': shares,
    'COMMENTS': comments,
}
for col, trend in trend_by_column.items():
    random_factor = np.random.normal(1, 0.1, n_days)  # Mean of 1, standard deviation of 0.1
    # astype(int) truncates toward zero; np.maximum clamps any negative result to 0
    df[col] = np.maximum(0, (trend * random_factor).astype(int))

In [10]:
# Weekend boost: rows falling on Saturday/Sunday (dayofweek 5 and 6) get 1.5x
# VIEWS, WATCH_HOURS and LIKES.
# NOTE: this cell scales df in place, so running it more than once compounds the boosts.
weekend_mask = (df['DATE'].dt.dayofweek >= 5)
boosted_cols = ['VIEWS', 'WATCH_HOURS', 'LIKES']
# Cast to float before the masked write. Multiplying by 1.5 produces fractional
# values, and writing those into integer columns through .loc depends on silent
# upcasting, which recent pandas versions deprecate (FutureWarning about setting
# an item of incompatible dtype). The resulting values are the same.
df = df.astype(dict.fromkeys(boosted_cols, float))
df.loc[weekend_mask, boosted_cols] = df.loc[weekend_mask, boosted_cols] * 1.5

# Seasonal variation: one sine cycle per calendar year scales VIEWS by up to +/-20%.
# NOTE(review): a sine that starts at 0 on 1 January peaks about a quarter of the
# way through the year (around early April) and bottoms out around early October,
# so the peak is in spring rather than mid-summer. Shift the phase if a true
# summer peak is wanted.
days_in_year = 366  # Lookup table sized for leap years; the last entry goes unused otherwise
summer_boost = np.sin(np.linspace(0, 2*np.pi, days_in_year))
day_index = df['DATE'].dt.dayofyear.to_numpy() - 1  # dayofyear is 1-based
df['VIEWS'] = df['VIEWS'] * (1 + 0.2 * summer_boost[day_index])

In [11]:
# Occasional viral videos (once every 2 months on average, starting from the second month).
# The number of spikes is derived from the length of the series so the documented
# rate holds for whatever date range is configured; a fixed count drifts away
# from that rate as soon as the range changes.
viral_start_day = 30        # skip the first month
viral_interval_days = 60    # ~2 months between spikes on average
n_viral = max((n_days - viral_start_day) // viral_interval_days, 0)
# Sample distinct row positions; df has the default 0..n_days-1 index, so they are valid .loc labels
viral_days = np.random.choice(range(viral_start_day, n_days), size=n_viral, replace=False)
viral_cols = ['VIEWS', 'LIKES', 'SHARES', 'COMMENTS']
df.loc[viral_days, viral_cols] = df.loc[viral_days, viral_cols] * 5

# Ensure integer values: the boosts above leave fractional floats in some
# columns, so truncate every metric column (everything except DATE) back to int.
for col in df.columns:
    if col != 'DATE':
        df[col] = df[col].astype(int)

In [12]:
# Running subscriber total: cumulative sum of the daily net change (gained minus lost)
net_subscriber_change = df['SUBSCRIBERS_GAINED'].sub(df['SUBSCRIBERS_LOST'])
df['TOTAL_SUBSCRIBERS'] = net_subscriber_change.cumsum()

# Floor every numeric column at zero (this includes the running total computed above)
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].clip(lower=0)

# Write the finished dataset to disk without the index column
df.to_csv('youtube_channel_data.csv', index=False)

In [13]:
# Preview the first five rows of the generated dataset
df.head(n=5)

Unnamed: 0,DATE,SUBSCRIBERS_GAINED,SUBSCRIBERS_LOST,VIEWS,WATCH_HOURS,LIKES,SHARES,COMMENTS,TOTAL_SUBSCRIBERS
0,2020-01-01,1,0,9,0,0,0,0,1
1,2020-01-02,1,0,13,1,0,0,0,2
2,2020-01-03,1,0,17,1,0,0,0,3
3,2020-01-04,1,0,45,2,0,0,0,4
4,2020-01-05,1,0,63,4,0,0,0,5
