<a href="https://colab.research.google.com/github/cbonnin88/StreamZone-TV/blob/main/Premium_lift_StreamingTV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import polars as pl
import numpy as np
import plotly.express as px
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# **Creating the Dataset**

In [2]:
# Seed NumPy's global (legacy) random state so the draws made later via np.random.* are repeatable.
np.random.seed(seed=42)

In [3]:
def generate_streaming_data(n_users=10000):
  """Simulate a table of streaming users with a premium-conversion label.

  All random draws come from NumPy's global random state (``np.random``), so
  the result depends on whatever seed/state is active when this is called.

  Args:
    n_users: Number of synthetic users (rows) to generate.

  Returns:
    A Polars DataFrame with the columns ``user_id``, ``age``, ``device_type``,
    ``weekly_watch_hours``, ``replay_count`` and ``converted_to_premium``.
  """
  device_labels = ['Mobile','Tablet','Smart TV','Web']
  device_weights = [0.4,0.1,0.3,0.2]

  # Sequential ids starting at 1.
  ids = np.arange(1, n_users + 1)

  # Feature draws. The order of these calls determines which random numbers
  # each feature receives, so it must not be rearranged.
  age = np.random.randint(18, 70, size=n_users)  # upper bound is exclusive
  device = np.random.choice(device_labels, size=n_users, p=device_weights)
  hours = np.round(np.random.exponential(scale=5, size=n_users), 1)
  replays = np.random.poisson(lam=2, size=n_users)

  # Conversion probability: a 5% base, plus a contribution from watch time
  # and replays, plus a flat 0.15 uplift for Smart TV users, clipped to [0, 1].
  p_convert = 0.05 + (hours / 40) + (replays / 20)
  is_smart_tv = device == 'Smart TV'
  p_convert = np.where(is_smart_tv, p_convert + 0.15, p_convert)
  p_convert = p_convert.clip(0, 1)

  # One Bernoulli draw per user.
  converted = np.random.binomial(1, p_convert)

  # Assemble the Polars DataFrame
  columns = {
      'user_id': ids,
      'age': age,
      'device_type': device,
      'weekly_watch_hours': hours,
      'replay_count': replays,
      'converted_to_premium': converted
  }
  return pl.DataFrame(columns)

In [4]:
# Build the synthetic user table at its default size and preview the first rows.
df_streaming = generate_streaming_data(n_users=10000)
print('Polars Dataset Created!')
display(df_streaming.head(n=5))

Polars Dataset Created!


user_id,age,device_type,weekly_watch_hours,replay_count,converted_to_premium
i64,i64,str,f64,i64,i64
1,56,"""Smart TV""",25.6,1,1
2,69,"""Mobile""",3.9,3,1
3,46,"""Mobile""",6.2,1,0
4,32,"""Web""",0.3,3,0
5,60,"""Mobile""",5.0,2,0


# **Product EDA**

# Calculating the Conversion rate per device type

In [8]:
# Per-device premium conversion rate and mean weekly watch time,
# ordered from the best-converting device to the worst.
device_stats = (
    df_streaming
    .group_by('device_type')
    .agg(
        conversion_rate=pl.col('converted_to_premium').mean(),
        avg_watch_time=pl.col('weekly_watch_hours').mean(),
    )
    .sort(by='conversion_rate', descending=True)
)

display(device_stats)

device_type,conversion_rate,avg_watch_time
str,f64,f64
"""Smart TV""",0.422405,5.17
"""Web""",0.283217,4.99
"""Mobile""",0.271823,5.16
"""Tablet""",0.261815,5.08


In [12]:
# Bar chart of conversion rate per device; the bar colour encodes average watch time.
fig_streaming = px.bar(
    data_frame=device_stats,
    x='device_type',
    y='conversion_rate',
    color='avg_watch_time',
    text_auto='.1%',
    labels={'device_type':'Devices','conversion_rate':'Premium Conversion Rate'},
    title='Conversion Rate by Device (Color = Avg Watch Time)',
)

fig_streaming.show()

# **Building the ML Pipeline**

In [13]:
# Split the frame into model inputs (X_pl) and the label column (y_pl).
# Both are still Polars DataFrames at this stage.
X_pl = df_streaming.select(
    pl.col('age'),
    pl.col('device_type'),
    pl.col('weekly_watch_hours'),
    pl.col('replay_count'),
)
y_pl = df_streaming.select(pl.col('converted_to_premium'))

In [14]:
# Hand the data to scikit-learn as pandas / NumPy objects.
X = X_pl.to_pandas()
# y_pl is a one-column frame; flatten it into a 1-D label array.
y = y_pl.to_pandas().to_numpy().ravel()

In [15]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Defining the Pipeline
# 'device_type' is the only categorical input: it is one-hot encoded, and
# categories not seen during fit are encoded as all zeros (handle_unknown='ignore').
categorical_features = ['device_type']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Every column not listed in a transformer is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

# Step names are part of the pipeline's API (model.named_steps[...] and
# '<step>__<param>' keys), so the classifier step is spelled correctly here
# (it was previously 'claissifier').
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [17]:
# Fit the preprocessing + classifier pipeline on the training split.
model.fit(X=X_train, y=y_train)
print('Model Trained')

Model Trained


# **Product Action**

In [20]:
# Three hypothetical users to score, one list entry per user in each column.
new_users_pl = pl.DataFrame(
    data={
        'age': [25, 45, 30],
        'device_type': ['Mobile', 'Smart TV', 'Web'],
        'weekly_watch_hours': [2.5, 15.0, 1.0],
        'replay_count': [0, 8, 0],
    }
)

display(new_users_pl)

age,device_type,weekly_watch_hours,replay_count
i64,str,f64,i64
25,"""Mobile""",2.5,0
45,"""Smart TV""",15.0,8
30,"""Web""",1.0,0


In [21]:
# Predicting Probabilities
# The pipeline was fitted on pandas input, so convert temporarily before scoring.
new_users_pd = new_users_pl.to_pandas()
# Keep only column 1 of predict_proba: the probability of the positive class.
probs = model.predict_proba(new_users_pd)[:, 1]

In [23]:
# Attach each user's score to the Polars frame, then map it to a product action:
#   score > 0.7         -> show the premium checkout
#   0.4 < score <= 0.7  -> send a 10% discount
#   anything else       -> keep the user on the free tier
scored_users = new_users_pl.with_columns(pl.Series('propensity_score', probs))
final_df = scored_users.with_columns(
    recommended_action=(
        pl.when(pl.col('propensity_score') > 0.7)
        .then(pl.lit('High Intent: Show Premium Checkout'))
        .when(pl.col('propensity_score') > 0.4)
        .then(pl.lit('Medium Intent: Send 10% Discount'))
        .otherwise(pl.lit('Low Intent: Keep Free'))
    )
)
print('--- Final Product Decisions (Polars)')
display(final_df)

--- Final Product Decisions (Polars)


age,device_type,weekly_watch_hours,replay_count,propensity_score,recommended_action
i64,str,f64,i64,f64,str
25,"""Mobile""",2.5,0,0.4,"""Low Intent: Keep Free"""
45,"""Smart TV""",15.0,8,0.77,"""High Intent: Show Premium Chec…"
30,"""Web""",1.0,0,0.28,"""Low Intent: Keep Free"""


# **Deploying the Model**

In [25]:
# Saving the Model
# File name the fitted pipeline is written to by the joblib.dump call below.
model_filename = 'StreamZone_premium_prediction_v1.pkl'

In [26]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(value=model, filename=model_filename)

['StreamZone_premium_prediction_v1.pkl']

In [27]:
# Confirm where the model was written.
print('Model saved as', model_filename)

Model saved as StreamZone_premium_prediction_v1.pkl


# **Loading and Serving the Product Script**

In [28]:
# Loading the model from the file
# NOTE: joblib.load unpickles the file's contents; only load files you trust.
loaded_pipeline = joblib.load(filename='StreamZone_premium_prediction_v1.pkl')
print('Model Loaded from disk')

Model Loaded from disk


In [30]:
# A new user opens the app (live data): a single-row frame with the model's input columns.
live_user_data = pl.DataFrame(
    data={
        'age': [37],
        'device_type': ['Smart TV'],
        'weekly_watch_hours': [13.7],
        'replay_count': [6],
    }
)

display(live_user_data)

age,device_type,weekly_watch_hours,replay_count
i64,str,f64,i64
37,"""Smart TV""",13.7,6


In [31]:
# Pre-processing bridge (Polars -> Pandas)
# The loaded pipeline is given a pandas DataFrame, so convert the live row first.
input_data = live_user_data.to_pandas()

In [32]:
# Generating the prediction
# Row 0 is the single live user; column 1 is the positive-class probability.
probability = loaded_pipeline.predict_proba(input_data)[0][1]

In [33]:
# Real-time Decision
# The trial is offered only when the score is strictly above the threshold.
threshold = 0.6
if probability > threshold:
  decision = 'Offer Free Trial'
else:
  decision = 'Standard Ad Experience'

In [34]:
# Report the device, score and resulting decision for the live user.
# The first f-string is double-quoted because it contains a single-quoted
# subscript; reusing the same quote type inside an f-string is a SyntaxError
# before Python 3.12.
print(f"\nUser Device: {live_user_data['device_type'][0]}")
print(f'Propensity Score: {probability:.2f}')
print(f'Server Decision: {decision}')


User Device: Smart TV
Propensity Score: 0.52
Server Decision: Standard Ad Experience


# **Batch Marketing Simulation**

In [43]:
# Generating 5000 Fresh Leads
# Re-seed the global RNG with a different seed than the one used at the top of the notebook.
n_leads = 5000
np.random.seed(seed=99)

In [44]:
# Draw the lead features one column at a time. The order of the np.random
# calls fixes which random numbers each column receives, so keep it as is.
lead_ids = np.arange(20000, 20000 + n_leads)
lead_ages = np.random.randint(18, 70, size=n_leads)
lead_devices = np.random.choice(
    ['Mobile','Tablet','Smart TV','Web'], size=n_leads, p=[0.4,0.1,0.3,0.2]
)
lead_hours = np.round(np.random.exponential(scale=4.5, size=n_leads), 1)
lead_replays = np.random.poisson(lam=1.5, size=n_leads)

# Unlabelled leads: same feature columns as the training data plus a user_id.
leads_df = pl.DataFrame({
    'user_id': lead_ids,
    'age': lead_ages,
    'device_type': lead_devices,
    'weekly_watch_hours': lead_hours,
    'replay_count': lead_replays
})

display(leads_df)

user_id,age,device_type,weekly_watch_hours,replay_count
i64,i64,str,f64,i64
20000,19,"""Smart TV""",8.9,1
20001,53,"""Tablet""",1.7,2
20002,58,"""Mobile""",1.6,1
20003,27,"""Mobile""",4.0,1
20004,58,"""Smart TV""",0.7,2
…,…,…,…,…
24995,39,"""Mobile""",4.7,3
24996,54,"""Mobile""",0.3,2
24997,62,"""Tablet""",8.4,2
24998,48,"""Smart TV""",11.1,1


In [45]:
# Saving to CSV (this simulates a file I could receive from the Data Team)
leads_df.write_csv('marketing_leads.csv')
# Report the actual number of rows written instead of a hard-coded count.
print(f'marketing_leads.csv has been created with {leads_df.height} users')

marketing_leads.csv has been created with 5000 users


In [63]:
def run_batch_prediction(input_csv_path, model_path, output_csv_path, threshold=0.6):
  """Score a CSV of leads with a saved model and export the high-propensity rows.

  Args:
    input_csv_path: Path of the CSV file holding the leads to score. Columns
      the model was not trained on (e.g. an id column) are kept in the output
      but are not passed to the model.
    model_path: Path of a joblib file containing a fitted estimator that
      provides ``predict_proba``.
    output_csv_path: Path the filtered, sorted result is written to.
    threshold: Rows are kept only when their score is strictly greater than
      this value. Defaults to 0.6 (the marketing cutoff).

  Raises:
    ValueError: If the model records its training feature names and the CSV
      is missing any of them.
  """
  print(f'Starting batch job for {input_csv_path}...')

  # 1. Load Data
  df_marketing = pl.read_csv(input_csv_path)

  # 2. Load Model
  # NOTE: joblib.load unpickles the file's contents; only load trusted files.
  model = joblib.load(model_path)

  # 3. Predict
  # Give the model exactly the columns it was fitted on, in that order,
  # instead of relying on it to ignore extras such as 'user_id'.
  feature_names = getattr(model, 'feature_names_in_', None)
  if feature_names is None:
    # The model does not record feature names: pass the frame through as-is.
    features_pl = df_marketing
  else:
    feature_names = [str(name) for name in feature_names]
    missing = [name for name in feature_names if name not in df_marketing.columns]
    if missing:
      raise ValueError(f'{input_csv_path} is missing model feature columns: {missing}')
    features_pl = df_marketing.select(feature_names)

  # Using pandas for the scikit-learn call; column 1 is the positive class.
  probs = model.predict_proba(features_pl.to_pandas())[:, 1]

  # 4. Add Score & Filter
  result_df = (
      df_marketing
      .with_columns(pl.Series(name='propensity_score',values=probs))
      .filter(pl.col('propensity_score') > threshold) # The Marketing Cutoff
      .sort('propensity_score', descending=True)
  )
  # 5. Save to CSV
  result_df.write_csv(output_csv_path)

  print(f'Done! Found {result_df.height} high-value targets out of {df_marketing.height}')
  print(f'Saved to {output_csv_path}')

In [64]:
# Score the simulated leads file with the saved model and export the targets.
run_batch_prediction(
    input_csv_path='marketing_leads.csv',
    model_path='StreamZone_premium_prediction_v1.pkl',
    output_csv_path='high_value_targets.csv',
)

Starting batch job for marketing_leads.csv...
Done! Found 615 high-value targets out of 5000
Saved to high_value_targets.csv


In [66]:
# Verifying the result: re-read the exported file and show its first rows.
print('\n--- Preview of High Value Targets ---')
high_value_preview = pl.read_csv('high_value_targets.csv').head()
display(high_value_preview)


--- Preview of High Value Targets ---


user_id,age,device_type,weekly_watch_hours,replay_count,propensity_score
i64,i64,str,f64,i64,f64
24752,45,"""Smart TV""",30.3,0,1.0
20248,43,"""Smart TV""",21.6,4,0.99
24763,46,"""Smart TV""",24.6,3,0.99
24676,25,"""Smart TV""",0.6,2,0.98
24957,26,"""Smart TV""",6.4,1,0.98


# **Revenue Analysis**

In [67]:
# Load the filtered targets written by the batch job.
df_targets = pl.read_csv(source='high_value_targets.csv')

In [68]:
# Defining Economics
# Price multiplied against scores and row counts in the revenue forecast below.
subscription_price = 9.99 # Monthly price of one subscription

In [70]:
# Campaign metrics over the targeted users, returned as a single-row frame:
#   num_targets            - number of targeted users
#   avg_confidence         - mean propensity score
#   expected_revenue       - risk-adjusted revenue, sum of (score * price)
#   max_potential_revenue  - revenue if every target converted (count * price)
revenue_stats = df_targets.select(
    num_targets=pl.len(),
    avg_confidence=pl.col('propensity_score').mean(),
    expected_revenue=(pl.col('propensity_score') * subscription_price).sum(),
    max_potential_revenue=pl.len() * subscription_price,
)

In [71]:
# Show the one-row forecast table computed above under a plain-text heading.
print('--- Campaign Forecast ---')
display(revenue_stats)

--- Campaign Forecast ---


num_targets,avg_confidence,expected_revenue,max_potential_revenue
u32,f64,f64,f64
615,0.733258,4505.026654,6143.85


In [73]:
# Histogram of the targeted users' propensity scores.
marketing_revenue = px.histogram(
    data_frame=df_targets.to_pandas(),
    x='propensity_score',
    nbins=20,
    color_discrete_sequence=['#00CC96'],
    labels={'propensity_score':'Propensity Score (Probability)'},
    title='Distribution of High-Value Users',
)

# Dashed red reference line at 0.6, labelled as the selection cutoff.
marketing_revenue.add_vline(
    x=0.6,
    line_dash='dash',
    line_color='red',
    annotation_text='Selection Cutoff',
)

marketing_revenue.show()