<a href="https://colab.research.google.com/github/cbonnin88/EDA_Projects/blob/main/Streamflix_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import polars as pl
import plotly.express as px
import numpy as np
import random

# **Data Cleaning**

In [3]:
# Load the raw signup export. The path is relative, so the CSV must sit in the
# notebook's working directory.
df_streamflix_dirty = pl.read_csv('streamflix_raw_signups.csv')

In [4]:
# Preview the first rows of the raw data to see what needs cleaning
# (padded / mixed-case names, 'mailto:' prefixes on some emails).
df_streamflix_dirty.head()

user_id,full_name_raw,email_raw,plan_type,minutes_watched
str,str,str,str,i64
"""USR-1000""",""" sarah doe ""","""emily0@gmail.com""","""Basic""",860
"""USR-1001""",""" john TAYLOR ""","""mailto:sarah.1@gmail.com""","""Premium""",3772
"""USR-1002""",""" mIkE SMITH ""","""john2@yahoo.com""","""Premium""",3092
"""USR-1003""",""" JANE SMITH ""","""david3@gmail.com""","""Premium""",466
"""USR-1004""",""" sarah doe ""","""mailto:john.4@gmail.com""","""Basic""",4426


In [5]:
# Reference table keyed by plan_type: one row per subscription plan.
# The support-call counts and churn labels are synthetic. They are drawn from a
# seeded generator so that Restart & Run All reproduces the same table (and the
# same downstream figures) on every run.
# NOTE(review): these values are generated per plan, not per user, so once this
# table is joined on plan_type every user on a plan shares one call count and
# one churn label -- confirm that is intended.
rng = np.random.default_rng(42)
ref_data = pl.DataFrame({
    'plan_type':['Basic','Premium','Family'],
    'plan_price':[5,10,25],
    # Upper bound is exclusive, matching the previous np.random.randint(0, 20).
    'customer_support_calls': rng.integers(0, 20, size=3),
    'churned': rng.choice(['Churned','Retained'], size=3)
})

In [6]:
# Cleaning pipeline: normalise names and emails, split the user id, derive a
# greeting, attach plan reference data and compute yearly revenue.
df_streamflix_clean = (
    df_streamflix_dirty
    # Stage 1: columns derived directly from the raw fields.
    .with_columns(
        clean_name=(
            pl.col('full_name_raw')
            .str.replace_all(r'\n', '')     # drop newline characters
            .str.strip_chars()              # trim leading/trailing whitespace
            .str.replace_all(r'\s+', ' ')   # collapse inner whitespace runs to one space
            .str.to_titlecase()             # "mIkE SMITH" -> "Mike Smith"
        ),
        # Only the first 'mailto:' occurrence is removed (str.replace, not replace_all).
        clean_email=pl.col('email_raw').str.replace('mailto:', ''),
        # First three characters of the id, and its last four characters.
        id_prefix=pl.col('user_id').str.slice(0, 3),
        id_number=pl.col('user_id').str.slice(-4),
    )
    # Stage 2: needs clean_name from stage 1, hence a separate with_columns call.
    .with_columns(
        first_name=pl.col('clean_name').str.split(' ').list.get(0),
    )
    # Stage 3: needs first_name from stage 2.
    .with_columns(
        greeting_message=pl.format('Hello {}, thanks for joining', pl.col('first_name')),
    )
    # Left join keeps every signup row, whether or not its plan_type matches ref_data.
    .join(ref_data, on='plan_type', how='left')
    # Monthly plan price scaled to twelve months.
    .with_columns(
        annual_revenue=pl.col('plan_price') * 12,
    )
)

In [7]:
# Keep only the report columns, in report order; helper columns created during
# cleaning (id parts, first name, greeting) are dropped here.
df_streamflix_clean = df_streamflix_clean.select(
    'user_id', 'clean_name', 'clean_email',            # identity
    'plan_type', 'minutes_watched',                    # usage
    'plan_price', 'annual_revenue',                    # revenue
    'customer_support_calls', 'churned',               # support / retention
)

display(df_streamflix_clean.head())

user_id,clean_name,clean_email,plan_type,minutes_watched,plan_price,annual_revenue,customer_support_calls,churned
str,str,str,str,i64,i64,i64,i64,str
"""USR-1000""","""Sarah Doe""","""emily0@gmail.com""","""Basic""",860,5,60,4,"""Retained"""
"""USR-1001""","""John Taylor""","""sarah.1@gmail.com""","""Premium""",3772,10,120,5,"""Churned"""
"""USR-1002""","""Mike Smith""","""john2@yahoo.com""","""Premium""",3092,10,120,5,"""Churned"""
"""USR-1003""","""Jane Smith""","""david3@gmail.com""","""Premium""",466,10,120,5,"""Churned"""
"""USR-1004""","""Sarah Doe""","""john.4@gmail.com""","""Basic""",4426,5,60,4,"""Retained"""


In [8]:
# Persist the cleaned, column-trimmed frame as CSV in the working directory.
# This runs before engagement_score is added below, so that column is not part
# of the exported file.
df_streamflix_clean.write_csv('streamflix_final_report.csv')

# **EDA with Plotly**

In [9]:
# Product metric -- engagement score:
#   minutes watched scaled down by 100, minus 2 points per support call.
# A negative score means the support-call penalty outweighs viewing time.
df_streamflix_clean = df_streamflix_clean.with_columns(
    engagement_score=(
        pl.col('minutes_watched') / 100
        - pl.col('customer_support_calls') * 2
    )
)

In [10]:
# Preview the frame with the new engagement_score column appended.
df_streamflix_clean.head()

user_id,clean_name,clean_email,plan_type,minutes_watched,plan_price,annual_revenue,customer_support_calls,churned,engagement_score
str,str,str,str,i64,i64,i64,i64,str,f64
"""USR-1000""","""Sarah Doe""","""emily0@gmail.com""","""Basic""",860,5,60,4,"""Retained""",0.6
"""USR-1001""","""John Taylor""","""sarah.1@gmail.com""","""Premium""",3772,10,120,5,"""Churned""",27.72
"""USR-1002""","""Mike Smith""","""john2@yahoo.com""","""Premium""",3092,10,120,5,"""Churned""",20.92
"""USR-1003""","""Jane Smith""","""david3@gmail.com""","""Premium""",466,10,120,5,"""Churned""",-5.34
"""USR-1004""","""Sarah Doe""","""john.4@gmail.com""","""Basic""",4426,5,60,4,"""Retained""",36.26


# **The 'Churn' Scatter Plot**

In [16]:
# Scatter of viewing time against support calls, coloured by churn status.
fig_churn = px.scatter(
    data_frame=df_streamflix_clean.to_pandas(),  # plotly receives a pandas copy of the frame
    title='StreamFlix: Impact of Support Calls on Churn',
    x='minutes_watched',
    y='customer_support_calls',
    color='churned',
    # Fixed colours so churned users always read as red.
    color_discrete_map={'Churned':'red','Retained':'blue'},
    # Semi-transparent markers keep overlapping points visible.
    opacity=0.6,
)

# Human-readable axis titles in place of the raw column names.
fig_churn.update_layout(
    xaxis_title='Minutes Watched',
    yaxis_title='Customer Support Calls',
)
fig_churn.show()

# **The "Engagement" Box Plot**

**The Business Question:** "Which subscription plan has the healthiest engagement score? Are Family plan users actually watching more?"

In [12]:
# One box per subscription plan showing the spread of engagement scores.
fig_engagement = px.box(
    data_frame=df_streamflix_clean.to_pandas(),  # plotly receives a pandas copy of the frame
    title='Engagement Score Distribution by Plan',
    x='plan_type',
    y='engagement_score',
    color='plan_type',
    # Draw every individual observation next to its box, not just outliers.
    points='all',
)

# Human-readable axis titles in place of the raw column names.
fig_engagement.update_layout(
    xaxis_title='Subscription Plan',
    yaxis_title='Engagement Score',
)
fig_engagement.show()

# **The Correlation Heatmap**

**The Business Question:** "What metrics are actually related to each other?"

In [13]:
# 1. Keep just the numeric metrics that go into the correlation matrix.
#    engagement_score is computed from the other two columns, so some
#    correlation with them is expected by construction.
numeric_df = df_streamflix_clean.select(
    'minutes_watched',
    'customer_support_calls',
    'engagement_score',
)

In [14]:
# 2. Calculate the pairwise correlation matrix of the numeric columns.
#    The result has one column per input column but carries no row labels,
#    which is why the heatmap cell supplies the axis labels itself.
corr_matrix = numeric_df.corr()

In [15]:
# 3. Render the correlation matrix as an annotated heatmap.
fig_corr = px.imshow(
    img=corr_matrix.to_numpy(),
    title='Correlation Heatmap',
    # The matrix is passed as a bare array, so both axes are labelled with the
    # metric names taken from the frame it was computed from.
    x=numeric_df.columns,
    y=numeric_df.columns,
    # Print each coefficient inside its cell.
    text_auto=True,
    color_continuous_scale='Viridis',
)

fig_corr.show()