# Causal Inference: Do Creative Features Affect CTR?

In [46]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from econml.dml import LinearDML

In [47]:
# Load dataset
df = pd.read_csv('../../data/datasets/tiktok_ads_us_toplikes.csv')

In [58]:
df.head()

Unnamed: 0,ad_title,brand_name,cost,ctr,favorite,id,is_search,like,tag,detail_analysis,...,industry_parent.value,objective_id,objective_value,creative_features_creative_theme,creative_features_creative_concept,creative_features_format_production_style,creative_features_talent_type,creative_features_demographic_representation,creative_features_audience_focus,creative_features_campaign_objective
0,Great time killer!,Survival Game Master,2,0.01,False,7132878852058906625,True,38213,3.0,This ad is using Product Review to catch audie...,...,News & Entertainment,2,App Installs,Promotional & Offer-Based,Product demo,Native Video,Influencers,Primarily Male,Unaware Audience,App Promotion
1,Oddly satisfying game,Gameworld Master,2,0.01,False,7109275920046178305,True,14299,3.0,This ad is using Strategy Focused to catch aud...,...,News & Entertainment,2,App Installs,Product-Centric,Product demo,Native Video,,No People Featured,Unaware Audience,Awareness
2,My friend recommended me to play this game,Sugarcane Inc. Empire Tycoon,0,0.02,False,7077499601561305089,True,2427,unknown,,...,News & Entertainment,2,App Installs,Educational & Explainer,Product demo,Animation & Motion Graphics,,No People Featured,Unaware Audience,Awareness
3,download now,"Smart VPN - Fast, Stable",2,0.02,False,7176836203668783106,True,124771,3.0,This ad is using Oddly Satisfying to catch aud...,...,News & Entertainment,2,App Installs,Lifestyle & Aspirational,Before-and-after story,Animation & Motion Graphics,,Primarily Female,Unclear,Engagement
4,Play ten minutes a day to relieve stress!,Sugarcane Factory 3D,0,0.02,False,7077499507373899777,True,2739,unknown,,...,News & Entertainment,2,App Installs,Product-Centric,Product demo,Animation & Motion Graphics,,No People Featured,Unaware Audience,App Promotion


In [59]:
# Define outcome, treatment, and control features
outcome = 'ctr'
creative_features = [
    'creative_features_creative_theme',
    'creative_features_creative_concept',
    'creative_features_format_production_style',
    'creative_features_talent_type',
    'creative_features_demographic_representation',
    'creative_features_audience_focus',
    'creative_features_campaign_objective'
]
control_features = ['cost', 'industry_parent.value', 'objective_value']
df_model = df[[outcome] + creative_features + control_features].copy()
df_model = df_model[df_model[outcome].notna()]

In [60]:
df_model.head()

Unnamed: 0,ctr,creative_features_creative_theme,creative_features_creative_concept,creative_features_format_production_style,creative_features_talent_type,creative_features_demographic_representation,creative_features_audience_focus,creative_features_campaign_objective,cost,industry_parent.value,objective_value
0,0.01,Promotional & Offer-Based,Product demo,Native Video,Influencers,Primarily Male,Unaware Audience,App Promotion,2,News & Entertainment,App Installs
1,0.01,Product-Centric,Product demo,Native Video,,No People Featured,Unaware Audience,Awareness,2,News & Entertainment,App Installs
2,0.02,Educational & Explainer,Product demo,Animation & Motion Graphics,,No People Featured,Unaware Audience,Awareness,0,News & Entertainment,App Installs
3,0.02,Lifestyle & Aspirational,Before-and-after story,Animation & Motion Graphics,,Primarily Female,Unclear,Engagement,2,News & Entertainment,App Installs
4,0.02,Product-Centric,Product demo,Animation & Motion Graphics,,No People Featured,Unaware Audience,App Promotion,0,News & Entertainment,App Installs


In [51]:
# Preprocessing
categorical_features = creative_features + ['industry_parent.value', 'objective_value']
numerical_features = ['cost']

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # mode for categorical
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))  # median for numeric
])

preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features)
])

In [52]:
# Transform data
X = preprocessor.fit_transform(df_model.drop(columns=[outcome]))
y = df_model[outcome].values

In [53]:
X.shape, y.shape

((582, 91), (582,))

In [63]:
# Extract treatment and control matrices robustly
onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
encoded_feature_names = onehot.get_feature_names_out(categorical_features)

# Find indices of creative_features in categorical_features
creative_indices = [categorical_features.index(f) for f in creative_features]

# Calculate number of one-hot columns for treatments
num_treatments = sum([len(onehot.categories_[i]) for i in creative_indices])

T = X[:, :num_treatments]
W = X[:, num_treatments:]

In [64]:
T.shape, W.shape

((582, 66), (582, 25))

In [65]:
missing = [col for col in creative_features if col not in df_model.columns]
print("Missing creative_features columns:", missing)

Missing creative_features columns: []


In [66]:
# Remove rows with missing values in outcome, treatment, or controls
required_cols = [outcome, treatment_col] + [col for col in df_model.columns if col not in [outcome, treatment_col]]
df_model_clean = df_model.dropna(subset=required_cols)

y = df_model_clean[outcome].values
T = df_model_clean[[treatment_col]].values
W = df_model_clean[[col for col in df_model_clean.columns if col not in [outcome, treatment_col]]].values

# Fit Linear DML
model_y = RandomForestRegressor(n_estimators=100, min_samples_leaf=10)
model_t = RandomForestClassifier(n_estimators=100, min_samples_leaf=10)
dml = LinearDML(model_y=model_y, model_t=model_t, discrete_treatment=True, cv=3, random_state=0)

dml.fit(Y=y, T=T, X=W)

The least populated class in y has only 2 members, which is less than n_splits=3.


ValueError: could not convert string to float: 'Product demo'

In [None]:
# Summarize effects
import matplotlib.pyplot as plt
effects = pd.DataFrame({
    'treatment': encoded_feature_names[:num_treatments],
    'coef': dml.coef_,
    'stderr': dml.stderr_,
    'ci_lower': dml.coef_ - 1.96 * dml.stderr_,
    'ci_upper': dml.coef_ + 1.96 * dml.stderr_
})
effects.sort_values(by='coef', ascending=False).head(10)