In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [32]:
# Load the cleaned marketing-campaign dataset directly from the project's GitHub repo.
url = (
    "https://raw.githubusercontent.com/annabiloshevska/marketing_campaign_performance/"
    "refs/heads/master/data/processed/marketing_campaign_clean.csv"
)
df = pd.read_csv(url)

# Preview the first rows as the cell's rich output.
df.head()

Unnamed: 0,Campaign_ID,Company,Campaign_Type,Target_Audience,Duration,Channel_Used,Conversion_Rate,Acquisition_Cost,ROI,Location,Language,Clicks,Impressions,Engagement_Score,Customer_Segment,Date,Duration_num,CTR,CPC,CPM
0,1,Innovate Industries,Email,Men 18-24,30 days,Google Ads,0.04,16174.0,6.29,Chicago,Spanish,506,1922,6,Health & Wellness,2021-01-01,30.0,0.263267,31.964427,8415.192508
1,2,NexGen Systems,Email,Women 35-44,60 days,Google Ads,0.12,11566.0,5.61,New York,German,116,7523,7,Fashionistas,2021-01-02,60.0,0.015419,99.706897,1537.418583
2,3,Alpha Innovations,Influencer,Men 25-34,30 days,YouTube,0.07,10200.0,7.18,Los Angeles,French,584,7698,1,Outdoor Adventurers,2021-01-03,30.0,0.075864,17.465753,1325.019486
3,4,DataTech Solutions,Display,All Ages,60 days,YouTube,0.11,12724.0,5.55,Miami,Mandarin,217,1820,7,Health & Wellness,2021-01-04,60.0,0.119231,58.635945,6991.208791
4,5,NexGen Systems,Email,Men 25-34,15 days,YouTube,0.05,16452.0,6.5,Los Angeles,Mandarin,379,4201,3,Health & Wellness,2021-01-05,15.0,0.090217,43.408971,3916.210426


In [33]:
# Features and target
# Predictors: engagement score, acquisition cost, marketing channel (still text
# at this point) and numeric campaign duration. The value to predict is ROI.
feature_cols = ["Engagement_Score", "Acquisition_Cost", "Channel_Used", "Duration_num"]

y = df["ROI"]
# Copy so later column additions do not touch the loaded frame.
X = df[feature_cols].copy()

In [34]:
# Preprocessing: replace channel names with integer codes
le = LabelEncoder()
le.fit(X["Channel_Used"])
X["Channel"] = le.transform(X["Channel_Used"])

# Show which integer code stands for which channel name.
print("Encoding map:")
for code, label in enumerate(le.classes_):
    print(f"{code} = '{label}'")

# Model inputs: the numeric columns plus the encoded channel; the raw text
# column "Channel_Used" is left out.
model_columns = ["Engagement_Score", "Acquisition_Cost", "Channel", "Duration_num"]
X_final = X[model_columns]

Encoding map:
0 = 'Email'
1 = 'Facebook'
2 = 'Google Ads'
3 = 'Instagram'
4 = 'Website'
5 = 'YouTube'


In [35]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Build the regressor (100 trees, depth capped at 10, seeded) and fit it on the
# training split in one expression; fit() returns the estimator itself.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)
print("Model training complete!")

Model training complete!


In [37]:
# Results
# Score the model on the held-out test split. MAE and RMSE are expressed in the
# same units as ROI. R² compares the model against a constant prediction of the
# mean: a value at or below 0 means the model does no better than that baseline.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("RESULTS:")
print(f"MAE : {mae:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²  : {r2:.3f}")


RESULTS:
MAE : 1.503
RMSE: 1.736
R²  : -0.001


In [38]:
# Feature importance
# Short display labels keyed by training column name. Each importance is paired
# with its feature by name (via the columns the model was fitted on) instead of
# by position in a hand-written list, so a reordered or changed feature set
# cannot silently mislabel the scores. Unknown columns fall back to their own name.
display_names = {
    "Engagement_Score": "Engagement",
    "Acquisition_Cost": "Cost",
    "Channel": "Channel",
    "Duration_num": "Duration",
}
features = [display_names.get(col, col) for col in model.feature_names_in_]
for name, imp in zip(features, model.feature_importances_):
    print(f"{name}: {imp:.3f}")

Engagement: 0.186
Cost: 0.571
Channel: 0.141
Duration: 0.102


In [39]:
# ROI Prediction Function (for new campaigns)
def predict_roi(campaign_dict):
    """
    Predict ROI for a new campaign using the trained `model`.

    Parameters
    ----------
    campaign_dict : dict
        Feature values for a single campaign. It must supply every feature the
        model was fitted on (as trained in this notebook: 'Engagement_Score',
        'Acquisition_Cost', 'Channel', 'Duration_num'); key order does not
        matter. 'Channel' is the integer code from the encoding map.
        Alternatively, give the channel name under 'Channel_Used'
        (e.g. 'Email') and it is converted with the fitted encoder `le`.
        Keys that are not model features are ignored.

    Returns
    -------
    The model's ROI prediction for that campaign.

    Raises
    ------
    ValueError
        If a required feature is missing, or if 'Channel_Used' holds a channel
        name the encoder was not fitted on.

    Example: predict_roi({'Engagement_Score': 7, 'Acquisition_Cost': 15000,
                          'Channel': 0, 'Duration_num': 30})
    """
    # Work on a copy so the caller's dict is never modified.
    row = dict(campaign_dict)

    # Convenience: accept the human-readable channel name and encode it here.
    if "Channel" not in row and "Channel_Used" in row:
        row["Channel"] = le.transform([row["Channel_Used"]])[0]

    # The model validates feature names and their order against what it saw in
    # fit(), so build the frame in exactly that order regardless of dict order.
    expected = list(model.feature_names_in_)
    missing = [col for col in expected if col not in row]
    if missing:
        raise ValueError(f"campaign_dict is missing required feature(s): {missing}")

    input_df = pd.DataFrame([row], columns=expected)
    prediction = model.predict(input_df)[0]
    return prediction


In [40]:
#Example Prediction
# One hypothetical campaign expressed with the model's input features.
example_campaign = {
    'Engagement_Score': 7,
    'Acquisition_Cost': 15000,
    'Channel': 0,  # integer channel code; 0 = 'Email' in the encoding map
    'Duration_num': 30
}

predicted_roi = predict_roi(example_campaign)
print(f"\n💼 Predicted ROI: {predicted_roi:.2f}")


💼 Predicted ROI: 5.00
