In [1]:
import mysql.connector
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import sys
import sqlalchemy
from sqlalchemy import create_engine, Table, Column, Integer, Float, String, Text, Date, DateTime, Boolean, MetaData

def get_data(cursor, table_name, start_date=None, end_date=None):
    # If no dates provided, fetch all
    if not start_date and not end_date:
        query = f"SELECT * FROM {table_name}"
    else:
        # Default to 10-day window if only one date is missing
        if not end_date:
            end_date = datetime.now()
        if not start_date:
            start_date = end_date - timedelta(days=10)

        # Ensure strings
        if isinstance(start_date, datetime):
            start_str = start_date.strftime('%Y-%m-%d')
        else:
            start_str = str(start_date)

        if isinstance(end_date, datetime):
            end_str = end_date.strftime('%Y-%m-%d')
        else:
            end_str = str(end_date)

        query = f"""
            SELECT * FROM {table_name}
            WHERE report_date >= '{start_str}' AND report_date <= '{end_str}'
        """

    cursor.execute(query)
    column_names = [desc[0] for desc in cursor.description]
    rows = cursor.fetchall()

    df = pd.DataFrame(rows, columns=column_names)
    return df

def split_and_assign(df, column, column_type):
    # Define expected columns inside the function
    expected_columns_map = {
        "campaign_name": ['camp_code', 'platform', 'camp_type', 'buying_type', 'free_text1', 'year', 'optimization', 'buying_model', 'placement', 'freetext_2'],
        "adgroup_name": ['noneed_1', 'noneed_2', 'noneed_3', 'noneed_4', 'noneed_5', 'noneed_6', 'noneed_7', 'noneed_8', 'bidding_strategy', 'noneed_9'],
        "ad_name": ['creative_type', 'creative_format', 'creative_length', 'noneed_9', 'noneed_10', 'noneed_11', 'noneed_12', 'noneed_13', 'freetext_3'],
    }

    expected_columns = expected_columns_map.get(column_type, [])

    if column in df.columns:
        df[column] = df[column].fillna("").astype(str)
        split_cols = df[column].str.split('_', expand=True)

        # Pad with NaNs if needed
        num_actual_cols = split_cols.shape[1]
        num_expected_cols = len(expected_columns)
        if num_actual_cols < num_expected_cols:
            for i in range(num_actual_cols, num_expected_cols):
                split_cols[i] = pd.NA
        num_actual_cols = split_cols.shape[1]

        assigned_names = expected_columns[:num_actual_cols]
        extra_names = [f"extra_{i}" for i in range(num_actual_cols - len(assigned_names))]
        all_new_names = assigned_names + extra_names
        all_new_names = [f"{name}_split" if name in df.columns else name for name in all_new_names]

        split_cols.columns = all_new_names
        df = df.join(split_cols)

    # Split 'camp_code' into components
    if "camp_code" in df.columns:
        df["camp_code"] = df["camp_code"].fillna("").astype(str)
        campaign_split = df["camp_code"].str.split("-", expand=True)

        max_cols = 6
        campaign_split = campaign_split.iloc[:, :max_cols]
        for i in range(campaign_split.shape[1], max_cols):
            campaign_split[i] = np.nan

        campaign_split.columns = ["Funnel", "Category", "Brand Name", "Campaign Name", "Start Date", "End Date"]
        df = df.drop(columns=[col for col in campaign_split.columns if col in df.columns], errors="ignore")
        df = df.join(campaign_split)

    # Split 'free_text1' into components
    if "free_text1" in df.columns:
        df["free_text1"] = df["free_text1"].fillna("").astype(str)
        freetext_split = df["free_text1"].str.split("-", expand=True)

        max_cols = 6
        freetext_split = freetext_split.iloc[:, :max_cols]
        for i in range(freetext_split.shape[1], max_cols):
            freetext_split[i] = np.nan

        freetext_split.columns = ["Audience", "Gender", "Age", "Region", "Device", "Os"]
        df = df.drop(columns=[col for col in freetext_split.columns if col in df.columns], errors="ignore")
        df = df.join(freetext_split)

    # Create TA column
    if "Gender" in df.columns and "Age" in df.columns:
        df["Gender"] = df["Gender"].astype(str)
        df["Age"] = df["Age"].astype(str).str.zfill(4)
        df["TA"] = df["Gender"] + df["Age"].str[:2] + "-" + df["Age"].str[2:]

    # Format Start and End Dates
    if "Start Date" in df.columns and "End Date" in df.columns and "year" in df.columns:
        df["year"] = df["year"].astype(str)

        def format_date(row, col):
            if pd.notna(row[col]) and len(row[col]) == 4:
                return f"{row['year'][:2]}{row[col][:2]}-{row[col][2:]}"
            return None

        df["Start Date"] = df.apply(lambda row: format_date(row, "Start Date"), axis=1)
        df["End Date"] = df.apply(lambda row: format_date(row, "End Date"), axis=1)

        df["Start Date"] = pd.to_datetime(df["Start Date"], format="%Y-%m-%d", errors="coerce")
        df["End Date"] = pd.to_datetime(df["End Date"], format="%Y-%m-%d", errors="coerce")

    # Create ad_format column
    if all(col in df.columns for col in ["creative_type", "creative_format", "creative_length"]):
        df["ad_format"] = df["creative_type"].astype(str) + "_" + df["creative_format"].astype(str) + "_" + df["creative_length"].astype(str)

    # Drop unnecessary columns
    df = df.drop(columns=[col for col in df.columns if "noneed" in col], errors="ignore")

    return df


def upload_to_mysql(df, table_name, engine, if_exists='append'):
    """
    Uploads a DataFrame to a MySQL table with correct column types based on the DataFrame.

    :param df: Pandas DataFrame to upload
    :param table_name: Name of the table in the database
    :param engine: SQLAlchemy engine object
    :param if_exists: What to do if the table already exists ('fail', 'replace', 'append')
    """
    try:
        with engine.connect() as conn:
            metadata = MetaData()

            # Define table schema based on DataFrame's column types
            columns = []

            for col in df.columns:
                dtype = df[col].dtype
                
                # Map Pandas dtype to MySQL/SQLAlchemy types
                if pd.api.types.is_integer_dtype(dtype):
                    col_type = Integer
                elif pd.api.types.is_float_dtype(dtype):
                    col_type = Float
                elif pd.api.types.is_datetime64_any_dtype(dtype):
                    col_type = DateTime
                elif pd.api.types.is_bool_dtype(dtype):
                    col_type = Boolean
                elif pd.api.types.is_string_dtype(dtype):
                    col_type = Text(collation="utf8mb4_unicode_ci")  # UTF-8 support
                else:
                    col_type = String(255, collation="utf8mb4_unicode_ci")  # Default fallback

                columns.append(Column(col, col_type))

            table = Table(table_name, metadata, *columns, extend_existing=True)

            # Create the table with utf8mb4 encoding
            metadata.create_all(conn)

        # Convert object columns to string before uploading
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = df[col].astype(str)

        # Upload DataFrame to MySQL with proper type mapping
        dtype_mapping = {
            col: (
                sqlalchemy.types.Integer if pd.api.types.is_integer_dtype(df[col]) else
                sqlalchemy.types.Float if pd.api.types.is_float_dtype(df[col]) else
                sqlalchemy.types.DateTime if pd.api.types.is_datetime64_any_dtype(df[col]) else
                sqlalchemy.types.Boolean if pd.api.types.is_bool_dtype(df[col]) else
                sqlalchemy.types.Text(collation="utf8mb4_unicode_ci") if pd.api.types.is_string_dtype(df[col]) else
                sqlalchemy.types.String(255, collation="utf8mb4_unicode_ci")
            )
            for col in df.columns
        }

        df.to_sql(table_name, engine, if_exists=if_exists, index=False, dtype=dtype_mapping)

        print(f"✅ Data successfully uploaded to `{table_name}` with correct column types!")

    except Exception as e:
        print(f"❌ Error uploading data to `{table_name}`: {e}")


In [None]:
def mapping_naming(df):
    mapping_creative_type = {
        "VID": "Video",
        "IMAGE": "Image",
        "BAN": "Banner",
        "AUD": "Audio",
        "TEXT": "Text",
        "DISC": "Discovery Ad",
        "RESAD": "Responsive Ad",
        "ST": "Story",
        "GIF": "Gif",
        "CARO": "Carousel",
        "REEL": "Reel",
        "MULT": "Multiple",
        "DNM": "dynamic",
        "CLT": "collec",
        "TVC": "tvc",
        "SMS": "SMS",
        "LIVESTREAM": "livetream"
    }
    mapping_campaign_type = {
        "VRC": "View Reach Campaign",
        "VVC": "Video View Campaign",
        "PMAX": "PMAX",
        "DG": "DemandGen",
        "SEM": "SEM",
        "VAC": "Video Action Campaign",
        "REA": "Reach",
        "ENG": "Engage",
        "TRF": "Traffic",
        "LEAD": "Lead",
        "THRU": "Thruplay",
        "PDIS": "Programatic Display",
        "PVID": "Programatic Video",
        "OTT": "Programatic OTT",
        "MH": "Masthead",
        "APP": "App",
        "GDN": "GDN",
        "IS": "InStream",
        "BUMP": "Bumper",
        "TVN": "Trueview Nonskip",
        "LSA": "Livestream Ads",
        "CT": "Community Interaction",
        "PSA": "Product Shopping Ads",
        "VSA": "Video Shopping Ads"
    }
    mapping_kpi_metric = {
    "Imp": "Impression",
    "Click": "Click",
    "View": "View",
    "Reach": "Reach",
    "Engagement": "Engagement",
    "Lead Form": "Lead Form",
    "Message": "Message",
    "AppInstall": "App Install",
    "AppEngage": "App Engage"
}
    mapping_platform = {
        "DV360": "DV360",
        "DV360-DISP": "DV360 Display",
        "DV360-VID": "DV360 Video (non-YouTube)",
        "DV360-YT": "DV360 YouTube",
        "GG": "Google Ads",
        "GA-YT": "Google Ads YouTube",
        "GDN": "Google Display Network",
        "D-YT": "YouTube Direct",
        "FB": "Facebook",
        "TT": "TikTok",
        "IG": "Instagram",
        "YT": "YouTube",
    }
    mapping_campaign_funnel = {
        "AW": "Awareness",
        "CSD": "Consideration",
        "CVR": "Conversion"
    }

    if 'camp_type' in df.columns:
        df['camp_type'] = df['camp_type'].map(mapping_campaign_type)
    elif 'Format' in df.columns:
        df['Format'] = df['Format'].map(mapping_campaign_type)

    # Map Funnel
    if 'Funnel' in df.columns:
        df["Funnel"] = df["Funnel"].map(mapping_campaign_funnel)
    # Map Funnel
    if 'KPI Metric' in df.columns:
        df["KPI Metric"] = df["KPI Metric"].map(mapping_kpi_metric)
    # Map Platform
    if 'platform' in df.columns:
        df["Platform"] = df["platform"].map(mapping_platform)
    elif 'Platform' in df.columns:
        df["Platform"] = df["Platform"].map(mapping_platform)
        # Map Funnel


In [4]:
df_plan = pd.read_excel('/Users/khanhvx/Downloads/SN Media Plan.xlsx')
df_plan.columns = df_plan.columns.str.replace(' ', '_')
df_plan['Campaign_phase'] = df_plan['Campaign_phase'].replace('NA', '')
# Rename columns
df_plan = df_plan.rename(columns={
    'Unit': 'Buying_Method',
    'Objective': 'Format'
})

# Split 'freetext_2' into two new date columns
# Assumes format like "1205-2505" (DDMM-DDMM)

# Extract the parts
date_parts = df_plan['freetext_2'].str.extract(r'(?P<start>\d{4})-(?P<end>\d{4})')

# Convert to datetime (assumes year 2025)
df_plan['Actual_Start_Date'] = pd.to_datetime('2025' + date_parts['start'], format='%Y%d%m', errors='coerce')
df_plan['Actual_End_Date'] = pd.to_datetime('2025' + date_parts['end'], format='%Y%d%m', errors='coerce')
df_plan['Plan_Start_Date'] = pd.to_datetime(
    '2025' + df_plan['Plan_Start_Date'].astype(str),
    format='%Y%d%m',
    errors='coerce'
)
df_plan['Plan_End_Date'] = pd.to_datetime(
    '2025' + df_plan['Plan_End_Date'].astype(str),
    format='%Y%d%m',
    errors='coerce'
)

mapping_naming(df_plan)
df_plan

Unnamed: 0,Funnel,Campaign,Category,Brand,Plan_Start_Date,Plan_End_Date,Line_code,Format,Region,Platform,...,frequency_combine_estimate_week,er_estimate,er_bm,vtr_estimate,vtr_bm,ctr_estimate,ctr_bm,exchange_rate,Actual_Start_Date,Actual_End_Date
0,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,,0.025,,0.021000,,0.00150,0.0010,26000.0,2025-05-12,2025-05-25
1,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,2.0,Reach,37C,Facebook,...,,0.025,,0.021000,,0.00150,0.0010,26000.0,2025-05-12,2025-05-25
2,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,3.0,View Reach Campaign,35C,Google Ads,...,,,,0.145000,,0.00200,0.0010,26000.0,2025-05-14,2025-05-25
3,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,,View Reach Campaign,26C,Google Ads,...,,,,0.145000,,0.00200,0.0010,26000.0,2025-05-12,2025-05-25
4,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,,View Reach Campaign,NAT,Google Ads,...,,,,0.230000,,0.00200,0.0010,26000.0,2025-05-14,2025-05-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,0.012340,0.04,0.00080,0.0017,26000.0,2025-01-03,2025-01-21
106,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,0.009538,0.04,0.00080,0.0017,26000.0,2025-01-03,2025-01-21
107,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,0.027049,0.04,0.00100,0.0017,26000.0,2025-01-18,2025-01-21
108,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,0.030000,0.04,0.00100,0.0017,26000.0,2025-01-03,2025-01-21


In [220]:
# Establish MySQL connection
conn = mysql.connector.connect(
    host="10.0.0.3",
    port = '3306',
    user="dashboard_report_vinamilk",
    password="6vtRhNxa79zPsfs5",
    database="dashboard_report_vinamilk"
)
cursor = conn.cursor()

df_tt = get_data(cursor, "tiktok_template_5")
df_yt = get_data(cursor, "googleads_performance_daily_raw_v1")
df_fb = get_data(cursor, "facebook_performance_daily_raw")
df_ga = get_data(cursor, "ga4_performance_daily_raw")
df_fb_creative = get_data(cursor, "facebook_creative")
df_gg_creative = get_data(cursor, "googleads_creative")
df_tt_creative = get_data(cursor, "tiktok_creative_preview")


if 'cursor' in locals():
    cursor.close()
if 'conn' in locals() and conn.is_connected():
    conn.close()
    print("MySQL connection closed.")

MySQL connection closed.


In [6]:
df_tt_creative

Unnamed: 0,report_date,ad_id,account_id,ad_url,ad_text,tt_preview_url,tt_preview_url_no_params,sc_preview_url,sc_preview_status
0,2025-03-11,1.826284e+15,7.451448e+18,-,DEAL NÀY 100% HOT! MUA 1 TẶNG 4 TỪ SỮA TƯƠI 10...,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,,0.0
1,2025-03-11,1.826284e+15,7.451448e+18,-,DEAL NÀY 100% HOT! MUA 1 TẶNG 4 TỪ SỮA TƯƠI 10...,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,,0.0
2,2025-03-11,1.826283e+15,7.451448e+18,-,DEAL NÀY 100% HOT! MUA 1 TẶNG 4 TỪ SỮA TƯƠI 10...,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,,0.0
3,2025-03-11,1.826283e+15,7.451448e+18,-,DEAL NÀY 100% HOT! MUA 1 TẶNG 4 TỪ SỮA TƯƠI 10...,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,,0.0
4,2025-03-11,1.826283e+15,7.451448e+18,-,DEAL NÀY 100% HOT! MUA 1 TẶNG 4 TỪ SỮA TƯƠI 10...,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,,0.0
...,...,...,...,...,...,...,...,...,...
8485,2025-07-01,1.836428e+15,7.467092e+18,-,,,,,2.0
8486,2025-07-01,1.836428e+15,7.467092e+18,-,,,,,2.0
8487,2025-07-01,1.836434e+15,7.451533e+18,-,Sữa organic nhà Vinamilk khác gì sữa thường?,http://p0.ipstatp.com/origin/tos-alisg-p-0051c...,http://p0.ipstatp.com/origin/tos-alisg-p-0051c...,,0.0
8488,2025-07-01,1.836434e+15,7.451533e+18,-,Tất tần tật về sữa tươi tiệt trùng Green Farm,http://p0.ipstatp.com/origin/tos-alisg-p-0051c...,http://p0.ipstatp.com/origin/tos-alisg-p-0051c...,,0.0


In [221]:
df_tt1 = df_tt.copy()
df_fb1 = df_fb.copy()
df_yt1 = df_yt.copy()

In [246]:
df_tt = df_tt1.copy()
df_yt = df_yt1.copy()
df_fb = df_fb1.copy()

In [247]:
df_map_camp = pd.read_excel('/Users/khanhvx/Downloads/map sn2.xlsx', sheet_name='campaign')
df_map_ad = pd.read_excel('/Users/khanhvx/Downloads/map sn2.xlsx', sheet_name='ad')
df_map_camp = df_map_camp.drop_duplicates(subset='campaign_id', keep='last')
df_map_ad = df_map_ad.drop_duplicates(subset='ad_id', keep='last')


# Merge campaign name
df_fb = df_fb.merge(df_map_camp[['campaign_id', 'campaign_name']], on='campaign_id', how='left')
df_fb['campaign_name_y'] = df_fb['campaign_name_y'].fillna(df_fb['campaign_name_x'])
df_fb = df_fb.rename(columns={'campaign_name_y': 'campaign_name'}).drop(columns=['campaign_name_x'])

# Merge ad (dd_name) from ad_id
df_fb = df_fb.merge(df_map_ad[['ad_id', 'ad_name']], on='ad_id', how='left')
df_fb['ad_name_y'] = df_fb['ad_name_y'].fillna(df_fb['ad_name_x'])
df_fb = df_fb.rename(columns={'ad_name_y': 'ad_name'}).drop(columns=['ad_name_x'])


# Merge campaign name
df_yt = df_yt.merge(df_map_camp[['campaign_id', 'campaign_name']], on='campaign_id', how='left')
df_yt['campaign_name'] = df_yt['campaign_name'].fillna(df_yt['campaign'])
df_yt = df_yt.drop(columns=['campaign'])

# Merge ad (dd_name) from ad_id
df_yt = df_yt.merge(df_map_ad[['ad_id', 'ad_name']], on='ad_id', how='left')
df_yt['ad_name'] = df_yt['ad_name'].fillna(df_yt['ad'])
df_yt = df_yt.drop(columns=['ad'])

# Merge campaign name
df_tt = df_tt.merge(df_map_camp[['campaign_id', 'campaign_name']], on='campaign_id', how='left')
df_tt['campaign_name_y'] = df_tt['campaign_name_y'].fillna(df_tt['campaign_name_x'])
df_tt = df_tt.rename(columns={'campaign_name_y': 'campaign_name'}).drop(columns=['campaign_name_x'])

# Merge ad (dd_name) from ad_id
df_tt = df_tt.merge(df_map_ad[['ad_id', 'ad_name']], on='ad_id', how='left')
df_tt['ad_name_y'] = df_tt['ad_name_y'].fillna(df_tt['ad_name_x'])
df_tt = df_tt.rename(columns={'ad_name_y': 'ad_name'}).drop(columns=['ad_name_x'])


# # Merge ad (dd_name) from ad_id
# df_yt = df_yt.merge(df_map_ad[['ad_id', 'ad_name']], on='ad_id', how='left')
# df_yt['ad_name_y'] = df_yt['ad_name_y'].fillna(df_tt['ad_name_x'])
# df_yt = df_yt.rename(columns={'ad_name_y': 'ad_name'}).drop(columns=['ad_name_x'])

In [248]:
df_total_all = pd.concat([df_fb, df_yt, df_tt], ignore_index=True)

In [249]:
import numpy as np
import pandas as pd # Make sure pandas is imported
from datetime import datetime
import re

# ... (keep your rearrange_marker function and initial data cleaning) ...

campaign_name_expected_columns = ["Funnel", 'camp_code', 'platform', 'camp_type', 'buying_type', 'free_text1', 'year', 'KPI Metric',
                                  'buying_model', 'placement', 'Campaign phase', 'freetext_2']

adgroup_name_expected_columns = ['noneed_1', 'noneed_2', 'noneed_3', 'noneed_4', 'noneed_5', 'noneed_6', 'noneed_7',
                                 'noneed_8', 'bidding_strategy', 'noneed_9']

# Original list with 9 names
ad_name_expected_columns = ['creative_type', 'creative_format', 'creative_length', 'noneed_9', 'noneed_10', 'noneed_11', 'noneed_12',
                            'noneed_13', 'freetext_3']

def split_and_assign(df, column, expected_columns):
    if column in df.columns:
        df[column] = df[column].fillna("").astype(str)
        split_cols = df[column].str.split('_', expand=True)

        num_expected = len(expected_columns)
        num_actual = split_cols.shape[1]

        # Determine how many columns to actually use and name (max is num_expected)
        cols_to_assign = min(num_actual, num_expected)

        # Take only the relevant columns from the split (up to num_expected)
        split_cols_final = split_cols.iloc[:, :cols_to_assign]

        # Assign the corresponding expected names
        split_cols_final.columns = expected_columns[:cols_to_assign]

        # Check for existing columns before joining to avoid duplicate column errors
        cols_to_drop_before_join = [col for col in split_cols_final.columns if col in df.columns]
        df = df.drop(columns=cols_to_drop_before_join, errors='ignore')

        df = df.join(split_cols_final)


    # ✅ Improved camp_code splitting (keep this logic as is)
    if "camp_code" in df.columns:
        # ... (your existing camp_code splitting logic) ...
        df["camp_code"] = df["camp_code"].fillna("").astype(str)
        # First, split from the right on last two hyphens → Start Date and End Date
        campaign_split = df["camp_code"].str.rsplit("-", n=2, expand=True)
        campaign_split_final = pd.DataFrame() # Initialize empty DataFrame

        if campaign_split.shape[1] == 3:
            prefix_part = campaign_split[0]
            start_date_str = campaign_split[1]
            end_date_str = campaign_split[2]

            # Now, split the prefix part into the first 4 components
            prefix_split = prefix_part.str.split("-", n=2, expand=True)
            # Ensure prefix_split has 4 columns, padding if necessary
            for i in range(prefix_split.shape[1], 3):
                 prefix_split[i] = "" # Add empty columns if split is short
            prefix_split.columns = ["Category", "Brand Name", "Campaign Name"]


            # Combine all parts
            campaign_split_final = pd.concat([prefix_split, start_date_str, end_date_str], axis=1)
            campaign_split_final.columns = ["Category", "Brand Name", "Campaign Name", "Plan Start Date", "Plan End Date"]

             # Drop existing columns before joining to avoid conflict
            df = df.drop(columns=[col for col in campaign_split_final.columns if col in df.columns], errors="ignore")
            df = df.join(campaign_split_final)
        # else: handle cases where camp_code doesn't split into 3 parts if needed


    # ✅ free_text1 splitting (keep this logic as is)
    if "free_text1" in df.columns:
        # ... (your existing free_text1 splitting logic) ...
        df["free_text1"] = df["free_text1"].fillna("").astype(str)
        freetext_split = df["free_text1"].str.split("-", expand=True)

        max_cols = 6
        # Select up to max_cols, don't error if fewer exist
        freetext_split = freetext_split.iloc[:, :min(freetext_split.shape[1], max_cols)]

        # Pad with NaN if fewer than max_cols resulted from the split
        for i in range(freetext_split.shape[1], max_cols):
            freetext_split[i] = np.nan # Use standard column names

        # Assign standard names
        freetext_split.columns = ["Audience group", "Gender", "Age", "Region", "Device", "Os"][:freetext_split.shape[1]] # Only assign names for existing columns

        df = df.drop(columns=[col for col in freetext_split.columns if col in df.columns], errors="ignore")
        df = df.join(freetext_split)


    # ✅ Target Audience creation (keep this logic as is)
    if "Gender" in df.columns and "Age" in df.columns:
         # Ensure columns exist before proceeding
        df["Gender"] = df["Gender"].fillna("NA").astype(str) # Handle potential NaNs from split
        df["Age"] = df["Age"].fillna("0000").astype(str).str.zfill(4) # Handle NaNs and ensure padding
        # Use .str accessor safely
        df["TA"] = df["Gender"] + df["Age"].str[:2] + "-" + df["Age"].str[2:]


    # ✅ Date formatting
    # Ensure 'year' column exists and handle potential errors before using iloc[0]
    current_year = datetime.now().year # Default year if 'year' column is missing/empty
    if "year" in df.columns and not df["year"].isnull().all():
        # Try converting the first valid year value to int
        try:
            first_valid_year_index = df['year'].first_valid_index()
            if first_valid_year_index is not None:
                current_year = int(df['year'].loc[first_valid_year_index])
            else:
                 print("Warning: 'year' column has no valid values. Using current system year.")
        except (ValueError, TypeError):
            print(f"Warning: Could not convert value in 'year' column to integer. Using current system year: {current_year}")


    if "Plan Start Date" in df.columns and "Plan End Date" in df.columns:
        def parse_ddmm(val, year_to_use):
            if pd.isna(val) or val == '':
                return pd.NaT
            try:
                # Handle potential float inputs from splits
                val_str = str(val).split('.')[0] if isinstance(val, float) else str(val)
                return datetime.strptime(val_str, "%d%m").replace(year=year_to_use)
            except (ValueError, TypeError):
                 # Handle cases like 'NaT', '', invalid formats
                return pd.NaT

        # Apply using the determined year
        df["Plan Start Date"] = df["Plan Start Date"].apply(lambda x: parse_ddmm(x, current_year))
        df["Plan End Date"] = df["Plan End Date"].apply(lambda x: parse_ddmm(x, current_year))


    # ✅ Combine creative fields (keep this logic as is)
    if all(col in df.columns for col in ["creative_type", "creative_format", "creative_length"]):
        df["ad_format"] = (df["creative_type"].fillna("").astype(str) + "_" +
                           df["creative_format"].fillna("").astype(str) + "_" +
                           df["creative_length"].fillna("").astype(str))

    # ✅ Handle freetext_2 as Actual Start and End Dates (from the last date pattern in the text)
    if "freetext_2" in df.columns:
        df["freetext_2"] = df["freetext_2"].fillna("").astype(str)

        def parse_freetext2(val, year_to_use):
            # Find all matches of the date pattern in the text
            matches = re.findall(r"(\d{4})-(\d{4})", val)
            if not matches:
                return pd.NaT, pd.NaT

            # Use the last match
            start_str, end_str = matches[-1]

            try:
                start_date = datetime.strptime(start_str, "%d%m").replace(year=year_to_use)
            except ValueError:
                start_date = pd.NaT
            try:
                end_date = datetime.strptime(end_str, "%d%m").replace(year=year_to_use)
            except ValueError:
                end_date = pd.NaT

            return start_date, end_date

        actual_dates = df["freetext_2"].apply(lambda x: pd.Series(parse_freetext2(x, current_year)))
        actual_dates.columns = ["Actual Start Date", "Actual End Date"]

        df = df.drop(columns=[col for col in actual_dates.columns if col in df.columns], errors="ignore")
        df = df.join(actual_dates)

    # ✅ Drop unnecessary columns (keep this logic as is)
    df = df.drop(columns=[col for col in df.columns if "noneed" in col], errors="ignore")

    return df


# ... (keep your rename and process_campaign_data function) ...

def process_campaign_data(df):
    """Processes a DataFrame by filtering campaigns, splitting columns, and formatting TA & dates."""

    # Always start with a copy to avoid modifying the original DataFrame
    df_filtered = df.copy()
    # Informational warning if 'campaign_name' is not found
    if 'campaign_name' not in df.columns:
        print("Warning: 'campaign_name' column not found. Skipping campaign_name splitting.")
    else:
        # Apply splitting logic to 'campaign_name'
        df_filtered = split_and_assign(df_filtered, 'campaign_name', campaign_name_expected_columns)

    # *** Check if 'ad_group' or 'ad_group_name' exists after renames ***
    if 'ad_group_name' in df_filtered.columns:
        df_filtered = split_and_assign(df_filtered, 'ad_group_name', adgroup_name_expected_columns)
    elif 'ad_group' in df_filtered.columns:
        print("Warning: Processing 'ad_group' instead of 'ad_group_name'")
        df_filtered = split_and_assign(df_filtered, 'ad_group', adgroup_name_expected_columns)

    # *** Check if 'ad_name' exists after renames ***
    if 'ad_name' in df_filtered.columns:
        df_filtered = split_and_assign(df_filtered, 'ad_name', ad_name_expected_columns)

    return df_filtered


# Assuming df_tt, df_fb, df_yt are loaded DataFrames
# Perform renames *before* calling process_campaign_data
# df_yt = df_yt.rename(columns={'campaign': 'campaign_name', 'ad': 'ad_name', 'ad_group': 'ad_group_name'})
# Add similar renames for df_fb and df_tt if they use different original names

df_yt_filter = process_campaign_data(df_yt)
df_fb_filter = process_campaign_data(df_fb) # Make sure df_fb has 'campaign_name', etc.
df_tt_filter = process_campaign_data(df_tt) # Make sure df_tt has 'campaign_name', etc.



In [250]:
df_yt_filter2 = df_yt_filter.copy()
df_tt_filter2 = df_tt_filter.copy()
df_fb_filter2 = df_fb_filter.copy()

In [251]:
df_yt_filter = df_yt_filter[~df_yt_filter["Funnel"].astype(str).str.contains("-", na=False)]
df_fb_filter = df_fb_filter[~df_fb_filter["Funnel"].astype(str).str.contains("-", na=False)]
df_tt_filter = df_tt_filter[~df_tt_filter["Funnel"].astype(str).str.contains("-", na=False)]


In [252]:
mapping_naming(df_yt_filter)
mapping_naming(df_fb_filter)
mapping_naming(df_tt_filter)

In [253]:
def calculate_cost(row):
    if row['currency_code'] == 'USD':
        return row['cost'] * 26000
    elif row['currency_code'] == 'VND':
        return row['cost'] * 1  # No exchange rate needed for VND
    else:
        return row['cost']  # For other currencies, just return the cost in USD
df_yt_filter['Cost'] = df_yt_filter.apply(calculate_cost, axis=1)

In [254]:
import pandas as pd

def process_dataframe(df, column_mapping, numeric_cols, extra_processing=None, column_order=None):
    """General function to process dataframes."""
    # Select and rename columns
    existing_columns = [col for col in column_mapping if col in df.columns]
    df = df[existing_columns].rename(columns=column_mapping).copy()
    
    # Convert 'Date' column
    df['report_date'] = pd.to_datetime(df['report_date'], errors='coerce')
    df['report_date'] = pd.to_datetime(df['report_date'], errors='coerce').dt.date

    # Convert numeric columns
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Apply dataset-specific processing
    if extra_processing:
        extra_processing(df)
    
    # Ensure all expected columns exist
    if column_order:
        for col in column_order:
            if col not in df.columns:
                df[col] = None
        df = df[column_order]
    
    return df


# Define column mappings & numeric columns for each dataset
youtube_mapping = {'report_date': 'report_date',
    'Category': 'Category',
    'Funnel': 'Funnel',
    'Platform': 'Platform',
    'Brand Name': 'Brand',
    'placement': 'Video_Range',
    'buying_model': 'Buying_Method',
    'TA': 'TA',
    'Campaign phase': 'Campaign_phase',
    'Audience group': 'Audience_group',
    'Region': 'Region',
    'campaign_name': 'Campaign_Name',
    'camp_code': 'Campaign_code',
    'Campaign Name': 'Campaign',
    'Plan Start Date': 'Plan_Start_Date',
    'Plan End Date': 'Plan_End_Date',
    'Actual Start Date': 'Actual_Start_Date',
    'Actual End Date': 'Actual_End_Date',
    'freetext_2': 'freetext_2',
    'camp_type': 'Format',
    'impressions': 'Impression',
    'clicks': 'Clicks',
    'views': 'Views',
    'currency_code': 'Currency_Code',
    'Cost': 'Cost',
    'engagements': 'Engagements',
    'video_25': 'Video_Plays_25',
    'video_50': 'Video_Plays_50',
    'video_75': 'Video_Plays_75',
    'video_100': 'Video_Plays_100'}  
tiktok_mapping = {'report_date': 'report_date',
    'Category': 'Category',
    'Funnel': 'Funnel',
    'Platform': 'Platform',
    'Brand Name': 'Brand',
    'TA': 'TA',
    'Campaign phase': 'Campaign_phase',
    'Audience group': 'Audience_group',
    'Region': 'Region',
    'camp_type': 'Format',
    'buying_model': 'Buying_Method',
    'campaign_name': 'Campaign_Name',
    'camp_code': 'Campaign_code',
    'Campaign Name': 'Campaign',
    'Plan Start Date': 'Plan_Start_Date',
    'Plan End Date': 'Plan_End_Date',
    'Actual Start Date': 'Actual_Start_Date',
    'Actual End Date': 'Actual_End_Date',
    'freetext_2': 'freetext_2',
    'spend': 'Cost',
    'impressions': 'Impression',
    'clicks_destination': 'Clicks',
    'engagements': 'Engagements',
    'profile_visits': 'profile_visits',
    '2_second_video_views': '23s_Video_Views',
    '6_second_video_views': 'Views',
    'engaged_view': '6s_Engaged_Views',
    'video_views_p25': 'Video_Plays_25',
    'video_views_p50': 'Video_Plays_50',
    'video_views_p75': 'Video_Plays_75',
    'video_views_p100': 'Video_Plays_100'}
facebook_mapping = {'report_date': 'report_date',
    'Category': 'Category',
    'Funnel': 'Funnel',
    'Platform': 'Platform',
    'Brand Name': 'Brand',
    'camp_type': 'Format',
    'buying_model': 'Buying_Method',
    'TA': 'TA',
    'Campaign phase': 'Campaign_phase',
    'Audience group': 'Audience_group',
    'Region': 'Region',
    'campaign_name': 'Campaign_Name',
    'camp_code': 'Campaign_code',
    'Campaign Name': 'Campaign',
    'Plan Start Date': 'Plan_Start_Date',
    'Plan End Date': 'Plan_End_Date',
    'Actual Start Date': 'Actual_Start_Date',
    'Actual End Date': 'Actual_End_Date',
    'freetext_2': 'freetext_2',
    'impression': 'Impression',
    'cost': 'Cost',
    'post_engagement': 'Engagements',
    'video_played_3': '23s_Video_Views',
    'thruplays': 'Views',
    'video_played_25': 'Video_Plays_25',
    'video_played_50': 'Video_Plays_50',
    'video_played_75': 'Video_Plays_75',
    'video_played_complete': 'Video_Plays_100',
    'link_click': 'Clicks'}

youtube_numeric_cols = ['Engagement', '23s_Video_Views', 'Impression',
    'Cost', 'Clicks', 'Video_Plays_25', 'Video_Plays_50', 'Video_Plays_75', 
    'Video_Plays_100', 'Views']
tiktok_numeric_cols = ['Engagement', '23s_Video_Views', 'Impression',
    'Cost', 'Clicks', 'Video_Plays_25', 'Video_Plays_50', 'Video_Plays_75', 
    'Video_Plays_100', 'Views']
facebook_numeric_cols = ['Engagement', '23s_Video_Views', 'Impression',
    'Cost', 'Clicks', 'Video_Plays_25', 'Video_Plays_50', 'Video_Plays_75', 
    'Video_Plays_100', 'Views']


# Process datasets
report_campaign_region_youtube = process_dataframe(df_yt_filter, youtube_mapping, youtube_numeric_cols)
report_campaign_region_tiktok = process_dataframe(df_tt_filter, tiktok_mapping, tiktok_numeric_cols)
report_campaign_region_facebook = process_dataframe(df_fb_filter, facebook_mapping, facebook_numeric_cols)


# report_campaign_overall_youtube['Platform'] = 'YouTube'
# report_campaign_overall_tiktok['Platform'] = 'TikTok'
# report_campaign_overall_facebook['Platform'] = 'Facebook'

report_campaign_region_total = pd.concat([report_campaign_region_youtube, report_campaign_region_tiktok, report_campaign_region_facebook], ignore_index=True)

# ✅ Step 4: Summarize at higher level (Brand + Campaign)
summary_columns = ['report_date', 'Campaign_phase', 'Category', 'Platform', 'Brand', 'Campaign', 'Format', 'Buying_Method', 'TA', 'Region', 'Plan_Start_Date', 'Plan_End_Date', 'Campaign_code', 'Funnel', 'Audience_group', 'Actual_Start_Date', 'Actual_End_Date']
metric_columns = ['Impression', 'Clicks', 'Views', 'Cost', 'Engagements',
                  'Video_Plays_25', 'Video_Plays_50', 'Video_Plays_75', 'Video_Plays_100']

# Filter only metrics that exist in the dataset
existing_metrics = [col for col in metric_columns if col in report_campaign_region_total.columns]

# Group and aggregate
report_campaign_region_summary = (
    report_campaign_region_total
    .groupby(summary_columns)[existing_metrics]
    .sum()
    .reset_index()
)

report_campaign_region_summary

Unnamed: 0,report_date,Campaign_phase,Category,Platform,Brand,Campaign,Format,Buying_Method,TA,Region,...,Actual_End_Date,Impression,Clicks,Views,Cost,Engagements,Video_Plays_25,Video_Plays_50,Video_Plays_75,Video_Plays_100
0,2024-12-19,,SN,TikTok,ADM,ADMMVLaunch,Reach,CPM,F25-44,26C,...,2025-05-25,0,0,0,0.0,0,0.0,0.0,0.0,0.0
1,2024-12-19,,SN,TikTok,ADM,ADMMVLaunch,Reach,CPM,F25-44,35C,...,2025-05-25,0,0,0,0.0,0,0.0,0.0,0.0,0.0
2,2024-12-19,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Livestream Ads,CPC,P18-54,13C,...,2025-05-04,0,0,0,0.0,0,0.0,0.0,0.0,0.0
3,2024-12-19,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Livestream Ads,CPC,P18-54,13C,...,2025-05-07,0,0,0,0.0,0,0.0,0.0,0.0,0.0
4,2024-12-19,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Livestream Ads,CPC,P18-54,13C,...,2025-05-08,0,0,0,0.0,0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15013,2025-07-05,Phase2,SN,TikTok,FM100,FM100PromoAprMay,Reach,CPM,F25-44,NAT,...,2025-05-17,0,0,0,0.0,0,0.0,0.0,0.0,0.0
15014,2025-07-05,Phase2,SN,YouTube,FM100,FM100PromoSustainFeb,View Reach Campaign,CPM,F25-44,NAT,...,2025-02-25,0,0,0,0.0,0,0.0,0.0,0.0,0.0
15015,2025-07-05,Phase3,SN,Google Ads,FM100,FM100PromoAprMay,View Reach Campaign,CPM,F25-44,NAT,...,2025-05-30,0,0,0,0.0,0,0.0,0.0,0.0,0.0
15016,2025-07-05,Phase3,SN,TikTok,FM100,FM100PromoAprMay,Reach,CPM,F25-44,NAT,...,2025-05-31,0,0,0,0.0,0,0.0,0.0,0.0,0.0


In [255]:
report_campaign_region_summary['Campaign_phase'] = report_campaign_region_summary['Campaign_phase'].replace('NA', '')

In [256]:
report_campaign_summary = report_campaign_region_summary[
    (report_campaign_region_summary["Impression"] != 0) & 
    (report_campaign_region_summary["Cost"] != 0)
]

## GA4

In [21]:
import pandas as pd
import numpy as np 
# Assuming df_ga is your DataFrame loaded previously

# --- 1. Split sessionManualCampaignName ---
print("Processing session_manual_campaign_name...")
target_cols_1 = ['Campaign', 'Category', 'Brand', 'Dentsu', 'Actual_Start_Date']
n_target_1 = len(target_cols_1)

# Perform the split safely, filling NaNs in the source column first
split_data_1 = df_ga['session_manual_campaign_name'].fillna('').astype(str).str.split('_', expand=True)
n_actual_1 = split_data_1.shape[1]
print(f"  Split produced {n_actual_1} columns (expected {n_target_1}).")

# Select the minimum of actual columns produced and target columns
cols_to_assign_1 = min(n_target_1, n_actual_1)

# Assign the data for the columns that exist in both
df_ga[target_cols_1[:cols_to_assign_1]] = split_data_1.iloc[:, :cols_to_assign_1]

# If the split produced fewer columns than expected, fill the remaining target columns with NaN/None
if n_actual_1 < n_target_1:
    print(f"  Padding {n_target_1 - n_actual_1} missing columns.")
    for i in range(n_actual_1, n_target_1):
        df_ga[target_cols_1[i]] = pd.NA # Or np.nan or None or ''


# --- 2. Split sessionManualAdContent ---
print("Processing session_manual_ad_content...")
target_cols_2 = ['Content Angle', 'Tên Content', 'Tên Thiết Kế', 'Creative Format']
n_target_2 = len(target_cols_2)

# Perform the split safely
split_data_2 = df_ga['session_manual_ad_content'].fillna('').astype(str).str.split('_', expand=True)
n_actual_2 = split_data_2.shape[1]
print(f"  Split produced {n_actual_2} columns (expected {n_target_2}).")

cols_to_assign_2 = min(n_target_2, n_actual_2)
df_ga[target_cols_2[:cols_to_assign_2]] = split_data_2.iloc[:, :cols_to_assign_2]

if n_actual_2 < n_target_2:
    print(f"  Padding {n_target_2 - n_actual_2} missing columns.")
    for i in range(n_actual_2, n_target_2):
        df_ga[target_cols_2[i]] = pd.NA


# --- 3. Split sessionManualMedium ---
print("Processing session_manual_medium...") # Corrected column name based on code
target_cols_3 = ['Buying Method', 'Bid Strategy/Objective', 'Targeting Method', 'Audience', 'Age', 'Gender', 'Region']
n_target_3 = len(target_cols_3)

# Check if 'session_manual_mmedium' exists, otherwise use 'sessionManualMedium' or correct it
source_col_3 = 'session_manual_medium' #<-- Make sure this is the EXACT column name in df_ga
if source_col_3 not in df_ga.columns:
     # Attempt common variations or raise error
     if 'sessionManualMedium' in df_ga.columns:
         source_col_3 = 'sessionManualMedium'
     elif 'session_manual_mmedium' in df_ga.columns: # Check the original typo
         source_col_3 = 'session_manual_mmedium'
     else:
         raise KeyError(f"Column for medium split not found. Tried 'session_manual_medium', 'sessionManualMedium', 'session_manual_mmedium'. Please verify column name.")
print(f"  Using source column: {source_col_3}")

# Perform the split safely
split_data_3 = df_ga[source_col_3].fillna('').astype(str).str.split('_', expand=True)
n_actual_3 = split_data_3.shape[1]
print(f"  Split produced {n_actual_3} columns (expected {n_target_3}).")

cols_to_assign_3 = min(n_target_3, n_actual_3)
df_ga[target_cols_3[:cols_to_assign_3]] = split_data_3.iloc[:, :cols_to_assign_3]

if n_actual_3 < n_target_3:
    print(f"  Padding {n_target_3 - n_actual_3} missing columns.")
    for i in range(n_actual_3, n_target_3):
        df_ga[target_cols_3[i]] = pd.NA


# --- 4. Split sessionManualSource (Platform_Objective-BuyingType) ---
print("Processing session_manual_source...")
# First split by '_'
split_source_1 = df_ga['session_manual_source'].fillna('_').astype(str).str.split('_', n=1, expand=True) # n=1 ensures max 2 columns

# Assign temporary columns, handling cases where split might not occur
df_ga['Platform_temp'] = split_source_1[0]
if split_source_1.shape[1] > 1:
    df_ga['Objective-BuyingType_temp'] = split_source_1[1]
else:
    df_ga['Objective-BuyingType_temp'] = '' # Or pd.NA

# Then split the second part by '-'
split_source_2 = df_ga['Objective-BuyingType_temp'].fillna('-').astype(str).str.split(' ', n=1, expand=True) # n=1 ensures max 2 columns

# Assign final columns
df_ga['Platform'] = df_ga['Platform_temp'] # Assign directly from first split
df_ga['Objective'] = split_source_2[0]
if split_source_2.shape[1] > 1:
    df_ga['Buying Type'] = split_source_2[1]
else:
    df_ga['Buying Type'] = '' # Or pd.NA

# Drop intermediate columns
df_ga.drop(columns=['Platform_temp', 'Objective-BuyingType_temp'], inplace=True)

print("Processing finished.")
# Display the first few rows or info of the modified DataFrame
# print(df_ga.head())
# print(df_ga.info())

Processing session_manual_campaign_name...
  Split produced 10 columns (expected 5).
Processing session_manual_ad_content...
  Split produced 5 columns (expected 4).
Processing session_manual_medium...
  Using source column: session_manual_medium
  Split produced 7 columns (expected 7).
Processing session_manual_source...
Processing finished.


In [18]:
df_lookup = pd.read_excel('/Users/khanhvx/Downloads/VNM_UTM dimensions_Jun 19.xlsx')

In [22]:
df_lookup['code'] = df_lookup['code'].astype(str)

# Create the mapping dictionary from df_lookup
decode_map = dict(zip(df_lookup['code'], df_lookup['name']))

# List of columns to decode
cols_to_decode = [
    'Campaign', 'Category', 'Brand', 'Dentsu', 'Start_Date',
    'Content Angle', 'Tên Content', 'Tên Thiết Kế', 'Creative Format',
    'Buying Method', 'Bid Strategy/Objective', 'Targeting Method',
    'Audience', 'Age', 'Gender', 'Region',
    'Platform', 'Objective-BuyingType', 'Objective'
]

# 4. Decode in-place by mapping string version of the value
for col in cols_to_decode:
    if col in df_ga.columns:
        df_ga[col] = df_ga[col].astype(str).map(decode_map).fillna(df_ga[col])

df_ga

Unnamed: 0,report_date,session_manual_campaign_name,session_manual_ad_content,session_manual_medium,session_manual_source,sessions,engaged_sessions,add_to_carts,purchase_revenue,purchaser_rate,...,Bid Strategy/Objective,Targeting Method,Audience,Age,Gender,Region,Platform,Objective,Buying Type,TA
0,2025-04-28,3304_sn_fm_3011_2025/04/21,3305_3305_3305_226,86_85_52_60_1545_359_177,184_362,1299,1262,2,0.0,0.0,...,Impression,Affinity,none,2544,Female,Toàn Quốc,Youtube,VRC,,F25-44
1,2025-04-28,3340_sn_greenfarm_3011_2025/04/25,323_323_323_226,185_6_71_60_25_359_177,8_669,755,29,0,0.0,0.0,...,View,Broad,none,18-24,Female,Toàn Quốc,Facebook,Awareness,,F18--24
2,2025-04-28,3211_sn_fm_3011_2025/03/26,3212_3212_3212_68,86_30_52_60_1545_359_177,184_362,744,730,2,0.0,0.0,...,Reach,Affinity,none,2544,Female,Toàn Quốc,Youtube,VRC,,F25-44
3,2025-04-28,3282_sbnl_surepreventgold_3011_2025/04/21,3296_3259_1113_226,86_85_201_60_1545_359_2630,8_197,510,505,5,0.0,0.0,...,Impression,Prospecting,none,2544,Female,TOP20C,Facebook,Reach auction,,F25-44
4,2025-04-28,3224_sn_greenfarm_3011_2025/03/28,323_323_323_na,86_85_71_60_1545_359_177,184_362,474,464,1,0.0,0.0,...,Impression,Broad,none,2544,Female,Toàn Quốc,Youtube,VRC,,F25-44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13324,2025-06-16,3387_sn_adm_3011_2025/05/12,323NewPack_3133_3133_32,86_85_52_60_1545_359_378,184_362,1,1,0,0.0,0.0,...,Impression,Affinity,none,2544,Female,TOPC,Youtube,VRC,,F25-44
13325,2025-06-16,3390_bdd_ridielacgold_3011_2025/05/17,107_107_107_226,86_85_370_60_na_359_na,76_368,1,1,0,0.0,0.0,...,Impression,Interest,none,na,Female,na,Tiktok,Infeed,,Fna-
13326,2025-06-16,3419_stv_stv_supersoy_3011_2025/06/12,3420_323_323_32,86_85_52_60_1545_359_39,76_368,1,1,0,0.0,0.0,...,Impression,Affinity,none,2544,Female,Miền Bắc,Tiktok,Infeed,,F25-44
13327,2025-06-16,3419_stv_stv_supersoy_3011_2025/06/12,3420_323_323_32,86_85_52_60_155_359_39,184_362,1,1,0,0.0,0.0,...,Impression,Affinity,none,45-55,Female,Miền Bắc,Youtube,VRC,,F45--55


In [24]:
df_ga_sn = df_ga[df_ga['session_manual_campaign_name'].str.contains('sn_')]
df_ga_sn.Campaign.unique()

array(['FM100PromoMay', 'GFPromotionView', 'FM100Apr', 'GFCorePromotion',
       'FM100Promo', 'GFCore', 'GFYogurt', 'GFPromotion',
       'GFPromotionMarApr', 'GFSustain', 'SocialAWO', 'greenfarmpromote',
       'Promotion', 'FinoPromoT1', 'cpasmarch2025greenfarmhipronewcus',
       'GreenFarm', '2025DSASEM', 'PromotionT325', 'FM100LAUNCHING',
       '2025D2COFFLINEJAN', 'DIRECTOFF20PERCENT',
       'cpasmarch2025greenfarmhiprooldcus', 'FinoADMMVLaunch',
       'ADMMVLaunch', 'A2Teasing', 'Promo', 'ADMMVLaunchJunJul',
       'STDSocialAWOJun'], dtype=object)

In [56]:
df_ga_sn.to_excel('/Users/khanhvx/Documents/Work/SC/VNM/Campaign report/region-breakdown ga1.xlsx', index=False)

In [181]:
df_ga_revised = pd.read_excel('/Users/khanhvx/Documents/Work/SC/VNM/Campaign report/region-breakdown ga.xlsx')

In [182]:
df_ga_revised['Brand'] = df_ga_revised['Brand'].str.upper()

group_cols = [
   'report_date', 'session_manual_campaign_name',
       'session_manual_ad_content', 'session_manual_medium',
       'session_manual_source', 'Campaign', 'Category', 'Brand', 'Dentsu', 'Start_Date',
    'Content Angle', 'ten', 'ten thiet ke', 'Creative Format',
    'Buying Method', 'Objective', 'Targeting Method',
    'Audience', 'Age', 'Gender', 'Region', 'Funnel', 'Campaign_code', 'TA',
    'Platform', 'Format'
]

numeric_cols = [
    'sessions', 'engaged_sessions', 'add_to_carts',
       'purchase_revenue', 'purchaser_rate', 'total_purchasers',
       'screen_page_views', 'ecommerce_purchases'
]

# --- Step 11: Sort + compute cumulative sums ---
groupby_cols_for_cumsum = [col for col in group_cols if col != 'report_date']
df_ga_revised_cumsum = df_ga_revised.sort_values(by=groupby_cols_for_cumsum + ['report_date'])

for col in numeric_cols:
    if col != 'reach':
        df_ga_revised_cumsum[f'cumsum_{col}'] = df_ga_revised_cumsum.groupby(groupby_cols_for_cumsum)[col].cumsum()

In [183]:
df_ga_revised_cumsum.columns = df_ga_revised_cumsum.columns.str.replace(' ', '_', regex=False)
df_ga_revised_cumsum.columns

Index(['report_date', 'session_manual_campaign_name',
       'session_manual_ad_content', 'session_manual_medium',
       'session_manual_source', 'sessions', 'engaged_sessions', 'add_to_carts',
       'purchase_revenue', 'purchaser_rate', 'total_purchasers',
       'screen_page_views', 'ecommerce_purchases', 'Campaign', 'Category',
       'Brand', 'Dentsu', 'Start_Date', 'Content_Angle', 'ten', 'ten_thiet_ke',
       'Creative_Format', 'Buying_Method', 'Objective', 'Targeting_Method',
       'Audience', 'Age', 'Gender', 'Region', 'Platform', 'Format',
       'Buying_Type', 'Funnel', 'Campaign_code', 'TA', 'cumsum_sessions',
       'cumsum_engaged_sessions', 'cumsum_add_to_carts',
       'cumsum_purchase_revenue', 'cumsum_purchaser_rate',
       'cumsum_total_purchasers', 'cumsum_screen_page_views',
       'cumsum_ecommerce_purchases'],
      dtype='object')

In [55]:

DB_TYPE = 'mysql'  # Change to 'postgresql' for PostgreSQL
DB_HOST = '125.212.245.36'
DB_PORT = '3306'  # Change for different databases
DB_USER = 'dashboard_report_vinamilk'
DB_PASS = '6vtRhNxa79zPsfs5'
DB_NAME = 'dashboard_report_vinamilk'

try:
    conn = mysql.connector.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASS,
        database=DB_NAME,
        charset="utf8mb4"
    )
    
    if conn.is_connected():
        cursor = conn.cursor()
        print("Connected to MySQL successfully!")
    else:
        print("Failed to connect.")

except mysql.connector.Error as e:
    print(f"Error: {e}")

# Create SQLAlchemy engine
cursor = conn.cursor()
cursor.execute("SET NAMES utf8mb4;") 
cursor.execute("SET CHARACTER SET utf8mb4;")
engine = create_engine(f"mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}")

upload_to_mysql(df_ga_revised_cumsum, 'report_campaign_ga3', engine, if_exists='replace')

Connected to MySQL successfully!
✅ Data successfully uploaded to `report_campaign_ga3` with correct column types!


In [184]:
df_ga = df_ga_revised.copy()

In [185]:
# Step 1: Replace 'Toàn Quốc' with 'NAT' in Geo
df_ga['Region'] = df_ga['Region'].replace('Toàn Quốc', 'NAT')

# Step 2: Create TA column from Gender and Age
def make_ta(gender, age):
    try:
        first_letter = gender[0].upper()
        age_range = f"{age[:2]}-{age[2:]}"
        return f"{first_letter}{age_range}"
    except:
        return None  # or you can return f"{gender}_{age}" as fallback

df_ga['TA'] = df_ga.apply(lambda row: make_ta(row['Gender'], row['Age']), axis=1)
df_ga

Unnamed: 0,report_date,session_manual_campaign_name,session_manual_ad_content,session_manual_medium,session_manual_source,sessions,engaged_sessions,add_to_carts,purchase_revenue,purchaser_rate,...,Audience,Age,Gender,Region,Platform,Format,Buying Type,Funnel,Campaign_code,TA
0,2025-04-28,3304_sn_fm_3011_2025/04/21,3305_3305_3305_226,86_85_52_60_1545_359_177,184_362,1299,1262,2,0,0.0,...,MIXED\nF25-44,2544,Female,NAT,Youtube,View Reach Campaign,,Awareness,SN-FM100-FM100PromoAprMay-2403-3105,F25-44
1,2025-04-28,3340_sn_greenfarm_3011_2025/04/25,323_323_323_226,185_6_71_60_25_359_177,8_669,755,29,0,0,0.0,...,none,18-24,Female,NAT,Facebook,Awareness,,Awareness,,F18--24
2,2025-04-28,3211_sn_fm_3011_2025/03/26,3212_3212_3212_68,86_30_52_60_1545_359_177,184_362,744,730,2,0,0.0,...,MIXED\nF25-44,2544,Female,NAT,Youtube,View Reach Campaign,,Awareness,SN-FM100-FM100PromoAprMay-2403-3105,F25-44
3,2025-04-28,3224_sn_greenfarm_3011_2025/03/28,323_323_323_na,86_85_71_60_1545_359_177,184_362,474,464,1,0,0.0,...,none,2544,Female,NAT,Youtube,View Reach Campaign,,Awareness,,F25-44
4,2025-04-28,3304_sn_fm_3011_2025/04/21,3305_3305_3305_226,86_85_52_60_1545_359_177,76_367,424,422,3,0,0.0,...,Group3\nF25-44,2544,Female,NAT,Tiktok,Reach,,Awareness,SN-FM100-FM100PromoAprMay-2403-3105,F25-44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3613,2025-06-16,3224_sn_greenfarm_3011_2025/03/28,323_323_323_na,86_85_71_60_1545_359_177,184_362,1,1,0,0,0.0,...,none,2544,Female,NAT,Youtube,View Reach Campaign,,Awareness,,F25-44
3614,2025-06-16,3230_sn_greenfarm_3011_2025/04/02,323_323_323_226,86_85_71_60_268_359_377,8_669,1,1,0,0,0.0,...,none,1844,Female,TOP6C,Facebook,Awareness,,Awareness,,F18-44
3615,2025-06-16,3304_sn_fm_3011_2025/04/21,3305_3305_3305_226,86_85_52_60_1545_359_177,8_197,1,1,0,0,0.0,...,Group1\nF25-44,2544,Female,NAT,Facebook,Reach,,Awareness,SN-FM100-FM100PromoAprMay-2403-3105,F25-44
3616,2025-06-16,3387_sn_adm_3011_2025/05/12,323NewPack_3133_3133_32,86_85_52_60_1545_359_378,184_362,1,1,0,0,0.0,...,MIXED\nF25-44,2544,Female,35C,Youtube,View Reach Campaign,x,Awareness,SN-ADM-ADMMVLaunch-1205-2505,F25-44


In [186]:
df_ga_summary = df_ga.groupby([
    'report_date', 
    'Campaign', 
    'Category', 
    'Brand', 
    'Start_Date', 
    'Buying Method', 
    'Region', 
    'Platform', 
    'TA'
])[[
    'sessions', 
    'engaged_sessions', 
    'add_to_carts', 
    'purchase_revenue', 
    'ecommerce_purchases', 
    'screen_page_views'
]].sum().reset_index()

df_ga_summary = df_ga_summary.rename(columns={
    'Start_Date': 'Actual_Start_Date',
    'Buying Method': 'Buying_Method',
})
df_ga_summary


Unnamed: 0,report_date,Campaign,Category,Brand,Actual_Start_Date,Buying_Method,Region,Platform,TA,sessions,engaged_sessions,add_to_carts,purchase_revenue,ecommerce_purchases,screen_page_views
0,2025-01-03,Promotion,sn,FM100,2025/01/03,CPM,NAT,Tiktok,F25-44,2,2,0,0,0,0
1,2025-01-03,Promotion,sn,FM100,2025/01/03,CPR,NAT,Facebook,F25-44,17,13,0,0,0,4
2,2025-01-04,FM100Promosustainjan,sn,FM100,2025/01/03,CPM,NAT,Youtube,F25-44,1003,994,1,0,0,20
3,2025-01-04,Promotion,sn,FM100,2025/01/03,CPM,NAT,Tiktok,F25-44,353,352,4,0,0,6
4,2025-01-04,Promotion,sn,FM100,2025/01/03,CPM,NAT,Youtube,F25-44,785,774,3,0,0,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2500,2025-06-22,GFCorePromotion,sn,GREENFARM,2025/03/28,CPM,NAT,Youtube,F25-44,20,20,0,0,0,0
2501,2025-06-22,GFSustain,sn,GREENFARM,2025/02/22,CPM,TOP37C,Youtube,F25-44,1,1,0,0,0,0
2502,2025-06-22,GFYogurt,sn,GREENFARM,2025/04/02,CPM,TOP6C,Youtube,F18-44,9,9,0,0,0,0
2503,2025-06-22,GFYogurt,sn,GREENFARM,2025/04/02,CPV,TOP6C,Youtube,F18-44,6,6,0,0,0,0


In [187]:
columns_to_drop = [
    'session_manual_campaign_name', 'session_manual_ad_content',
    'session_manual_medium', 'session_manual_source', 'Dentsu',
    'Tên Content', 'Tên Thiết Kế', 'Bid Strategy/Objective',
    'Objective', 'Buying Type', 'Targeting Method', 'Gender', 'Age'
]

df_ga = df_ga.drop(columns=columns_to_drop)

KeyError: "['Tên Content', 'Tên Thiết Kế', 'Bid Strategy/Objective'] not found in axis"

### Merge region and GA

In [41]:
import pandas as pd

merge_cols = ['Campaign', 'report_date', 'Category', 'Platform', 'Brand', 'TA', 'Buying_Method', 'Region']

# --- Normalize Helper Function ---
def normalize_merge_keys(df, cols, prefix):
    df = df.copy()
    for col in cols:
        new_col = f'{prefix}_{col}'
        if pd.api.types.is_datetime64_any_dtype(df[col]):
            df[new_col] = pd.to_datetime(df[col]).dt.strftime('%Y-%m-%d')
        else:
            df[new_col] = (
                df[col].astype(str)
                .str.strip().str.lower()
                .str.replace(r'\s+', '', regex=True)
            )
    return df

# --- Normalize both datasets ---
df_ga_summary = normalize_merge_keys(df_ga_summary, merge_cols, 'ga')
report_campaign_region_summary = normalize_merge_keys(report_campaign_region_summary, merge_cols, 'total')

# --- Create key columns for matching ---
df_ga_summary['merge_key'] = df_ga_summary[[f'ga_{col}' for col in merge_cols]].agg('_'.join, axis=1)
report_campaign_region_summary['merge_key'] = report_campaign_region_summary[[f'total_{col}' for col in merge_cols]].agg('_'.join, axis=1)

# --- Prepare for merge ---
df_ga_summary = df_ga_summary.copy()
report_campaign_region_summary = report_campaign_region_summary.copy()

# --- Dictionary to hold assigned rows ---
assigned_rows = {}

# --- Build index on GA for fast lookup ---
ga_lookup = df_ga_summary.set_index('merge_key')

# --- Match one-by-one and pop from GA summary once used ---
matched_ga_rows = []
used_keys = set()

for idx, row in report_campaign_region_summary.iterrows():
    key = row['merge_key']
    if key in ga_lookup.index and key not in used_keys:
        ga_row = ga_lookup.loc[key]
        if isinstance(ga_row, pd.DataFrame):  # in case of duplicate keys
            ga_row = ga_row.iloc[0]
        matched_ga_rows.append(ga_row)
        used_keys.add(key)
    else:
        matched_ga_rows.append(pd.Series([pd.NA] * df_ga_summary.shape[1], index=df_ga_summary.columns))

# --- Combine result ---
matched_ga_df = pd.DataFrame(matched_ga_rows).reset_index(drop=True)
merged_region_df = pd.concat([report_campaign_region_summary.reset_index(drop=True), matched_ga_df], axis=1)

# --- Clean up helper columns ---
helper_cols = [f'total_{col}' for col in merge_cols] + [f'ga_{col}' for col in merge_cols] + ['merge_key']
merged_region_df.drop(columns=helper_cols, inplace=True, errors='ignore')

# --- Done ---
merged_region_df

Unnamed: 0,report_date,Campaign_phase,Category,Platform,Brand,Campaign,Format,Buying_Method,TA,Region,...,Buying_Method.1,Region.1,Platform.1,TA.1,sessions,engaged_sessions,add_to_carts,purchase_revenue,ecommerce_purchases,screen_page_views
0,2024-12-19,,SN,TikTok,ADM,ADMMVLaunch,Reach,CPM,F25-44,26C,...,,,,,,,,,,
1,2024-12-19,,SN,TikTok,ADM,ADMMVLaunch,Reach,CPM,F25-44,35C,...,,,,,,,,,,
2,2024-12-19,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Livestream Ads,CPC,P18-54,13C,...,,,,,,,,,,
3,2024-12-19,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Livestream Ads,CPC,P18-54,13C,...,,,,,,,,,,
4,2024-12-19,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Livestream Ads,CPC,P18-54,13C,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14773,2025-07-01,Phase2,SN,TikTok,FM100,FM100PromoAprMay,Reach,CPM,F25-44,NAT,...,,,,,,,,,,
14774,2025-07-01,Phase2,SN,YouTube,FM100,FM100PromoSustainFeb,View Reach Campaign,CPM,F25-44,NAT,...,,,,,,,,,,
14775,2025-07-01,Phase3,SN,Google Ads,FM100,FM100PromoAprMay,View Reach Campaign,CPM,F25-44,NAT,...,,,,,,,,,,
14776,2025-07-01,Phase3,SN,TikTok,FM100,FM100PromoAprMay,Reach,CPM,F25-44,NAT,...,,,,,,,,,,


In [42]:
# Drop duplicate columns already captured from GA summary
cols_to_exclude = set(merge_cols + ['Actual_Start_Date'])
ga_only_cols = [col for col in df_ga_summary.columns if col not in cols_to_exclude and col != 'merge_key']

# Rebuild merged dataframe with deduplicated base
merged_region_df = pd.concat([
    report_campaign_region_summary.reset_index(drop=True),
    matched_ga_df[ga_only_cols].reset_index(drop=True)
], axis=1)

In [43]:
merged_region_df['Audience'] = merged_region_df['Audience_group'].astype(str) + '\n' + merged_region_df['TA'].astype(str)
merged_region_df

Unnamed: 0,report_date,Campaign_phase,Category,Platform,Brand,Campaign,Format,Buying_Method,TA,Region,...,screen_page_views,ga_Campaign,ga_report_date,ga_Category,ga_Platform,ga_Brand,ga_TA,ga_Buying_Method,ga_Region,Audience
0,2024-12-19,,SN,TikTok,ADM,ADMMVLaunch,Reach,CPM,F25-44,26C,...,,,,,,,,,,MIXED\nF25-44
1,2024-12-19,,SN,TikTok,ADM,ADMMVLaunch,Reach,CPM,F25-44,35C,...,,,,,,,,,,MIXED\nF25-44
2,2024-12-19,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Livestream Ads,CPC,P18-54,13C,...,,,,,,,,,,MIXED\nP18-54
3,2024-12-19,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Livestream Ads,CPC,P18-54,13C,...,,,,,,,,,,MIXED\nP18-54
4,2024-12-19,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Livestream Ads,CPC,P18-54,13C,...,,,,,,,,,,MIXED\nP18-54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14773,2025-07-01,Phase2,SN,TikTok,FM100,FM100PromoAprMay,Reach,CPM,F25-44,NAT,...,,,,,,,,,,Group3 \nF25-44
14774,2025-07-01,Phase2,SN,YouTube,FM100,FM100PromoSustainFeb,View Reach Campaign,CPM,F25-44,NAT,...,,,,,,,,,,MIXED\nF25-44
14775,2025-07-01,Phase3,SN,Google Ads,FM100,FM100PromoAprMay,View Reach Campaign,CPM,F25-44,NAT,...,,,,,,,,,,MIXED\nF25-44
14776,2025-07-01,Phase3,SN,TikTok,FM100,FM100PromoAprMay,Reach,CPM,F25-44,NAT,...,,,,,,,,,,Group1 ExcludeGroup3 \nF25-44


In [45]:
merged_region_df = merged_region_df[
    (merged_region_df["Impression"] != 0) & 
    (merged_region_df["Cost"] != 0)
]

In [46]:
import pandas as pd

# Assuming df_ga_summary and report_campaign_summary are already loaded

merge_cols = ['Campaign', 'Funnel', 'Category', 'Region', 'Platform', 'Brand', 'TA', 'Buying_Method', 'Plan_Start_Date', 'Actual_Start_Date', 'Audience_group']

# --- Refined Normalization ---
def normalize_cols(df, cols, prefix):
    for col in cols:
        # Handle dates explicitly first if they are datetime objects
        if pd.api.types.is_datetime64_any_dtype(df[col]):
             # Normalize to 'YYYY-MM-DD' format. Adjust if needed.
            df[f'{prefix}_{col}'] = pd.to_datetime(df[col]).dt.strftime('%Y-%m-%d')
        else:
            # Apply string normalization for others
            df[f'{prefix}_{col}'] = df[col].astype(str).str.strip().str.lower().str.replace(r'\s+', '', regex=True)
            # Handle potential 'nan' strings if they should be treated as actual NaN for merging (optional)
            # df[f'{prefix}_{col}'] = df[f'{prefix}_{col}'].replace('nan', pd.NA)
    return df

# Apply normalization
df_plan = normalize_cols(df_plan.copy(), merge_cols, 'plan') # Use .copy() to avoid SettingWithCopyWarning
merged_df2 = normalize_cols(merged_region_df.copy(), merge_cols, 'merge')

# Step 2: Merge using the helper columns
merged_df2 = pd.merge(
    merged_df2,
    df_plan,
    how='left', # TRY 'left' or 'outer' HERE FOR DEBUGGING
    right_on=[f'plan_{col}' for col in merge_cols],
    left_on=[f'merge_{col}' for col in merge_cols],
    suffixes=('', '_total')
)


# --- End Inspection ---


# Step 3: Drop helper columns (only if merge looks okay)
if not merged_df2.empty:
     merged_df2 = merged_df2.drop(columns=[f'plan_{col}' for col in merge_cols] + [f'merge_{col}' for col in merge_cols] + [f'{col}_total' for col in merge_cols])

merged_df2

Unnamed: 0,report_date,Campaign_phase,Category,Platform,Brand,Campaign,Format,Buying_Method,TA,Region,...,frequency_estimate_week,frequency_combine_estimate_week,er_estimate,er_bm,vtr_estimate,vtr_bm,ctr_estimate,ctr_bm,exchange_rate,Actual_End_Date_total
0,2025-01-03,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,,,,,,,,,,NaT
1,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,,,,,,,,,,NaT
2,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,,,,,,,,,,NaT
3,2025-01-04,Phase1,SN,TikTok,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,,,,,,,,,,NaT
4,2025-01-04,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,NAT,...,,,,,,,,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,2025-06-23,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,3643.643644,,,,0.013,,0.0135,0.017078,26000.0,2025-06-30
1814,2025-06-24,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,3643.643644,,,,0.013,,0.0135,0.017078,26000.0,2025-06-30
1815,2025-06-25,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,3643.643644,,,,0.013,,0.0135,0.017078,26000.0,2025-06-30
1816,2025-06-26,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,3643.643644,,,,0.013,,0.0135,0.017078,26000.0,2025-06-30


In [47]:
if not merged_df2.empty:
    for col in merge_cols:
        merged_df2 = merged_df2[
            merged_df2[col] != f"norm_{col}"
        ]

In [48]:
merged_df2 = merged_df2.loc[:, ~merged_df2.columns.str.contains('norm_')]
merged_df2 = merged_df2.loc[:, ~merged_df2.columns.str.contains('total_')]
merged_df2 = merged_df2.loc[:, ~merged_df2.columns.str.contains('_total')]

In [49]:
# Columns to keep from df_plan (excluding merge keys)
columns_to_keep = ['ctr_bm', 'vtr_bm', 'er_bm']  # example columns you want from df_plan

# Identify the columns that exist in merged_df2
existing_plan_cols_to_drop = [
    f'plan_{col}' for col in columns_to_drop if f'plan_{col}' in merged_df2.columns
]

# Drop only those
merged_df2 = merged_df2.drop(columns=existing_plan_cols_to_drop)

NameError: name 'columns_to_drop' is not defined

In [50]:
merged_df2 = merged_df2.drop_duplicates(subset=['Platform', 'report_date', 'Campaign', 'Impression', 'Clicks'])
merged_df2

Unnamed: 0,report_date,Campaign_phase,Category,Platform,Brand,Campaign,Format,Buying_Method,TA,Region,...,frequency_estimate,frequency_estimate_week,frequency_combine_estimate_week,er_estimate,er_bm,vtr_estimate,vtr_bm,ctr_estimate,ctr_bm,exchange_rate
0,2025-01-03,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,,,,,,,,,,
1,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,,,,,,,,,,
2,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,,,,,,,,,,
3,2025-01-04,Phase1,SN,TikTok,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,,,,,,,,,,
4,2025-01-04,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,NAT,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,2025-06-23,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,40080.08008,3643.643644,,,,0.013,,0.0135,0.017078,26000.0
1814,2025-06-24,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,40080.08008,3643.643644,,,,0.013,,0.0135,0.017078,26000.0
1815,2025-06-25,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,40080.08008,3643.643644,,,,0.013,,0.0135,0.017078,26000.0
1816,2025-06-26,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,40080.08008,3643.643644,,,,0.013,,0.0135,0.017078,26000.0


In [None]:
# merged_df['Start_Date'] = merged_df['Start_Date'].replace(pd.to_datetime('2025-04-03'), pd.to_datetime('2025-03-07'))
# merged_df['Campaign_code'] = merged_df['Campaign_code'].str.replace('-0304-0405', '-0703-0405')
# merged_df['Campaign_code'] = merged_df['Campaign_code'].str.replace('Optimum Gold', 'Optimum')
# merged_df['Campaign_code'] = merged_df['Campaign_code'].str.replace('Optimum Colos', 'Optimum')


In [51]:
from sqlalchemy import create_engine
import pandas as pd

DB_TYPE = 'mysql'  # Change to 'postgresql' for PostgreSQL
DB_HOST = '125.212.245.36'
DB_PORT = '3306'  # Change for different databases
DB_USER = 'dashboard_report_vinamilk'
DB_PASS = '6vtRhNxa79zPsfs5'
DB_NAME = 'dashboard_report_vinamilk'

try:
    conn = mysql.connector.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASS,
        database=DB_NAME,
        charset="utf8mb4"
    )
    
    if conn.is_connected():
        cursor = conn.cursor()
        print("Connected to MySQL successfully!")
    else:
        print("Failed to connect.")

except mysql.connector.Error as e:
    print(f"Error: {e}")

# Create SQLAlchemy engine
cursor = conn.cursor()
cursor.execute("SET NAMES utf8mb4;") 
cursor.execute("SET CHARACTER SET utf8mb4;")
engine = create_engine(f"mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}")

def upload_to_mysql(df, table_name, engine, if_exists='replace'):
    """
    Uploads a DataFrame to a MySQL table with correct column types based on the DataFrame.

    :param df: Pandas DataFrame to upload
    :param table_name: Name of the table in the database
    :param engine: SQLAlchemy engine object
    :param if_exists: What to do if the table already exists ('fail', 'replace', 'append')
    """
    try:
        with engine.connect() as conn:
            metadata = MetaData()

            # Define table schema based on DataFrame's column types
            columns = []

            for col in df.columns:
                dtype = df[col].dtype
                
                # Map Pandas dtype to MySQL/SQLAlchemy types
                if pd.api.types.is_integer_dtype(dtype):
                    col_type = Integer
                elif pd.api.types.is_float_dtype(dtype):
                    col_type = Float
                elif pd.api.types.is_datetime64_any_dtype(dtype):
                    col_type = DateTime
                elif pd.api.types.is_bool_dtype(dtype):
                    col_type = Boolean
                elif pd.api.types.is_string_dtype(dtype):
                    col_type = Text(collation="utf8mb4_unicode_ci")  # UTF-8 support
                else:
                    col_type = String(255, collation="utf8mb4_unicode_ci")  # Default fallback

                columns.append(Column(col, col_type))

            table = Table(table_name, metadata, *columns, extend_existing=True)

            # Create the table with utf8mb4 encoding
            metadata.create_all(conn)

        # Convert object columns to string before uploading
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = df[col].astype(str)

        # Upload DataFrame to MySQL with proper type mapping
        dtype_mapping = {
            col: (
                sqlalchemy.types.Integer if pd.api.types.is_integer_dtype(df[col]) else
                sqlalchemy.types.Float if pd.api.types.is_float_dtype(df[col]) else
                sqlalchemy.types.DateTime if pd.api.types.is_datetime64_any_dtype(df[col]) else
                sqlalchemy.types.Boolean if pd.api.types.is_bool_dtype(df[col]) else
                sqlalchemy.types.Text(collation="utf8mb4_unicode_ci") if pd.api.types.is_string_dtype(df[col]) else
                sqlalchemy.types.String(255, collation="utf8mb4_unicode_ci")
            )
            for col in df.columns
        }

        df.to_sql(table_name, engine, if_exists=if_exists, index=False, dtype=dtype_mapping)

        print(f"✅ Data successfully uploaded to `{table_name}` with correct column types!")

    except Exception as e:
        print(f"❌ Error uploading data to `{table_name}`: {e}")

# Example usage:
# engine = create_engine(f"mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}")
upload_to_mysql(merged_df2, 'report_campaign_region', engine)

Connected to MySQL successfully!
✅ Data successfully uploaded to `report_campaign_region` with correct column types!


# Creative

In [257]:
df_tt_creative.rename(columns={'tt_preview_url': 'thumbnail_url'}, inplace=True)

In [258]:
df_creative = pd.concat([df_fb_creative, df_gg_creative, df_tt_creative], ignore_index=True)

In [259]:
import pandas as pd

def process_dataframe(df, column_mapping, numeric_cols, extra_processing=None, column_order=None):
    """General function to process dataframes."""
    # Select and rename columns
    existing_columns = [col for col in column_mapping if col in df.columns]
    df = df[existing_columns].rename(columns=column_mapping).copy()
    
    # Convert 'Date' column
    df['report_date'] = pd.to_datetime(df['report_date'], errors='coerce')
    df['report_date'] = pd.to_datetime(df['report_date'], errors='coerce').dt.date

    # Convert numeric columns
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Apply dataset-specific processing
    if extra_processing:
        extra_processing(df)
    
    # Ensure all expected columns exist
    if column_order:
        for col in column_order:
            if col not in df.columns:
                df[col] = None
        df = df[column_order]
    
    return df


# Define column mappings & numeric columns for each dataset
youtube_mapping = {'report_date': 'report_date',
    'ad_id': 'ad_id',
    'Category': 'Category',
    'Funnel': 'Funnel',
    'Platform': 'Platform',
    'Brand Name': 'Brand',
    'placement': 'Video_Range',
    'creative_type': 'Creative_Type',
    'creative_format': 'Creative_Format',
    'creative_length': 'Creative_Length',
    'buying_model': 'Buying_Method',
    'TA': 'TA',
    'Campaign phase': 'Campaign_phase',
    'Audience group': 'Audience_group',
    'Region': 'Region',
    'campaign_name': 'Campaign_Name',
    'camp_code': 'Campaign_code',
    'Campaign Name': 'Campaign',
    'Plan Start Date': 'Plan_Start_Date',
    'Plan End Date': 'Plan_End_Date',
    'Actual Start Date': 'Actual_Start_Date',
    'Actual End Date': 'Actual_End_Date',
    'freetext_2': 'freetext_2',
    'freetext_3': 'freetext_3',
    'camp_type': 'Format',
    'impressions': 'Impression',
    'clicks': 'Clicks',
    'views': 'Views',
    'currency_code': 'Currency_Code',
    'Cost': 'Cost',
    'engagements': 'Engagements',
    'video_25': 'Video_Plays_25',
    'video_50': 'Video_Plays_50',
    'video_75': 'Video_Plays_75',
    'video_100': 'Video_Plays_100'}  
tiktok_mapping = {'report_date': 'report_date',
    'ad_id': 'ad_id',
    'Category': 'Category',
    'Funnel': 'Funnel',
    'Platform': 'Platform',
    'Brand Name': 'Brand',
    'TA': 'TA',
    'Campaign phase': 'Campaign_phase',
    'Audience group': 'Audience_group',
    'Region': 'Region',
    'camp_type': 'Format',
    'creative_type': 'Creative_Type',
    'creative_format': 'Creative_Format',
    'creative_length': 'Creative_Length',
    'buying_model': 'Buying_Method',
    'campaign_name': 'Campaign_Name',
    'camp_code': 'Campaign_code',
    'Campaign Name': 'Campaign',
    'Plan Start Date': 'Plan_Start_Date',
    'Plan End Date': 'Plan_End_Date',
    'Actual Start Date': 'Actual_Start_Date',
    'Actual End Date': 'Actual_End_Date',
    'freetext_2': 'freetext_2',
    'freetext_3': 'freetext_3',
    'spend': 'Cost',
    'impressions': 'Impression',
    'clicks_destination': 'Clicks',
    'engagements': 'Engagements',
    '2_second_video_views': '23s_Video_Views',
    '6_second_video_views': 'Views',
    'engaged_view': '6s_Engaged_Views',
    'video_views_p25': 'Video_Plays_25',
    'video_views_p50': 'Video_Plays_50',
    'video_views_p75': 'Video_Plays_75',
    'video_views_p100': 'Video_Plays_100'}
facebook_mapping = {'report_date': 'report_date',
    'ad_id': 'ad_id',
    'Category': 'Category',
    'Funnel': 'Funnel',
    'Platform': 'Platform',
    'Brand Name': 'Brand',
    'camp_type': 'Format',
    'creative_type': 'Creative_Type',
    'creative_format': 'Creative_Format',
    'creative_length': 'Creative_Length',
    'buying_model': 'Buying_Method',
    'TA': 'TA',
    'Campaign phase': 'Campaign_phase',
    'Audience group': 'Audience_group',
    'Region': 'Region',
    'campaign_name': 'Campaign_Name',
    'camp_code': 'Campaign_code',
    'Campaign Name': 'Campaign',
    'Plan Start Date': 'Plan_Start_Date',
    'Plan End Date': 'Plan_End_Date',
    'Actual Start Date': 'Actual_Start_Date',
    'Actual End Date': 'Actual_End_Date',
    'freetext_2': 'freetext_2',
    'freetext_3': 'freetext_3',
    'impression': 'Impression',
    'cost': 'Cost',
    'post_engagement': 'Engagements',
    'video_played_3': '23s_Video_Views',
    'thruplays': 'Views',
    'video_played_25': 'Video_Plays_25',
    'video_played_50': 'Video_Plays_50',
    'video_played_75': 'Video_Plays_75',
    'video_played_complete': 'Video_Plays_100',
    'link_click': 'Clicks'}

youtube_numeric_cols = ['Impression', 'Clicks', 'Views', 'Cost', 'Engagements', 
    'Video_Played_25', 'Video_Played_50', 'Video_Played_75', 
    'Video_Played_100', 'Conversions']
tiktok_numeric_cols = ['Cost', 'Impression', 'Clicks', 'Engagement',
    '2s_Video_Views', '6s_Video_Views', '6s_Engaged_Views',
   'Video_Played_25', 'Video_Played_50', 'Video_Played_75', 
    'Video_Played_100']
facebook_numeric_cols = ['Engagement', '3s_Video_Plays', 'Impression',
    'Cost', 'Clicks', 'Video_Played_25', 'Video_Played_50', 'Video_Played_75', 
    'Video_Played_100', 'Thruplays']


# Process datasets
report_campaign_creative_youtube = process_dataframe(df_yt_filter, youtube_mapping, youtube_numeric_cols)
report_campaign_creative_tiktok = process_dataframe(df_tt_filter, tiktok_mapping, tiktok_numeric_cols)
report_campaign_creative_facebook = process_dataframe(df_fb_filter, facebook_mapping, facebook_numeric_cols)


# report_campaign_overall_youtube['Platform'] = 'YouTube'
# report_campaign_overall_tiktok['Platform'] = 'TikTok'
# report_campaign_overall_facebook['Platform'] = 'Facebook'

report_campaign_creative_total = pd.concat([report_campaign_creative_youtube, report_campaign_creative_tiktok, report_campaign_creative_facebook], ignore_index=True)
report_campaign_creative_total['Content'] = report_campaign_creative_total['freetext_3'].str.split('-').str[0]

# ✅ Step 4: Summarize at higher level (Brand + Campaign)
summary_columns = ['report_date', 'ad_id', 'Campaign_phase', 'Category', 'Platform', 'Brand', 'Campaign', 'Format', 'Buying_Method', 'TA', 'Region', 'Plan_Start_Date', 'Plan_End_Date', 'Campaign_code', 'Funnel','Content', 'freetext_2', 'Creative_Type', 'Creative_Length', 'Creative_Format', 'Actual_Start_Date', 'Actual_End_Date', 'Audience_group']
metric_columns = ['Impression', 'Clicks', 'Views', 'Cost', 'Engagements', '23s_Video_Views',
                  'Video_Plays_25', 'Video_Plays_50', 'Video_Plays_75', 'Video_Plays_100']

# Filter only metrics that exist in the dataset
existing_metrics = [col for col in metric_columns if col in report_campaign_creative_total.columns]

# Group and aggregate
report_campaign_creative_summary = (
    report_campaign_creative_total
    .groupby(summary_columns)[existing_metrics]
    .sum()
    .reset_index()
)

report_campaign_creative_summary

Unnamed: 0,report_date,ad_id,Campaign_phase,Category,Platform,Brand,Campaign,Format,Buying_Method,TA,...,Impression,Clicks,Views,Cost,Engagements,23s_Video_Views,Video_Plays_25,Video_Plays_50,Video_Plays_75,Video_Plays_100
0,2024-12-19,1820223593669633,Phase1,SN,TikTok,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,...,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
1,2024-12-19,1820223866106929,Phase1,SN,TikTok,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,...,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
2,2024-12-19,1820433171684386,Phase1,SN,TikTok,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,...,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
3,2024-12-19,1820433307201537,Phase1,SN,TikTok,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,...,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
4,2024-12-19,1820655731474449,Phase1,SN,TikTok,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,...,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209002,2025-07-05,1835886736685233,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
209003,2025-07-05,1835886736686369,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
209004,2025-07-05,1835886736687233,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
209005,2025-07-05,1835886736687249,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0


In [260]:
# 1. Ensure the timestamp column is in datetime format
df_creative['last_update'] = pd.to_datetime(df_creative['last_update'], errors='coerce')

# 2. Sort by last_update (latest first)
df_creative = df_creative.sort_values(by='last_update', ascending=False)

# 3. Drop duplicates — keeping the most recent one
#    Define what makes a row a duplicate (e.g., 'ad_id' or a combination)
df_creative = df_creative.drop_duplicates(subset='ad_id', keep='first').reset_index(drop=True)

In [261]:
df_tt_creative_deduped = df_tt_creative.drop_duplicates(subset='ad_id')
# Merge thumbnail_url into the report dataframe based on ad_id
report_campaign_creative_summary = report_campaign_creative_summary.merge(
    df_creative[['ad_id', 'thumbnail_url']],
    how='left',
    on='ad_id'
)

In [262]:
report_campaign_creative_summary['Campaign_phase'] = report_campaign_creative_summary['Campaign_phase'].replace('NA', '')

In [263]:
report_campaign_creative_summary = report_campaign_creative_summary[
    (report_campaign_creative_summary["Impression"] != 0) & 
    (report_campaign_creative_summary["Cost"] != 0)
]

In [264]:
df_ga_creative_summary = df_ga.groupby([
    'report_date', 
    'Campaign', 
    'Category', 
    'Brand', 
    'Start_Date', 
    'Buying Method', 
    'Region', 
    'Platform', 
    'TA',
    'Content Angle'
])[[
    'sessions', 
    'engaged_sessions', 
    'add_to_carts', 
    'purchase_revenue', 
    'ecommerce_purchases', 
    'screen_page_views'
]].sum().reset_index()

df_ga_creative_summary = df_ga_creative_summary.rename(columns={
    'Start_Date': 'Actual_Start_Date',
    'Buying Method': 'Buying_Method',
    'Content Angle': 'Content'
})
df_ga_creative_summary


KeyError: 'Campaign'

In [265]:
import pandas as pd

merge_cols = ['Campaign', 'report_date', 'Category', 'Platform', 'Brand', 'TA', 'Buying_Method', 'Content']

# --- Normalize values (but return a key Series instead of modifying DataFrame) ---
def generate_merge_keys(df, cols):
    norm_df = df[cols].copy()
    for col in cols:
        if pd.api.types.is_datetime64_any_dtype(norm_df[col]):
            norm_df[col] = pd.to_datetime(norm_df[col]).dt.strftime('%Y-%m-%d')
        else:
            norm_df[col] = (
                norm_df[col].astype(str)
                .str.strip().str.lower()
                .str.replace(r'\s+', '', regex=True)
            )
    return norm_df.agg('_'.join, axis=1)

# --- Generate merge keys without modifying original DataFrames ---
df_ga_creative_summary = df_ga_creative_summary.copy()
report_campaign_creative_summary = report_campaign_creative_summary.copy()

df_ga_creative_summary['merge_key'] = generate_merge_keys(df_ga_creative_summary, merge_cols)
report_campaign_creative_summary['merge_key'] = generate_merge_keys(report_campaign_creative_summary, merge_cols)

# --- Build lookup and track used keys ---
ga_lookup = df_ga_creative_summary.set_index('merge_key')
used_keys = set()
matched_ga_rows = []

# --- Match one-by-one, use GA row only once ---
for idx, row in report_campaign_creative_summary.iterrows():
    key = row['merge_key']
    if key in ga_lookup.index and key not in used_keys:
        ga_row = ga_lookup.loc[key]
        if isinstance(ga_row, pd.DataFrame):
            ga_row = ga_row.iloc[0]
        matched_ga_rows.append(ga_row)
        used_keys.add(key)
    else:
        matched_ga_rows.append(pd.Series([pd.NA] * df_ga_creative_summary.shape[1], index=df_ga_creative_summary.columns))

# --- Combine result ---
matched_ga_df = pd.DataFrame(matched_ga_rows).reset_index(drop=True)
merged_df_creative = pd.concat([report_campaign_creative_summary.reset_index(drop=True), matched_ga_df], axis=1)

# --- Drop only merge_key ---
merged_df_creative.drop(columns=['merge_key'], inplace=True, errors='ignore')

# --- Final output ---
merged_df_creative

Unnamed: 0,report_date,ad_id,Campaign_phase,Category,Platform,Brand,Campaign,Format,Buying_Method,TA,...,Region,Platform.1,TA.1,Content,sessions,engaged_sessions,add_to_carts,purchase_revenue,ecommerce_purchases,screen_page_views
0,2025-01-03,120215206921730524,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,...,,,,,,,,,,
1,2025-01-04,727813359583,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,,,,,,,,,,
2,2025-01-04,727836461242,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,,,,,,,,,,
3,2025-01-04,727836461245,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,,,,,,,,,,
4,2025-01-04,727903310384,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19453,2025-06-27,1835886736685233,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,,,,,,,,,,
19454,2025-06-27,1835886736686369,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,,,,,,,,,,
19455,2025-06-27,1835886736687233,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,,,,,,,,,,
19456,2025-06-27,1835886736687249,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,,,,,,,,,,


In [266]:
merged_df_creative = merged_df_creative.loc[:, ~merged_df_creative.columns.duplicated()]

In [267]:
merged_df_creative['Audience'] = merged_df_creative['Audience_group'].astype(str) + '\n' + merged_df_creative['TA'].astype(str)
merged_df_creative['ad'] = (
    merged_df_creative['Creative_Type'].fillna('').astype(str) + ' ' +
    merged_df_creative['Creative_Format'].fillna('').astype(str) + ' ' +
    merged_df_creative['Creative_Length'].fillna('').astype(str) + ' ' +
    merged_df_creative['Content'].fillna('').astype(str)
).str.strip()
merged_df_creative

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_creative['Audience'] = merged_df_creative['Audience_group'].astype(str) + '\n' + merged_df_creative['TA'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_creative['ad'] = (


Unnamed: 0,report_date,ad_id,Campaign_phase,Category,Platform,Brand,Campaign,Format,Buying_Method,TA,...,Video_Plays_100,thumbnail_url,sessions,engaged_sessions,add_to_carts,purchase_revenue,ecommerce_purchases,screen_page_views,Audience,ad
0,2025-01-03,120215206921730524,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,...,0.0000,https://external-sin11-2.xx.fbcdn.net/emg1/v/t...,,,,,,,Group1\nF25-44,IMAGE VER NA 100%DealHời
1,2025-01-04,727813359583,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,89523.5000,,,,,,,,MIXED\nF25-44,VID VER 15S DEALNÀY100%HOT
2,2025-01-04,727836461242,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,64916.4000,,,,,,,,MIXED\nF25-44,VID VER 6S DEALNÀY100%HOT
3,2025-01-04,727836461245,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,87189.6000,,,,,,,,MIXED\nF25-44,VID VER 6S DEALNÀY100%HOT
4,2025-01-04,727903310384,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,47.2941,,,,,,,,MIXED\nF25-44,VID VER 15S DEALNÀY100%HOT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19453,2025-06-27,1835886736685233,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,10.0000,http://p0.ipstatp.com/origin/tos-alisg-p-0051c...,,,,,,,MIXED\nP18-54,VID VER 60S Ăn Gì Thương Ơi Clip2
19454,2025-06-27,1835886736686369,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,0.0000,http://p0.ipstatp.com/origin/tos-alisg-p-0051c...,,,,,,,MIXED\nP18-54,VID VER 60S Bùi Khánh Hà
19455,2025-06-27,1835886736687233,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,2.0000,http://p0.ipstatp.com/origin/tos-alisg-p-0051c...,,,,,,,MIXED\nP18-54,VID VER 60S Linh Anh
19456,2025-06-27,1835886736687249,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,0.0000,http://p0.ipstatp.com/origin/tos-alisg-p-0051c...,,,,,,,MIXED\nP18-54,VID VER 60S Nghi Xu


In [268]:
mask = (
    (merged_df_creative['Campaign'] == 'ADMMVLaunch') &
    (merged_df_creative['Platform'] == 'TikTok') &
    (merged_df_creative['Content'] == 'AUC')
)

merged_df_creative.loc[mask, 'Content'] = 'NewPack'

In [269]:
mask = (
    (merged_df_creative['Campaign'] == 'ChuoiDuaPromoAprMay') &
    (merged_df_creative['Platform'] == 'TikTok') &
    (merged_df_creative['Content'] == 'VID')
)

merged_df_creative.loc[mask, 'Content'] = 'ChuoiDua'

In [270]:
merged_df_creative

Unnamed: 0,report_date,ad_id,Campaign_phase,Category,Platform,Brand,Campaign,Format,Buying_Method,TA,...,Video_Plays_100,thumbnail_url,sessions,engaged_sessions,add_to_carts,purchase_revenue,ecommerce_purchases,screen_page_views,Audience,ad
0,2025-01-03,120215206921730524,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,...,0.0000,https://external-sin11-2.xx.fbcdn.net/emg1/v/t...,,,,,,,Group1\nF25-44,IMAGE VER NA 100%DealHời
1,2025-01-04,727813359583,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,89523.5000,,,,,,,,MIXED\nF25-44,VID VER 15S DEALNÀY100%HOT
2,2025-01-04,727836461242,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,64916.4000,,,,,,,,MIXED\nF25-44,VID VER 6S DEALNÀY100%HOT
3,2025-01-04,727836461245,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,87189.6000,,,,,,,,MIXED\nF25-44,VID VER 6S DEALNÀY100%HOT
4,2025-01-04,727903310384,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,47.2941,,,,,,,,MIXED\nF25-44,VID VER 15S DEALNÀY100%HOT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19453,2025-06-27,1835886736685233,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,10.0000,http://p0.ipstatp.com/origin/tos-alisg-p-0051c...,,,,,,,MIXED\nP18-54,VID VER 60S Ăn Gì Thương Ơi Clip2
19454,2025-06-27,1835886736686369,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,0.0000,http://p0.ipstatp.com/origin/tos-alisg-p-0051c...,,,,,,,MIXED\nP18-54,VID VER 60S Bùi Khánh Hà
19455,2025-06-27,1835886736687233,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,2.0000,http://p0.ipstatp.com/origin/tos-alisg-p-0051c...,,,,,,,MIXED\nP18-54,VID VER 60S Linh Anh
19456,2025-06-27,1835886736687249,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,0.0000,http://p0.ipstatp.com/origin/tos-alisg-p-0051c...,,,,,,,MIXED\nP18-54,VID VER 60S Nghi Xu


In [271]:
import pandas as pd

merge_cols = ['Campaign', 'Funnel', 'Category', 'Region', 'Platform', 'Brand', 'TA', 'Buying_Method', 'Plan_Start_Date', 'Actual_Start_Date', 'Audience_group']

# --- Refined Normalization ---
def normalize_cols(df, cols, prefix):
    df = df.copy()
    for col in cols:
        if pd.api.types.is_datetime64_any_dtype(df[col]):
            df[f'{prefix}_{col}'] = pd.to_datetime(df[col]).dt.strftime('%Y-%m-%d')
        else:
            df[f'{prefix}_{col}'] = df[col].astype(str).str.strip().str.lower().str.replace(r'\s+', '', regex=True)
    return df

# --- Normalize ---
df_plan = normalize_cols(df_plan, merge_cols, 'plan')
merged_df3 = normalize_cols(merged_df_creative, merge_cols, 'merge')

# --- Select only the normalized keys + CTR columns ---
plan_keys = [f'plan_{col}' for col in merge_cols]
ctr_cols = [col for col in df_plan.columns if col.endswith('_bm')]
df_plan_ctr = df_plan[plan_keys + ctr_cols]

# --- Merge ---
merged_df3 = pd.merge(
    merged_df3,
    df_plan_ctr,
    how='left',
    left_on=[f'merge_{col}' for col in merge_cols],
    right_on=plan_keys
)

# --- Drop helper columns ---
merged_df3 = merged_df3.drop(
    columns=[f'merge_{col}' for col in merge_cols] + plan_keys + [f'total_{col}' for col in merge_cols],
    errors='ignore'
)

merged_df3

Unnamed: 0,report_date,ad_id,Campaign_phase,Category,Platform,Brand,Campaign,Format,Buying_Method,TA,...,engaged_sessions,add_to_carts,purchase_revenue,ecommerce_purchases,screen_page_views,Audience,ad,er_bm,vtr_bm,ctr_bm
0,2025-01-03,120215206921730524,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,...,,,,,,Group1\nF25-44,IMAGE VER NA 100%DealHời,,,
1,2025-01-04,727813359583,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,,,,,,MIXED\nF25-44,VID VER 15S DEALNÀY100%HOT,,,
2,2025-01-04,727836461242,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,,,,,,MIXED\nF25-44,VID VER 6S DEALNÀY100%HOT,,,
3,2025-01-04,727836461245,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,,,,,,MIXED\nF25-44,VID VER 6S DEALNÀY100%HOT,,,
4,2025-01-04,727903310384,Phase1,SN,YouTube,FM100,FM100PromoSustainJan,View Reach Campaign,CPM,F25-44,...,,,,,,MIXED\nF25-44,VID VER 15S DEALNÀY100%HOT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20573,2025-06-27,1835886736685233,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,,,,,,MIXED\nP18-54,VID VER 60S Ăn Gì Thương Ơi Clip2,,,0.017078
20574,2025-06-27,1835886736686369,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,,,,,,MIXED\nP18-54,VID VER 60S Bùi Khánh Hà,,,0.017078
20575,2025-06-27,1835886736687233,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,,,,,,MIXED\nP18-54,VID VER 60S Linh Anh,,,0.017078
20576,2025-06-27,1835886736687249,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,...,,,,,,MIXED\nP18-54,VID VER 60S Nghi Xu,,,0.017078


In [272]:
merged_df3 = merged_df3.loc[:, ~merged_df3.columns.str.startswith('ga_')]

In [273]:
# 1. Drop the 'ad_id' column
merged_df3 = merged_df3.drop(columns=['ad_id'], errors='ignore')

# 2. Identify non-numeric (grouping) and numeric (aggregation) columns
non_numeric_cols = merged_df3.select_dtypes(exclude='number').columns.tolist()
numeric_cols = merged_df3.select_dtypes(include='number').columns.tolist()

# 3. Group by non-numeric columns and sum numeric ones
merged_df3 = merged_df3.groupby(non_numeric_cols, dropna=False)[numeric_cols].sum().reset_index()

In [274]:
merged_df3

Unnamed: 0,report_date,Campaign_phase,Category,Platform,Brand,Campaign,Format,Buying_Method,TA,Region,...,Cost,Engagements,23s_Video_Views,Video_Plays_25,Video_Plays_50,Video_Plays_75,Video_Plays_100,er_bm,vtr_bm,ctr_bm
0,2025-01-03,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,143643.0,29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,12798400.0,3969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,71466.0,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,1528260.0,13064,12954.0,9646.0,3763.0,2075.0,1374.0,0.0,0.0,0.000000
4,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,2314080.0,22207,22010.0,63793.0,22017.0,12864.0,8635.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6313,2025-06-27,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,42783.0,27,260.0,40.0,26.0,11.0,6.0,0.0,0.0,0.034155
6314,2025-06-27,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,24865.0,10,98.0,5.0,2.0,1.0,1.0,0.0,0.0,0.034155
6315,2025-06-27,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,139754.0,54,367.0,46.0,12.0,9.0,7.0,0.0,0.0,0.119544
6316,2025-06-27,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,267755.0,60,587.0,42.0,23.0,16.0,9.0,0.0,0.0,0.068311


In [306]:
# Establish MySQL connection
conn = mysql.connector.connect(
    host="125.212.245.36",
    port = '3306',
    user="dashboard_report_vinamilk",
    password="6vtRhNxa79zPsfs5",
    database="dashboard_report_vinamilk"
)
cursor = conn.cursor()

df_benchmark = get_data(cursor, "mapping_creative_benchmark")


if 'cursor' in locals():
    cursor.close()
if 'conn' in locals() and conn.is_connected():
    conn.close()
    print("MySQL connection closed.")

MySQL connection closed.


In [307]:
df_benchmark.columns

Index(['Platform', 'Month', 'CATE', 'Buying_Model', 'Creative_Type',
       'Creative_Format', 'Creative_Length', 'Objective', 'TA', 'Cost_bm',
       'Impression_bm', 'Views_bm', '2s3s_Video_Views_bm', 'Engagements_bm',
       'Clicks_bm', 'CPM_mtm_bm', 'CPV_mtm_bm', 'CPC_mtm_bm', 'CPE_mtm_bm',
       'VTR_mtm_bm', 'VTR23_mtm_bm', 'CTR_mtm_bm', 'ER_mtm_bm'],
      dtype='object')

In [299]:
merged_df3

Unnamed: 0,report_date,Campaign_phase,Category,Platform,Brand,Campaign,Format,Buying_Method,TA,Region,...,23s_Video_Views,Video_Plays_25,Video_Plays_50,Video_Plays_75,Video_Plays_100,er_bm,vtr_bm,ctr_bm,Month,Benchmark_Month
0,2025-01-03,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,2025-01-03,Dec
1,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,2025-01-04,Dec
2,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,2025-01-04,Dec
3,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,12954.0,9646.0,3763.0,2075.0,1374.0,0.0,0.0,0.000000,2025-01-04,Dec
4,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,22010.0,63793.0,22017.0,12864.0,8635.0,0.0,0.0,0.000000,2025-01-04,Dec
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6313,2025-06-27,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,260.0,40.0,26.0,11.0,6.0,0.0,0.0,0.034155,2025-06-27,May
6314,2025-06-27,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,98.0,5.0,2.0,1.0,1.0,0.0,0.0,0.034155,2025-06-27,May
6315,2025-06-27,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,367.0,46.0,12.0,9.0,7.0,0.0,0.0,0.119544,2025-06-27,May
6316,2025-06-27,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,587.0,42.0,23.0,16.0,9.0,0.0,0.0,0.068311,2025-06-27,May


In [301]:
merged_df3.columns

Index(['report_date', 'Campaign_phase', 'Category', 'Platform', 'Brand',
       'Campaign', 'Format', 'Buying_Method', 'TA', 'Region',
       'Plan_Start_Date', 'Plan_End_Date', 'Campaign_code', 'Funnel',
       'Content', 'freetext_2', 'Creative_Type', 'Creative_Length',
       'Creative_Format', 'Actual_Start_Date', 'Actual_End_Date',
       'Audience_group', 'thumbnail_url', 'sessions', 'engaged_sessions',
       'add_to_carts', 'purchase_revenue', 'ecommerce_purchases',
       'screen_page_views', 'Audience', 'ad', 'Impression', 'Clicks', 'Views',
       'Cost', 'Engagements', '23s_Video_Views', 'Video_Plays_25',
       'Video_Plays_50', 'Video_Plays_75', 'Video_Plays_100', 'er_bm',
       'vtr_bm', 'ctr_bm', 'Month', 'Benchmark_Month'],
      dtype='object')

In [308]:
# 1. Convert merged_df3['report_date'] to datetime
merged_df3['Month'] = pd.to_datetime(merged_df3['report_date'])

# 2. Create Benchmark_Month by subtracting 1 month
merged_df3['Benchmark_Month'] = (merged_df3['Month'] - pd.DateOffset(months=1)).dt.strftime('%b')

# 3. Convert df_benchmark['Month'] from 3-letter string to datetime (with dummy year)
df_benchmark['Month'] = pd.to_datetime('2024-' + df_benchmark['Month'], format='%Y-%b')
df_benchmark['Month'] = df_benchmark['Month'].dt.strftime('%b')  # Convert back to string for matching

# 4. Filter benchmark to relevant columns
bm_cols = [col for col in df_benchmark.columns if col.endswith('_bm')]
keys = ['Month', 'Platform', 'CATE', 'Objective', 'Buying_Model', 'Creative_Type', 'Creative_Length', 'TA']
df_bm_filtered = df_benchmark[keys + bm_cols]

# 5. Merge
merged_result = pd.merge(
    merged_df3,
    df_bm_filtered,
    how='left',
    left_on=['Benchmark_Month', 'Platform', 'Category', 'Format', 'Buying_Method', 'Creative_Type', 'Creative_Length', 'TA'],
    right_on=['Month',           'Platform', 'CATE',     'Objective', 'Buying_Model', 'Creative_Type', 'Creative_Length', 'TA']
)

# 6. Drop 'Month' from benchmark side
merged_result

Unnamed: 0,report_date,Campaign_phase,Category,Platform,Brand,Campaign,Format,Buying_Method,TA,Region,...,Engagements_bm,Clicks_bm,CPM_mtm_bm,CPV_mtm_bm,CPC_mtm_bm,CPE_mtm_bm,VTR_mtm_bm,VTR23_mtm_bm,CTR_mtm_bm,ER_mtm_bm
0,2025-01-03,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,,,,,,,,,,
1,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,,,,,,,,,,
2,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,,,,,,,,,,
3,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,,,,,,,,,,
4,2025-01-04,Phase1,SN,Facebook,FM100,FM100PromoSustainJan,Reach,CPM,F25-44,NAT,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6967,2025-06-27,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,0.0,32478.0,54959.9,712.47,2585.12,,0.07714,0.22811,0.02126,0.0
6968,2025-06-27,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,0.0,32478.0,54959.9,712.47,2585.12,,0.07714,0.22811,0.02126,0.0
6969,2025-06-27,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,0.0,32478.0,54959.9,712.47,2585.12,,0.07714,0.22811,0.02126,0.0
6970,2025-06-27,,SN,TikTok,FM100,B2BFM100FlavorAprJun,Video Shopping Ads,CPC,P18-54,15C,...,0.0,32478.0,54959.9,712.47,2585.12,,0.07714,0.22811,0.02126,0.0


In [309]:
from sqlalchemy import create_engine
import pandas as pd

DB_TYPE = 'mysql'  # Change to 'postgresql' for PostgreSQL
DB_HOST = '125.212.245.36'
DB_PORT = '3306'  # Change for different databases
DB_USER = 'dashboard_report_vinamilk'
DB_PASS = '6vtRhNxa79zPsfs5'
DB_NAME = 'dashboard_report_vinamilk'

try:
    conn = mysql.connector.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASS,
        database=DB_NAME,
        charset="utf8mb4"
    )
    
    if conn.is_connected():
        cursor = conn.cursor()
        print("Connected to MySQL successfully!")
    else:
        print("Failed to connect.")

except mysql.connector.Error as e:
    print(f"Error: {e}")

# Create SQLAlchemy engine
cursor = conn.cursor()
cursor.execute("SET NAMES utf8mb4;") 
cursor.execute("SET CHARACTER SET utf8mb4;")
engine = create_engine(f"mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}")

def upload_to_mysql(df, table_name, engine, if_exists='replace'):
    """
    Uploads a DataFrame to a MySQL table with correct column types based on the DataFrame.

    :param df: Pandas DataFrame to upload
    :param table_name: Name of the table in the database
    :param engine: SQLAlchemy engine object
    :param if_exists: What to do if the table already exists ('fail', 'replace', 'append')
    """
    try:
        with engine.connect() as conn:
            metadata = MetaData()

            # Define table schema based on DataFrame's column types
            columns = []

            for col in df.columns:
                dtype = df[col].dtype
                
                # Map Pandas dtype to MySQL/SQLAlchemy types
                if pd.api.types.is_integer_dtype(dtype):
                    col_type = Integer
                elif pd.api.types.is_float_dtype(dtype):
                    col_type = Float
                elif pd.api.types.is_datetime64_any_dtype(dtype):
                    col_type = DateTime
                elif pd.api.types.is_bool_dtype(dtype):
                    col_type = Boolean
                elif pd.api.types.is_string_dtype(dtype):
                    col_type = Text(collation="utf8mb4_unicode_ci")  # UTF-8 support
                else:
                    col_type = String(255, collation="utf8mb4_unicode_ci")  # Default fallback

                columns.append(Column(col, col_type))

            table = Table(table_name, metadata, *columns, extend_existing=True)

            # Create the table with utf8mb4 encoding
            metadata.create_all(conn)

        # Convert object columns to string before uploading
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = df[col].astype(str)

        # Upload DataFrame to MySQL with proper type mapping
        dtype_mapping = {
            col: (
                sqlalchemy.types.Integer if pd.api.types.is_integer_dtype(df[col]) else
                sqlalchemy.types.Float if pd.api.types.is_float_dtype(df[col]) else
                sqlalchemy.types.DateTime if pd.api.types.is_datetime64_any_dtype(df[col]) else
                sqlalchemy.types.Boolean if pd.api.types.is_bool_dtype(df[col]) else
                sqlalchemy.types.Text(collation="utf8mb4_unicode_ci") if pd.api.types.is_string_dtype(df[col]) else
                sqlalchemy.types.String(255, collation="utf8mb4_unicode_ci")
            )
            for col in df.columns
        }

        df.to_sql(table_name, engine, if_exists=if_exists, index=False, dtype=dtype_mapping)

        print(f"✅ Data successfully uploaded to `{table_name}` with correct column types!")

    except Exception as e:
        print(f"❌ Error uploading data to `{table_name}`: {e}")

# Example usage:
# engine = create_engine(f"mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}")
upload_to_mysql(merged_result, 'report_campaign_creative', engine)

Connected to MySQL successfully!
✅ Data successfully uploaded to `report_campaign_creative` with correct column types!


# Overall Report

### Reach

In [126]:
df_reach_weekly = pd.read_excel('/Users/khanhvx/Downloads/Reach SN (1).xlsx', sheet_name='weekly')
df_reach_utd= pd.read_excel('/Users/khanhvx/Downloads/Reach SN (1).xlsx', sheet_name='total')


df_map_camp = pd.read_excel('/Users/khanhvx/Downloads/map sn.xlsx', sheet_name='campaign')
df_map_ad = pd.read_excel('/Users/khanhvx/Downloads/map sn.xlsx', sheet_name='ad')
df_map_camp = df_map_camp.drop_duplicates(subset='campaign_id', keep='last')
df_map_ad = df_map_ad.drop_duplicates(subset='ad_id', keep='last')


df_reach_weekly = df_reach_weekly.drop(columns=['campaign_name'], errors='ignore')
df_reach_weekly = df_reach_weekly.merge(df_map_camp[['campaign_id', 'campaign_name']], on='campaign_id', how='left')

df_reach_utd = df_reach_utd.drop(columns=['campaign_name'], errors='ignore')
df_reach_utd = df_reach_utd.merge(df_map_camp[['campaign_id', 'campaign_name']], on='campaign_id', how='left')

df_reach_weekly = process_campaign_data(df_reach_weekly)
df_reach_utd = process_campaign_data(df_reach_utd)

mapping_naming(df_reach_weekly)
mapping_naming(df_reach_utd)

df_reach_weekly['Audience'] = df_reach_weekly['Audience group'].astype(str) + '\n' + df_reach_weekly['TA'].astype(str)
df_reach_utd['Audience'] = df_reach_utd['Audience group'].astype(str) + '\n' + df_reach_utd['TA'].astype(str)

# Ensure datetime columns are in datetime format
df_reach_weekly['Plan Start Date'] = pd.to_datetime(df_reach_weekly['Plan Start Date'])
df_reach_weekly['Start day of week'] = pd.to_datetime(df_reach_weekly['Start day of week'])

# Compute difference in days
days_diff = (df_reach_weekly['Start day of week'] - df_reach_weekly['Plan Start Date']).dt.days

# Compute full weeks difference + 1 (as per formula)
df_reach_weekly['Week'] = (days_diff // 7) + 2

# Compute relative we
df_reach_weekly['campaign_id'] = df_reach_weekly['campaign_id'].astype(str)

In [158]:
conn = mysql.connector.connect(
    host="10.0.0.3",
    port = '3306',
    user="dashboard_report_vinamilk",
    password="6vtRhNxa79zPsfs5",
    database="dashboard_report_vinamilk"
)
cursor = conn.cursor()

df_reach_utd_tt = get_data(cursor, "tiktok_performance_utd_reach")
df_reach_weekly_tt = get_data(cursor, "tiktok_performance_weekly_reach")

df_reach_utd_yt = get_data(cursor, "googleads_performance_utd_reach")
df_reach_weekly_yt = get_data(cursor, "googleads_performance_weekly_reach")

df_reach_utd_fb = get_data(cursor, "facebook_performance_utd_reach")
df_reach_weekly_fb = get_data(cursor, "facebook_performance_weekly_reach")


if 'cursor' in locals():
    cursor.close()
if 'conn' in locals() and conn.is_connected():
    conn.close()
    print("MySQL connection closed.")


df_reach_weekly = pd.concat([df_reach_weekly_tt, df_reach_weekly_yt, df_reach_weekly_fb], ignore_index=True)
df_reach_utd= pd.concat([df_reach_utd_tt, df_reach_utd_yt, df_reach_utd_fb], ignore_index=True)


df_map_camp = pd.read_excel('/Users/khanhvx/Downloads/map sn.xlsx', sheet_name='campaign')
df_map_ad = pd.read_excel('/Users/khanhvx/Downloads/map sn.xlsx', sheet_name='ad')
df_map_camp = df_map_camp.drop_duplicates(subset='campaign_id', keep='last')
df_map_ad = df_map_ad.drop_duplicates(subset='ad_id', keep='last')


df_reach_weekly = df_reach_weekly.drop(columns=['campaign_name'], errors='ignore')
df_reach_weekly = df_reach_weekly.merge(df_map_camp[['campaign_id', 'campaign_name']], on='campaign_id', how='left')

df_reach_utd = df_reach_utd.drop(columns=['campaign_name'], errors='ignore')
df_reach_utd = df_reach_utd.merge(df_map_camp[['campaign_id', 'campaign_name']], on='campaign_id', how='left')

df_reach_weekly = process_campaign_data(df_reach_weekly)
df_reach_utd = process_campaign_data(df_reach_utd)

mapping_naming(df_reach_weekly)
mapping_naming(df_reach_utd)

df_reach_weekly['Audience'] = df_reach_weekly['Audience group'].astype(str) + '\n' + df_reach_weekly['TA'].astype(str)
df_reach_utd['Audience'] = df_reach_utd['Audience group'].astype(str) + '\n' + df_reach_utd['TA'].astype(str)

# Ensure datetime columns are in datetime format
df_reach_weekly['Plan Start Date'] = pd.to_datetime(df_reach_weekly['Plan Start Date'])
df_reach_weekly['report_date'] = pd.to_datetime(df_reach_weekly['report_date'])

# Compute difference in days
days_diff = (df_reach_weekly['report_date'] - df_reach_weekly['Plan Start Date']).dt.days

# Compute full weeks difference + 1 (as per formula)
df_reach_weekly['Week'] = (days_diff // 7) + 2

# Compute relative we
df_reach_weekly['campaign_id'] = df_reach_weekly['campaign_id'].astype(str)
df_reach_utd['campaign_id'] = df_reach_utd['campaign_id'].astype(str)

MySQL connection closed.


In [159]:
df_reach_weekly

Unnamed: 0,customer_id,campaign,campaign_id,reach,impressions,start_date,end_date,report_date,campaign_name,Funnel,...,Age,Region,Device,Os,TA,Actual Start Date,Actual End Date,Platform,Audience,Week
0,7451448453630033921,CVR-SN-FM100-B2BFM100FlavorApr-Jun-1805-1805_T...,1832070549374002,0,0,2025-05-18,2025-05-18,2025-06-15,CVR_SN-FM100-B2BFM100FlavorAprJun-1004-3006_TT...,Conversion,...,1854,15C,AllDV,AllOS,P18-54,2025-05-18,2025-05-18,TikTok,MIXED\nP18-54,11.0
1,7451448453630033921,CVR-SN-FM100-B2BFM100FlavorApr-Jun-1805-1805_T...,1832070426331249,0,0,2025-05-18,2025-05-18,2025-06-15,CVR_SN-FM100-B2BFM100FlavorAprJun-1004-3006_TT...,Conversion,...,1854,15C,AllDV,AllOS,P18-54,2025-05-18,2025-05-18,TikTok,MIXED\nP18-54,11.0
2,7451448453630033921,CVR-SN-FM100-B2BFM100FlavorApr-Jun-1705-1705_T...,1831368625496273,0,0,2025-05-17,2025-05-17,2025-06-15,CVR_SN-FM100-B2BFM100FlavorAprJun-1004-3006_TT...,Conversion,...,1854,15C,AllDV,AllOS,P18-54,2025-05-17,2025-05-17,TikTok,MIXED\nP18-54,11.0
3,7451448453630033921,CVR-SN-FM100-B2BFM100FlavorApr-Jun-1705-1705_T...,1831368618739761,0,0,2025-05-17,2025-05-17,2025-06-15,CVR_SN-FM100-B2BFM100FlavorAprJun-1004-3006_TT...,Conversion,...,1854,15C,AllDV,AllOS,P18-54,2025-05-17,2025-05-17,TikTok,MIXED\nP18-54,11.0
4,7451448453630033921,CVR-SN-FM100-B2BFM100FlavorApr-Jun-1605-1605_T...,1831368080403537,0,0,2025-05-16,2025-05-16,2025-06-15,CVR_SN-FM100-B2BFM100FlavorAprJun-1004-3006_TT...,Conversion,...,1854,15C,AllDV,AllOS,P18-54,2025-05-16,2025-05-16,TikTok,MIXED\nP18-54,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603,1119191339584807,AW-SBTE-DG-SocialAWO-1306-2606_FB_ENG_AUC_PRO-...,120227040751580772,56625,72931,2025-06-09,2025-06-15,2025-06-15,,,...,0000,,,,NA00-00,NaT,NaT,,\nNA00-00,
604,1119191339584807,AW-SBTE-DG-SocialAWO-1006-2406_FB_ENG_AUC_PRO-...,120227384881180772,90232,154771,2025-06-09,2025-06-15,2025-06-15,,,...,0000,,,,NA00-00,NaT,NaT,,\nNA00-00,
605,1119191339584807,AW-SBTE-DG-SocialAWO-1006-2406_FB_ENG_AUC_PRO-...,120227384881210772,66168,128022,2025-06-09,2025-06-15,2025-06-15,,,...,0000,,,,NA00-00,NaT,NaT,,\nNA00-00,
606,1085937483006669,AW_STV-Nut Milk-NutMilkSustainJun-0906-0907_FB...,120225074045350724,2094517,5569107,2025-06-09,2025-06-15,2025-06-15,,,...,0000,,,,NA00-00,NaT,NaT,,\nNA00-00,


In [138]:
# Load the CSV
app_install = pd.read_csv('/Users/khanhvx/Downloads/App install.csv')

# Rename the column
app_install = app_install.rename(columns={'Campaign ID': 'campaign_id'})

# Group by campaign_id and sum Installations
install_summary = app_install.groupby('campaign_id', as_index=False)['Installations'].sum()
install_summary['campaign_id'] = install_summary['campaign_id'].astype(str)

In [139]:

import pandas as pd
from datetime import timedelta

def detect_inactive_campaigns_by_date(df):
    """
    Detects inactive campaigns for each unique report_date in the dataset.
    A campaign is considered 'Done' on a given report_date if it had
    zero cost, impressions, and clicks in the 7 days prior to that date.
    
    Returns a DataFrame with 'check_date', 'campaign_id', and 'Status'.
    """
    df = df.copy()
    df['report_date'] = pd.to_datetime(df['report_date'])
    
    results = []
    all_campaigns = df['campaign_id'].unique()

    for check_date in sorted(df['report_date'].unique()):
        # Define time window: 7 days before check_date
        start_date = check_date - timedelta(days=7)
        recent = df[(df['report_date'] >= start_date) & (df['report_date'] < check_date)]
        
        # Sum metrics by campaign
        sums = (recent
                .groupby('campaign_id')[['cost','impressions','clicks']]
                .sum()
                .reset_index())

        # Include all campaigns
        merged = pd.DataFrame({'campaign_id': all_campaigns})
        merged = (merged
                  .merge(sums, on='campaign_id', how='left')
                  .fillna(0))

        # Determine status
        merged['Status'] = merged.apply(
            lambda row: 'Done' if (row[['cost','impressions','clicks']] == 0).all() 
                        else 'On Going',
            axis=1
        )

        merged['check_date'] = check_date.strftime('%Y-%m-%d')
        results.append(merged[['check_date', 'campaign_id', 'Status']])
    
    return pd.concat(results, ignore_index=True)


df_total_filter = pd.concat([df_yt_filter, df_fb_filter, df_tt_filter], ignore_index=True)
inactive_campaigns = detect_inactive_campaigns_by_date(df_total_filter)
inactive_campaigns

Unnamed: 0,check_date,campaign_id,Status
0,2024-12-19,22139891518,Done
1,2024-12-19,22397445341,Done
2,2024-12-19,22237988393,Done
3,2024-12-19,22316464471,Done
4,2024-12-19,22194177791,Done
...,...,...,...
95740,2025-07-01,1836148490518658,Done
95741,2025-07-01,1836349222702193,Done
95742,2025-07-01,1836348999313490,Done
95743,2025-07-01,1835978194641938,On Going


In [140]:
import pandas as pd

def get_latest_active_date_per_campaign(df):
    # Ensure report_date is datetime
    df['report_date'] = pd.to_datetime(df['report_date'])

    # Filter rows with non-zero metrics
    active_df = df[(df['cost'] != 0) & 
                   (df['impressions'] != 0) & 
                   (df['clicks'] != 0)]

    # Group by camp_code and get the latest active date
    result_df = (active_df.groupby('camp_code')['report_date']
                          .max()
                          .reset_index()
                          .rename(columns={'report_date': 'latest_active_date'}))
    
    return result_df


latest_active_df = get_latest_active_date_per_campaign(df_total_filter)
latest_active_df

Unnamed: 0,camp_code,latest_active_date
0,,2025-06-28
1,CPAS,2025-03-16
2,CVR-AlLCate-DTC-AWO-Livestream-1701-3112,2025-04-11
3,Các Miền Còn Lại,2025-05-16
4,Dielac,2025-04-26
...,...,...
60,SN-SữaĐậuNành-SocialAWOApr-2904-0805,2025-05-08
61,STV-Nut Milk-NutMilkSustainJun-0906-0907,2025-07-01
62,STV-Soy Milk-SoyMilkLaunchingJun-1306-1008,2025-07-01
63,TIKTOK,2025-04-04


In [141]:
import pandas as pd

def process_dataframe(df, column_mapping, numeric_cols, extra_processing=None, column_order=None):
    """General function to process dataframes."""
    # Select and rename columns
    existing_columns = [col for col in column_mapping if col in df.columns]
    df = df[existing_columns].rename(columns=column_mapping).copy()
    
    # Convert 'Date' column
    df['report_date'] = pd.to_datetime(df['report_date'], errors='coerce')
    df['report_date'] = pd.to_datetime(df['report_date'], errors='coerce').dt.date

    # Convert numeric columns
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Apply dataset-specific processing
    if extra_processing:
        extra_processing(df)
    
    # Ensure all expected columns exist
    if column_order:
        for col in column_order:
            if col not in df.columns:
                df[col] = None
        df = df[column_order]
    
    return df


# Define column mappings & numeric columns for each dataset
youtube_mapping = {'report_date': 'report_date',
    'campaign_id': 'campaign_id',
    'Category': 'Category',
    'Funnel': 'Funnel',
    'Platform': 'Platform',
    'Brand Name': 'Brand',
    'placement': 'Video_Range',
    'buying_model': 'Buying_Method',
    'TA': 'TA',
    'Campaign phase': 'Campaign_phase',
    'Audience group': 'Audience_group',
    'Region': 'Region',
    'campaign_name': 'Campaign_Name',
    'camp_code': 'Campaign_code',
    'Campaign Name': 'Campaign',
    'Plan Start Date': 'Plan_Start_Date',
    'Plan End Date': 'Plan_End_Date',
    'Actual Start Date': 'Actual_Start_Date',
    'Actual End Date': 'Actual_End_Date',
    'freetext_2': 'freetext_2',
    'camp_type': 'Format',
    'impressions': 'Impression',
    'clicks': 'Clicks',
    'views': 'Views',
    'currency_code': 'Currency_Code',
    'Cost': 'Cost',
    'engagements': 'Engagements',
    'video_25': 'Video_Plays_25',
    'video_50': 'Video_Plays_50',
    'video_75': 'Video_Plays_75',
    'video_100': 'Video_Plays_100'}  
tiktok_mapping = {'report_date': 'report_date',
    'campaign_id': 'campaign_id',
    'Category': 'Category',
    'Funnel': 'Funnel',
    'Platform': 'Platform',
    'Brand Name': 'Brand',
    'TA': 'TA',
    'Campaign phase': 'Campaign_phase',
    'Audience group': 'Audience_group',
    'Region': 'Region',
    'camp_type': 'Format',
    'buying_model': 'Buying_Method',
    'campaign_name': 'Campaign_Name',
    'camp_code': 'Campaign_code',
    'Campaign Name': 'Campaign',
    'Plan Start Date': 'Plan_Start_Date',
    'Plan End Date': 'Plan_End_Date',
    'Actual Start Date': 'Actual_Start_Date',
    'Actual End Date': 'Actual_End_Date',
    'freetext_2': 'freetext_2',
    'spend': 'Cost',
    'impressions': 'Impression',
    'clicks_destination': 'Clicks',
    'engagements': 'Engagements',
    'profile_visits': 'profile_visits',
    '2_second_video_views': '23s_Video_Views',
    '6_second_video_views': 'Views',
    'engaged_view': '6s_Engaged_Views',
    'video_views_p25': 'Video_Plays_25',
    'video_views_p50': 'Video_Plays_50',
    'video_views_p75': 'Video_Plays_75',
    'video_views_p100': 'Video_Plays_100'}
facebook_mapping = {'report_date': 'report_date',
    'campaign_id': 'campaign_id',
    'Category': 'Category',
    'Funnel': 'Funnel',
    'Platform': 'Platform',
    'Brand Name': 'Brand',
    'camp_type': 'Format',
    'buying_model': 'Buying_Method',
    'TA': 'TA',
    'Campaign phase': 'Campaign_phase',
    'Audience group': 'Audience_group',
    'Region': 'Region',
    'campaign_name': 'Campaign_Name',
    'camp_code': 'Campaign_code',
    'Campaign Name': 'Campaign',
    'Plan Start Date': 'Plan_Start_Date',
    'Plan End Date': 'Plan_End_Date',
    'Actual Start Date': 'Actual_Start_Date',
    'Actual End Date': 'Actual_End_Date',
    'freetext_2': 'freetext_2',
    'impression': 'Impression',
    'cost': 'Cost',
    'post_engagement': 'Engagements',
    'video_played_3': '23s_Video_Views',
    'thruplays': 'Views',
    'video_played_25': 'Video_Plays_25',
    'video_played_50': 'Video_Plays_50',
    'video_played_75': 'Video_Plays_75',
    'video_played_complete': 'Video_Plays_100',
    'link_click': 'Clicks'}

youtube_numeric_cols = ['Engagement', '23s_Video_Views', 'Impression',
    'Cost', 'Clicks', 'Video_Plays_25', 'Video_Plays_50', 'Video_Plays_75', 
    'Video_Plays_100', 'Views']
tiktok_numeric_cols = ['Engagement', '23s_Video_Views', 'Impression',
    'Cost', 'Clicks', 'Video_Plays_25', 'Video_Plays_50', 'Video_Plays_75', 
    'Video_Plays_100', 'Views']
facebook_numeric_cols = ['Engagement', '23s_Video_Views', 'Impression',
    'Cost', 'Clicks', 'Video_Plays_25', 'Video_Plays_50', 'Video_Plays_75', 
    'Video_Plays_100', 'Views']


# Process datasets
report_campaign_overall_youtube = process_dataframe(df_yt_filter, youtube_mapping, youtube_numeric_cols)
report_campaign_overall_tiktok = process_dataframe(df_tt_filter, tiktok_mapping, tiktok_numeric_cols)
report_campaign_overall_facebook = process_dataframe(df_fb_filter, facebook_mapping, facebook_numeric_cols)

report_campaign_overall_total = pd.concat([report_campaign_overall_youtube, report_campaign_overall_tiktok, report_campaign_overall_facebook], ignore_index=True)

# ✅ Step 4: Summarize at higher level (Brand + Campaign)z
summary_columns = ['report_date', 'campaign_id', 'Campaign_phase', 'Category', 'Platform', 'Brand', 'Campaign', 'Format', 'Buying_Method', 'TA', 'Region', 'Plan_Start_Date', 'Plan_End_Date', 'Campaign_code', 'Funnel', 'Audience_group', 'Actual_Start_Date', 'Actual_End_Date', 'freetext_2']
metric_columns = ['Impression', 'Clicks', 'Views', 'Cost', 'Engagements',
                  'Video_Plays_25', 'Video_Plays_50', 'Video_Plays_75', 'Video_Plays_100']

# report_campaign_summary = (
#     report_campaign_overall_total
#     .groupby(summary_columns, dropna=False)[existing_metrics]
#     .sum()
#     .reset_index()
# )

# Filter only metrics that exist in the dataset
existing_metrics = [col for col in metric_columns if col in report_campaign_overall_total.columns]

In [142]:
report_campaign_overall_total['Campaign_phase'] = (
    report_campaign_overall_total['Campaign_phase']
    .replace('NA', '')   # Replace the string 'NA' with ''
    .fillna('')          # Then fill any NaN values with ''
)
report_campaign_overall_total = report_campaign_overall_total[
    (report_campaign_overall_total["Impression"] != 0)
]

In [143]:

inactive_campaigns['campaign_id'] = inactive_campaigns['campaign_id'].astype(str)
report_campaign_overall_total['campaign_id'] = report_campaign_overall_total['campaign_id'].astype(str)
df_reach_utd['campaign_id'] = df_reach_utd['campaign_id'].astype(str)

# Ensure date columns are in datetime format (important for accurate merging)
report_campaign_overall_total['report_date'] = pd.to_datetime(report_campaign_overall_total['report_date'])
inactive_campaigns['check_date'] = pd.to_datetime(inactive_campaigns['check_date'])

# Merge on both campaign_id and date
report_campaign_overall_total = report_campaign_overall_total.merge(
    inactive_campaigns.rename(columns={'check_date': 'report_date'}),  # align column names for merge
    on=['campaign_id', 'report_date'],
    how='left'
)
report_campaign_overall_total = report_campaign_overall_total.merge(
    df_reach_utd[['campaign_id', 'reach']],  # only bring in the reach column
    on='campaign_id',
    how='left'
)

report_campaign_overall_total = report_campaign_overall_total.merge(
    install_summary[['campaign_id', 'Installations']],  # only bring in the reach column
    on='campaign_id',
    how='left'
)

In [144]:
import pandas as pd

def preprocess_campaign_data(df):
    # --- Step 1: Rename + setup ---
    if df.empty:
        return pd.DataFrame()  # Return empty DataFrame if input is empty

    df = df.rename(columns={'CATE': 'Category'})
    
    numeric_cols = [
        'Impression', 'Clicks', 'Views', 'Cost', 'Engagements',
        'Video_Plays_25', 'Video_Plays_50', 'Video_Plays_75', 'Video_Plays_100',
        '23s_Video_Views', '6s_Engaged_Views', 'Installations', 'reach'
    ]
    
    group_cols = [
        'report_date', 'Category', 'Brand', 'Campaign', 'Platform', 'Audience_group',
        'Campaign_phase', 'Buying_Method', 'Plan_Start_Date', 'Plan_End_Date',
        'Actual_Start_Date', 'Actual_End_Date', 'Campaign_code', 'Format',
        'TA', 'Funnel', 'Region', 'freetext_2'
    ]
    
    agg_dict = {col: 'sum' for col in numeric_cols if col != 'reach'}
    agg_dict['reach'] = 'first'

    # --- Step 2: Ensure date columns are datetime ---
    df['report_date'] = pd.to_datetime(df['report_date'], errors='coerce')
    for date_col in ['Actual_Start_Date', 'Actual_End_Date', 'Plan_Start_Date', 'Plan_End_Date']:
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    # --- Step 3: Group aggregate first ---
    groupdf = df.groupby(group_cols, dropna=False).agg(agg_dict).reset_index()

    # --- Step 4: Get min/max report_date per campaign ---
    campaign_report_dates = (
        groupdf.groupby('Campaign_code')['report_date']
        .agg(['min', 'max'])
        .reset_index()
        .rename(columns={'min': 'report_date_min', 'max': 'report_date_max'})
    )
    campaign_report_dates['report_date_min'] = campaign_report_dates['report_date_min'].dt.normalize()
    campaign_report_dates['report_date_max'] = campaign_report_dates['report_date_max'].dt.normalize()

    # --- Step 5: Build full date grid ---
    full_grid = []
    for _, row in campaign_report_dates.iterrows():
        campaign_code = row['Campaign_code']
        min_date = row['report_date_min']
        max_date = row['report_date_max']
        
        if pd.isna(min_date) or pd.isna(max_date):
            continue

        date_range = pd.date_range(min_date, max_date, freq='D')
        
        campaign_groups = groupdf[groupdf['Campaign_code'] == campaign_code]
        unique_groups = campaign_groups.drop(columns=['report_date'] + numeric_cols).drop_duplicates()
        
        for _, grp in unique_groups.iterrows():
            for d in date_range:
                full_grid.append({**grp.to_dict(), 'report_date': d})

    full_grid_df = pd.DataFrame(full_grid)

    # --- Step 6: Merge grid with grouped data ---
    groupdf = full_grid_df.merge(groupdf, on=group_cols, how='left')

    # New
    groupdf['freetext_2'] = groupdf.groupby(
    [col for col in group_cols if col != 'report_date']
)['freetext_2'].transform(lambda x: x.ffill().bfill())
    # --- Step 7: Fill missing numeric values ---
    for col in numeric_cols:
        if col not in groupdf.columns:
            groupdf[col] = 0
    groupdf[numeric_cols] = groupdf[numeric_cols].fillna(0)

    # --- Step 8: Merge report date min/max ---
    groupdf = groupdf.merge(campaign_report_dates, on='Campaign_code', how='left')

    # --- Step 9: Compute durations ---
    groupdf['campaign_duration'] = (groupdf['Actual_End_Date'] - groupdf['Actual_Start_Date']).dt.days.clip(lower=0)
    groupdf['plan_campaign_duration'] = (groupdf['Plan_End_Date'] - groupdf['Plan_Start_Date']).dt.days.clip(lower=0)

    # --- Step 10: Compute active days ---
    groupdf['active_day'] = (groupdf['report_date'] - groupdf['Actual_Start_Date']).dt.days
    groupdf['plan_active_day'] = (groupdf['report_date_max'] - groupdf['Plan_Start_Date']).dt.days

    # --- Step 11: Sort + compute cumulative sums ---
    groupby_cols_for_cumsum = [col for col in group_cols if col != 'report_date']
    groupdf = groupdf.sort_values(by=groupby_cols_for_cumsum + ['report_date'])
    
    for col in numeric_cols:
        if col != 'reach':
            groupdf[f'cumsum_{col}'] = groupdf.groupby(groupby_cols_for_cumsum)[col].cumsum()

    return groupdf

groupdf = preprocess_campaign_data(report_campaign_overall_total)


In [145]:
numeric_cols = ['Impression', 'Clicks', 'Views', 'Cost', 'Engagements', 'Video_Plays_25',
            'Video_Plays_50', 'Video_Plays_75', 'Video_Plays_100', '23s_Video_Views', 
            '6s_Engaged_Views', 'Installations', 'reach']
def replace_with_cumsum(df, keep=None, remove_prefix=False):
    """
    Replace original columns with their cumsum versions.

    Parameters:
    - df: DataFrame containing both original and cumsum_ columns.
    - keep: list of column names to keep as cumsum_ only (e.g. ['Clicks', 'Cost']).
    - remove_prefix: bool, if True, renames 'cumsum_Clicks' → 'Clicks'.

    Returns:
    - Modified DataFrame.
    """
    if keep is None:
        cols_to_drop = [col for col in df.columns if col.startswith('cumsum_')]
        return df.drop(columns=cols_to_drop)

    for col in keep:
        cumsum_col = f'cumsum_{col}'
        if col in df.columns and cumsum_col in df.columns:
            df = df.drop(columns=[col])
            if remove_prefix:
                df = df.rename(columns={cumsum_col: col})
        elif cumsum_col in df.columns and remove_prefix:
            df = df.rename(columns={cumsum_col: col})

    return df

groupdf2 = replace_with_cumsum(groupdf, keep=numeric_cols, remove_prefix=True)
groupdf1 = replace_with_cumsum(groupdf, keep=None)

In [146]:
groupdf2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24400 entries, 0 to 24399
Data columns (total 37 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Category                24400 non-null  object        
 1   Brand                   10161 non-null  object        
 2   Campaign                10161 non-null  object        
 3   Platform                10161 non-null  object        
 4   Audience_group          24400 non-null  object        
 5   Campaign_phase          24400 non-null  object        
 6   Buying_Method           23867 non-null  object        
 7   Plan_Start_Date         10161 non-null  datetime64[ns]
 8   Plan_End_Date           10161 non-null  datetime64[ns]
 9   Actual_Start_Date       4675 non-null   datetime64[ns]
 10  Actual_End_Date         4821 non-null   datetime64[ns]
 11  Campaign_code           24400 non-null  object        
 12  Format                  9917 non-null   object     

In [68]:
groupdf1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21917 entries, 0 to 21916
Data columns (total 37 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Category                21917 non-null  object        
 1   Brand                   8653 non-null   object        
 2   Campaign                8653 non-null   object        
 3   Platform                8653 non-null   object        
 4   Audience_group          21917 non-null  object        
 5   Campaign_phase          21917 non-null  object        
 6   Buying_Method           21423 non-null  object        
 7   Plan_Start_Date         8653 non-null   datetime64[ns]
 8   Plan_End_Date           8653 non-null   datetime64[ns]
 9   Actual_Start_Date       4555 non-null   datetime64[ns]
 10  Actual_End_Date         4695 non-null   datetime64[ns]
 11  Campaign_code           21917 non-null  object        
 12  Format                  8409 non-null   object     

In [147]:
import pandas as pd

def merge_ga_with_report(df_ga_summary, report_campaign_summary, merge_cols,
                         ga_prefix='ga', report_prefix='total'):
    """
    Normalize, group, and merge GA summary with campaign report summary.

    Parameters:
    -----------
    df_ga_summary : pd.DataFrame
        GA summary dataframe
    report_campaign_summary : pd.DataFrame
        Campaign report dataframe
    merge_cols : list of str
        Columns to normalize and merge on
    ga_prefix : str, optional
        Prefix for GA normalized columns (default 'ga')
    report_prefix : str, optional
        Prefix for report normalized columns (default 'total')

    Returns:
    --------
    pd.DataFrame
        Merged dataframe with summed GA metrics
    """
    
    def normalize_cols(df, cols, prefix):
        df = df.copy()  # Avoid modifying original
        for col in cols:
            if pd.api.types.is_datetime64_any_dtype(df[col]):
                df[f'{prefix}_{col}'] = pd.to_datetime(df[col]).dt.strftime('%Y-%m-%d')
            else:
                df[f'{prefix}_{col}'] = (
                    df[col].astype(str)
                          .str.strip()
                          .str.lower()
                          .str.replace(r'\s+', '', regex=True)
                )
        return df
    
    # Normalize both dataframes
    df_ga_norm = normalize_cols(df_ga_summary, merge_cols, ga_prefix)
    report_norm = normalize_cols(report_campaign_summary, merge_cols, report_prefix)
    
    # Group GA summary
    ga_grouped = df_ga_norm.groupby(
        [f'{ga_prefix}_{col}' for col in merge_cols],
        as_index=False
    ).sum(numeric_only=True)
    
    # Merge with report
    merged = pd.merge(
        report_norm,
        ga_grouped,
        how='left',
        left_on=[f'{report_prefix}_{col}' for col in merge_cols],
        right_on=[f'{ga_prefix}_{col}' for col in merge_cols],
        suffixes=('', f'_{ga_prefix}')
    )
    
    # Drop helper columns
    drop_cols = [col for col in merged.columns 
                 if col.startswith(f'{ga_prefix}_') 
                 or col.startswith(f'{report_prefix}_') 
                 or col.endswith(f'_{report_prefix}')]
    
    merged = merged.drop(columns=drop_cols)
    
    return merged

merge_cols = ['Campaign', 'report_date', 'Category', 'Platform', 'Brand', 'TA', 'Buying_Method', 'Actual_Start_Date']

merged_df_ga = merge_ga_with_report(df_ga_summary, groupdf2, merge_cols)
merged_df_ga1 = merge_ga_with_report(df_ga_summary, groupdf1, merge_cols)

In [148]:
df_plan = df_plan.rename(columns={"Camp_type": "Format"})

In [149]:
df_plan['Campaign_phase'].fillna('No Data', inplace=True)
merged_df_ga['Campaign_phase'].fillna('No Data', inplace=True)
merged_df_ga1['Campaign_phase'].fillna('No Data', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_plan['Campaign_phase'].fillna('No Data', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df_ga['Campaign_phase'].fillna('No Data', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on 

In [150]:
merged_df_ga['Campaign_phase'].replace('', pd.NA, inplace=True)
merged_df_ga.Campaign_phase.fillna('No Data', inplace=True)

merged_df_ga1['Campaign_phase'].replace('', pd.NA, inplace=True)
merged_df_ga1.Campaign_phase.fillna('No Data', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df_ga['Campaign_phase'].replace('', pd.NA, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df_ga.Campaign_phase.fillna('No Data', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

In [151]:
import pandas as pd
import numpy as np

def merge_with_optional_filter(
    df_plan, merged_df_ga, 
    merge_cols, 
    optional_col='optional', 
    freetext_col='freetext_2',
    placeholder='No Data'
):
    """
    Merge df_plan with merged_df_ga on merge_cols, handling optional column logic:
    - Rows without optional: normal merge.
    - Rows with optional: merge + filter where freetext_col contains optional_col.

    Parameters:
    - df_plan: DataFrame with plan data.
    - merged_df_ga: DataFrame with aggregated data.
    - merge_cols: list of columns to merge on.
    - optional_col: name of optional column (default 'optional').
    - freetext_col: name of freetext column to match against (default 'freetext_2').
    - placeholder: placeholder string for NA merge keys (default 'missing').

    Returns:
    - merged DataFrame.
    """
    # Fill NA for merge keys
    df_plan = df_plan.copy()
    merged_df_ga = merged_df_ga.copy()
    df_plan[merge_cols] = df_plan[merge_cols].fillna(placeholder)
    merged_df_ga[merge_cols] = merged_df_ga[merge_cols].fillna(placeholder)

    # Ensure date columns are datetime
    date_cols = ['Plan_Start_Date', 'Actual_Start_Date']
    for col in date_cols:
        if col in df_plan.columns:
            df_plan[col] = pd.to_datetime(df_plan[col], errors='coerce')
        if col in merged_df_ga.columns:
            merged_df_ga[col] = pd.to_datetime(merged_df_ga[col], errors='coerce')

    # Split by optional presence
    df_with_optional = df_plan[df_plan[optional_col].notna() | (df_plan[optional_col] != '')].copy()
    df_without_optional = df_plan[df_plan[optional_col].isna() | (df_plan[optional_col] == '')].copy()

    # Part 1: merge without optional
    merged_without_optional = pd.merge(
        df_without_optional,
        merged_df_ga,
        how='left',
        on=merge_cols,
        suffixes=('', '_total')
    )

    # Part 2: merge with optional + filter freetext
    temp_merge = pd.merge(
        df_with_optional,
        merged_df_ga,
        how='left',
        on=merge_cols,
        suffixes=('', '_total')
    )

    # Convert to unicode for np.char.find
    optional_array = temp_merge[optional_col].astype(str).to_numpy(dtype='U')
    freetext_array = temp_merge[freetext_col].astype(str).to_numpy(dtype='U')

    matches = np.char.find(freetext_array, optional_array) >= 0
    temp_merge_filtered = temp_merge[matches]

    # Combine
    merged_df = pd.concat([merged_without_optional, temp_merge_filtered], ignore_index=True)

    return merged_df


merge_cols = [
    'Campaign', 'Funnel', 'Category', 'Region', 'Platform', 'Brand', 'TA',
    'Buying_Method', 'Plan_Start_Date', 'Actual_Start_Date',
    'Audience_group', 'Campaign_phase'
]

merged_df = merge_with_optional_filter(
    df_plan,
    merged_df_ga,
    merge_cols,
    optional_col='optional',
    freetext_col='freetext_2'
)

merged_df1 = merge_with_optional_filter(
    df_plan,
    merged_df_ga1,
    merge_cols,
    optional_col='optional',
    freetext_col='freetext_2'
)

  merged_df_ga[col] = pd.to_datetime(merged_df_ga[col], errors='coerce')
  merged_df_ga[col] = pd.to_datetime(merged_df_ga[col], errors='coerce')
  merged_df_ga[col] = pd.to_datetime(merged_df_ga[col], errors='coerce')
  merged_df_ga[col] = pd.to_datetime(merged_df_ga[col], errors='coerce')


In [152]:
merged_df = merged_df.drop(columns=[
    col for col in merged_df.columns
    if any(col == f'plan_{base}' or col == f'{base}_total' for base in merge_cols)
])
merged_df

merged_df1 = merged_df1.drop(columns=[
    col for col in merged_df1.columns
    if any(col == f'plan_{base}' or col == f'{base}_total' for base in merge_cols)
])
merged_df1

Unnamed: 0,Funnel,Campaign,Category,Brand,Plan_Start_Date,Plan_End_Date,Line_code,Format,Region,Platform,...,campaign_duration,plan_campaign_duration,active_day,plan_active_day,sessions,engaged_sessions,add_to_carts,purchase_revenue,ecommerce_purchases,screen_page_views
0,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,13.0,13.0,0.0,12.0,,,,,,
1,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,13.0,13.0,1.0,12.0,,,,,,
2,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,13.0,13.0,2.0,12.0,,,,,,
3,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,13.0,13.0,3.0,12.0,,,,,,
4,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,13.0,13.0,4.0,12.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3467,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,,,,,,,
3468,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,,,,,,,
3469,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,,,,,,,
3470,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,,,,,,,


In [153]:
import numpy as np



merged_df['report_date'] = pd.to_datetime(merged_df['report_date'])
merged_df['Plan_Start_Date'] = pd.to_datetime(merged_df['Plan_Start_Date'])

# Calculate custom week number from Start_Date_x, starting from Monday
merged_df['weeknum'] = ((merged_df['report_date'] - merged_df['Plan_Start_Date']).dt.days // 7) + 1
merged_df['weeknum'] = merged_df['weeknum'].fillna(0).astype(int)
# Format the new date column
merged_df['formatted_date'] = merged_df['report_date'].dt.strftime('%b %d, %Y') + \
                               ' - ' + merged_df['report_date'].dt.strftime('%a') + \
                               ' - Week ' + merged_df['weeknum'].astype(str)

# latest_active_df = latest_active_df.rename(columns={
#     'campaign_code': 'Campaign_code',
#     'latest_active_date': 'Lasted_Day'
# })

# # Perform the merge
# merged_df = merged_df.merge(
#     latest_active_df,
#     on='Campaign_code',
#     how='left'
# )

merged_df['Audience'] = merged_df['Audience_group'].astype(str) + '\n' + merged_df['TA'].astype(str)


# Define conditions and corresponding choices
conditions = [
    merged_df['KPI_Metric'] == 'Impression',
    merged_df['KPI_Metric'] == 'View',
    merged_df['KPI_Metric'] == 'Engagement',
    merged_df['KPI_Metric'] == 'Click',
    merged_df['KPI_Metric'] == 'AppInstall',
    merged_df['KPI_Metric'] == 'Reach'

]

choices = [
    merged_df['Impression'],
    merged_df['Views'],
    merged_df['Engagements'],
    merged_df['Clicks'],
    merged_df['Installations'],
    merged_df['reach']

]

# Create KPI_actual column
merged_df['KPI_actual'] = np.select(conditions, choices, default=np.nan)

In [154]:
merged_df1['Audience'] = merged_df1['Audience_group'].astype(str) + '\n' + merged_df['TA'].astype(str)


In [155]:
merged_df['Lasted'] = merged_df['report_date'] == merged_df['report_date_max']

In [159]:
import pandas as pd

# Ensure 'report_date' is a datetime type
merged_df['report_date'] = pd.to_datetime(merged_df['report_date'], errors='coerce')

# Get today's date
today = pd.to_datetime('today').normalize()

# Compute base run_rate
merged_df['run_rate'] = merged_df['active_day'] / merged_df['campaign_duration']

# Apply condition: if Lasted == True and report_date < today
condition = (merged_df['Lasted'] == True) & (merged_df['report_date'] < today)
merged_df.loc[condition, 'run_rate'] = 1.0

# Cap run_rate at 1.0
merged_df['run_rate'] = merged_df['run_rate'].clip(upper=1.0)

In [162]:
merged_df

Unnamed: 0,Funnel,Campaign,Category,Brand,Plan_Start_Date,Plan_End_Date,Line_code,Format,Region,Platform,...,add_to_carts,purchase_revenue,ecommerce_purchases,screen_page_views,weeknum,formatted_date,Audience,KPI_actual,Lasted,run_rate
0,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,,,,,1,"May 12, 2025 - Mon - Week 1",MIXED\nF25-44,0.0,False,0.000000
1,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,,,,,1,"May 13, 2025 - Tue - Week 1",MIXED\nF25-44,1712980.0,False,0.076923
2,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,,,,,1,"May 14, 2025 - Wed - Week 1",MIXED\nF25-44,1712980.0,False,0.153846
3,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,,,,,1,"May 15, 2025 - Thu - Week 1",MIXED\nF25-44,1712980.0,False,0.230769
4,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,,,,,1,"May 16, 2025 - Fri - Week 1",MIXED\nF25-44,1712980.0,False,0.307692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3467,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,,0,,Group3\nF25-44,,False,
3468,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,,0,,Group2\nF25-44,,False,
3469,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,,0,,Group1\nF25-44,,False,
3470,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,,0,,Group1\nF25-44,,False,


In [163]:
print("Negative active_day values:\n", merged_df[merged_df['active_day'] < 0])
print("Negative campaign_duration values:\n", merged_df[merged_df['campaign_duration'] < 0])

Negative active_day values:
          Funnel          Campaign Category  Brand Plan_Start_Date  \
26    Awareness       ADMMVLaunch       SN    ADM      2025-05-12   
27    Awareness       ADMMVLaunch       SN    ADM      2025-05-12   
52    Awareness       ADMMVLaunch       SN    ADM      2025-05-12   
53    Awareness       ADMMVLaunch       SN    ADM      2025-05-12   
65    Awareness       ADMMVLaunch       SN    ADM      2025-05-12   
...         ...               ...      ...    ...             ...   
3237  Awareness  FM100PromoAprMay       SN  FM100      2025-03-24   
3238  Awareness  FM100PromoAprMay       SN  FM100      2025-03-24   
3239  Awareness  FM100PromoAprMay       SN  FM100      2025-03-24   
3240  Awareness  FM100PromoAprMay       SN  FM100      2025-03-24   
3241  Awareness  FM100PromoAprMay       SN  FM100      2025-03-24   

     Plan_End_Date  Line_code               Format Region    Platform  ...  \
26      2025-05-25        3.0  View Reach Campaign    35C  Googl

In [161]:
import numpy as np

merged_df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [164]:
from sqlalchemy import create_engine
import pandas as pd

DB_TYPE = 'mysql'  # Change to 'postgresql' for PostgreSQL
DB_HOST = '125.212.245.36'
DB_PORT = '3306'  # Change for different databases
DB_USER = 'dashboard_report_vinamilk'
DB_PASS = '6vtRhNxa79zPsfs5'
DB_NAME = 'dashboard_report_vinamilk'

try:
    conn = mysql.connector.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASS,
        database=DB_NAME,
        charset="utf8mb4"
    )
    
    if conn.is_connected():
        cursor = conn.cursor()
        print("Connected to MySQL successfully!")
    else:
        print("Failed to connect.")

except mysql.connector.Error as e:
    print(f"Error: {e}")

# Create SQLAlchemy engine
cursor = conn.cursor()
cursor.execute("SET NAMES utf8mb4;") 
cursor.execute("SET CHARACTER SET utf8mb4;")
engine = create_engine(f"mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}")

def upload_to_mysql(df, table_name, engine, if_exists='replace'):
    """
    Uploads a DataFrame to a MySQL table with correct column types based on the DataFrame.

    :param df: Pandas DataFrame to upload
    :param table_name: Name of the table in the database
    :param engine: SQLAlchemy engine object
    :param if_exists: What to do if the table already exists ('fail', 'replace', 'append')
    """
    try:
        with engine.connect() as conn:
            metadata = MetaData()

            # Define table schema based on DataFrame's column types
            columns = []

            for col in df.columns:
                dtype = df[col].dtype
                
                # Map Pandas dtype to MySQL/SQLAlchemy types
                if pd.api.types.is_integer_dtype(dtype):
                    col_type = Integer
                elif pd.api.types.is_float_dtype(dtype):
                    col_type = Float
                elif pd.api.types.is_datetime64_any_dtype(dtype):
                    col_type = DateTime
                elif pd.api.types.is_bool_dtype(dtype):
                    col_type = Boolean
                elif pd.api.types.is_string_dtype(dtype):
                    col_type = Text(collation="utf8mb4_unicode_ci")  # UTF-8 support
                else:
                    col_type = String(255, collation="utf8mb4_unicode_ci")  # Default fallback

                columns.append(Column(col, col_type))

            table = Table(table_name, metadata, *columns, extend_existing=True)

            # Create the table with utf8mb4 encoding
            metadata.create_all(conn)

        # Convert object columns to string before uploading
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = df[col].astype(str)

        # Upload DataFrame to MySQL with proper type mapping
        dtype_mapping = {
            col: (
                sqlalchemy.types.Integer if pd.api.types.is_integer_dtype(df[col]) else
                sqlalchemy.types.Float if pd.api.types.is_float_dtype(df[col]) else
                sqlalchemy.types.DateTime if pd.api.types.is_datetime64_any_dtype(df[col]) else
                sqlalchemy.types.Boolean if pd.api.types.is_bool_dtype(df[col]) else
                sqlalchemy.types.Text(collation="utf8mb4_unicode_ci") if pd.api.types.is_string_dtype(df[col]) else
                sqlalchemy.types.String(255, collation="utf8mb4_unicode_ci")
            )
            for col in df.columns
        }

        df.to_sql(table_name, engine, if_exists=if_exists, index=False, dtype=dtype_mapping)

        print(f"✅ Data successfully uploaded to `{table_name}` with correct column types!")

    except Exception as e:
        print(f"❌ Error uploading data to `{table_name}`: {e}")

# Example usage:
# engine = create_engine(f"mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}")
upload_to_mysql(merged_df, 'report_campaign_overall_total', engine)
upload_to_mysql(merged_df1, 'report_campaign_overall_total_notcs', engine)

Connected to MySQL successfully!
✅ Data successfully uploaded to `report_campaign_overall_total` with correct column types!
✅ Data successfully uploaded to `report_campaign_overall_total_notcs` with correct column types!


In [171]:
df_yt_filter.columns

Index(['report_date', 'customer', 'customer_id', 'campaign_id', 'ad_group',
       'ad_group_id', 'ad_id', 'currency_code', 'impressions', 'clicks',
       'cost', 'views', 'engagements', 'video_25', 'video_50', 'video_75',
       'video_100', 'conversions', 'view_through_conv', 'all_conv_value',
       'revenue_micros', 'orders', 'ad_group_primary_status', 'campaign_name',
       'ad_name', 'Funnel', 'camp_code', 'platform', 'camp_type',
       'buying_type', 'free_text1', 'year', 'KPI Metric', 'buying_model',
       'placement', 'Campaign phase', 'freetext_2', 'TA', 'bidding_strategy',
       'creative_type', 'creative_format', 'creative_length', 'freetext_3',
       'Category', 'Brand Name', 'Campaign Name', 'Plan Start Date',
       'Plan End Date', 'Audience group', 'Gender', 'Age', 'Region', 'Device',
       'Os', 'ad_format', 'Actual Start Date', 'Actual End Date', 'Platform',
       'Cost'],
      dtype='object')

In [80]:
merged_df


Unnamed: 0,Funnel,Campaign,Category,Brand,Plan_Start_Date,Plan_End_Date,Line_code,Format,Region,Platform,...,engaged_sessions,add_to_carts,purchase_revenue,ecommerce_purchases,screen_page_views,weeknum,formatted_date,Audience,KPI_actual,Lasted
0,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,,,,,,1,"May 12, 2025 - Mon - Week 1",MIXED\nF25-44,0.0,False
1,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,,,,,,1,"May 13, 2025 - Tue - Week 1",MIXED\nF25-44,1712980.0,False
2,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,,,,,,1,"May 14, 2025 - Wed - Week 1",MIXED\nF25-44,1712980.0,False
3,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,,,,,,1,"May 15, 2025 - Thu - Week 1",MIXED\nF25-44,1712980.0,False
4,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,,,,,,1,"May 16, 2025 - Fri - Week 1",MIXED\nF25-44,1712980.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3293,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,,,0,,Group3\nF25-44,,False
3294,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,,,0,,Group2\nF25-44,,False
3295,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,,,0,,Group1\nF25-44,,False
3296,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,,,0,,Group1\nF25-44,,False


In [379]:
merged_df_ga1[merged_df_ga1['Campaign_code'].str.contains('FinoPromoSustain')]

Unnamed: 0,Category,Brand,Campaign,Platform,Audience_group,Campaign_phase,Buying_Method,Plan_Start_Date,Plan_End_Date,Actual_Start_Date,...,ga_Brand,ga_TA,ga_Buying_Method,ga_Actual_Start_Date,sessions,engaged_sessions,add_to_carts,purchase_revenue,ecommerce_purchases,screen_page_views
19310,SN,Fino,FinoPromoSustain,Facebook,Group1 (focus),No data,CPM,2025-03-12,2025-03-31,2025-03-12,...,,,,,,,,,,
19311,SN,Fino,FinoPromoSustain,Facebook,Group1 (focus),No data,CPM,2025-03-12,2025-03-31,2025-03-12,...,,,,,,,,,,
19312,SN,Fino,FinoPromoSustain,Facebook,Group1 (focus),No data,CPM,2025-03-12,2025-03-31,2025-03-12,...,,,,,,,,,,
19313,SN,Fino,FinoPromoSustain,Facebook,Group1 (focus),No data,CPM,2025-03-12,2025-03-31,2025-03-12,...,,,,,,,,,,
19314,SN,Fino,FinoPromoSustain,Facebook,Group1 (focus),No data,CPM,2025-03-12,2025-03-31,2025-03-12,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19571,SN,Fino,FinoPromoSustain,YouTube,MIXED,No data,CPM,2025-03-12,2025-03-31,2025-03-21,...,,,,,,,,,,
19572,SN,Fino,FinoPromoSustain,YouTube,MIXED,No data,CPM,2025-03-12,2025-03-31,2025-03-21,...,,,,,,,,,,
19573,SN,Fino,FinoPromoSustain,YouTube,MIXED,No data,CPM,2025-03-12,2025-03-31,2025-03-21,...,,,,,,,,,,
19574,SN,Fino,FinoPromoSustain,YouTube,MIXED,No data,CPM,2025-03-12,2025-03-31,2025-03-21,...,,,,,,,,,,


In [345]:
df_plan

Unnamed: 0,Funnel,Campaign,Category,Brand,Plan_Start_Date,Plan_End_Date,Line_code,Format,Region,Platform,...,frequency_combine_estimate_week,er_estimate,er_bm,vtr_estimate,vtr_bm,ctr_estimate,ctr_bm,exchange_rate,Actual_Start_Date,Actual_End_Date
0,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,1.0,Reach,26C,Facebook,...,,0.025,,0.021000,,0.00150,0.0010,26000.0,2025-05-12,2025-05-25
1,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,2.0,Reach,37C,Facebook,...,,0.025,,0.021000,,0.00150,0.0010,26000.0,2025-05-12,2025-05-25
2,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,3.0,View Reach Campaign,35C,Google Ads,...,,,,0.145000,,0.00200,0.0010,26000.0,2025-05-14,2025-05-25
3,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,,View Reach Campaign,26C,Google Ads,...,,,,0.145000,,0.00200,0.0010,26000.0,2025-05-12,2025-05-25
4,Awareness,ADMMVLaunch,SN,ADM,2025-05-12,2025-05-25,,View Reach Campaign,NAT,Google Ads,...,,,,0.230000,,0.00200,0.0010,26000.0,2025-05-14,2025-05-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,0.012340,0.04,0.00080,0.0017,26000.0,2025-01-03,2025-01-21
106,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,0.009538,0.04,0.00080,0.0017,26000.0,2025-01-03,2025-01-21
107,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,0.027049,0.04,0.00100,0.0017,26000.0,2025-01-18,2025-01-21
108,Awareness,FM100PromoSustainJan,SN,FM100,2025-01-30,2025-01-21,,Reach,NAT,TikTok,...,,,,0.030000,0.04,0.00100,0.0017,26000.0,2025-01-03,2025-01-21


In [346]:
df_tt

Unnamed: 0,report_date,account_id,account_name,campaign_id,adgroup_id,adgroup_name,advertising_objective,ad_id,impressions,spend,...,ad_text,live_views,onsite_shopping,total_onsite_shopping_value,shop_total_items_purchased,onsite_initiate_checkout_count,onsite_on_web_cart,total_onsite_on_web_cart_value,campaign_name,ad_name
0,2025-05-08,7451448453630033921,VNM_SN_2025,1831635835217953,1832157308609553,PRO-F-2544-5C-AllDV-AllOS_AW-SN-ADM_ADMMVLaunc...,REACH,1832157309644849,0,0.000000,...,"Sữa ADM 23 dưỡng chất, cùng mẹ nuôi dưỡng ước ...",0,0,0,0,0,0,0,AW_SN-ADM-ADMMVLaunch-1205-2505_TT_REA_AUC_MIX...,VID_VER_6S_PRO-F-2544-5C-AllDV-AllOS_AW-SN-ADM...
1,2025-05-08,7451448453630033921,VNM_SN_2025,1831635835217953,1832157308609553,PRO-F-2544-5C-AllDV-AllOS_AW-SN-ADM_ADMMVLaunc...,REACH,1832157309654033,0,0.000000,...,"Sữa ADM 23 dưỡng chất, cùng mẹ nuôi dưỡng ước ...",0,0,0,0,0,0,0,AW_SN-ADM-ADMMVLaunch-1205-2505_TT_REA_AUC_MIX...,VID_VER_6S_PRO-F-2544-5C-AllDV-AllOS_AW-SN-ADM...
2,2025-05-08,7451448453630033921,VNM_SN_2025,1831635835217953,1832157079635058,PRO-F-2544-33C-AllDV-AllOS_AW-SN-ADM_ADMMVLaun...,REACH,1832157080213505,0,0.000000,...,"Sữa ADM 23 dưỡng chất, cùng mẹ nuôi dưỡng ước ...",0,0,0,0,0,0,0,AW_SN-ADM-ADMMVLaunch-1205-2505_TT_REA_AUC_MIX...,VID_VER_6S_PRO-F-2544-33C-AllDV-AllOS_AW-SN-AD...
3,2025-05-08,7451448453630033921,VNM_SN_2025,1831635835217953,1832157079635058,PRO-F-2544-33C-AllDV-AllOS_AW-SN-ADM_ADMMVLaun...,REACH,1832157080218721,0,0.000000,...,"Sữa ADM 23 dưỡng chất, cùng mẹ nuôi dưỡng ước ...",0,0,0,0,0,0,0,AW_SN-ADM-ADMMVLaunch-1205-2505_TT_REA_AUC_MIX...,VID_VER_6S_PRO-F-2544-33C-AllDV-AllOS_AW-SN-AD...
4,2025-05-08,7451448453630033921,VNM_SN_2025,1831629055212609,1832155793506402,PRO-F-2544-26C-AllDV-AllOS_AW-SN-ADM_ADMMVLaun...,REACH,1832155793532946,0,0.000000,...,"Sữa ADM 23 dưỡng chất, cùng mẹ nuôi dưỡng ước ...",0,0,0,0,0,0,0,AW_SN-ADM-ADMMVLaunch-1205-2505_TT_REA_AUC_MIX...,VID_VER_6S_PRO-F-2544-26C-AllDV-AllOS_AW-SN-AD...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560646,2025-06-05,7451532856985616385,VNM_Ecommerce_2025,1829357618415777,1829476045691937,RTG-LAL-P-1854-NAT-AllDV-AllOS_CVR-AlLCate-DTC...,PRODUCT_SALES,1829476051515409,0,0.000000,...,,0,0,0,0,0,0,0,CVR-AlLCate-DTC-KOL-Livestream-1504-1504_TT_LS...,VID_VER_NA_MIXED-P-1854-NAT-AllDV-AllOS_CVR-Al...
560647,2025-06-05,7451532856985616385,VNM_Ecommerce_2025,1829004618897633,1829473824999442,RTG-LAL-P-1854-NAT-AllDV-AllOS_CVR-SN-GreenFar...,PRODUCT_SALES,1829473825675346,0,0.000000,...,,0,0,0,0,0,0,0,CVR-SN-GreenFarm-AWO-Livestream-1004-3112_TT_L...,VID_VER_NA_MIXED-P-1854-NAT-AllDV-AllOS_CVR-SN...
560648,2025-06-05,7451532856985616385,VNM_Ecommerce_2025,1829357618415777,1829461952193569,RTG-LAL-P-1854-NAT-AllDV-AllOS_CVR-AlLCate-DTC...,PRODUCT_SALES,1829461954807953,0,0.000000,...,,0,0,0,0,0,0,0,CVR-AlLCate-DTC-KOL-Livestream-1504-1504_TT_LS...,VID_VER_NA_MIXED-P-1854-NAT-AllDV-AllOS_CVR-Al...
560649,2025-06-05,7451532856985616385,VNM_Ecommerce_2025,1829357618415777,1829449250161745,PROS-P-1854-NAT-AllDV-AllOS_CVR-AlLCate-DTC-KO...,PRODUCT_SALES,1829449251208370,0,0.000000,...,,0,0,0,0,0,0,0,CVR-AlLCate-DTC-KOL-Livestream-1504-1504_TT_LS...,VID_VER_NA_MIXED-P-1854-NAT-AllDV-AllOS_CVR-Al...


In [347]:
df_tt_check = df_tt[df_tt['campaign_name'].str.contains('cvr-sn', case=False)]
df_tt_check.campaign_name.unique()

array(['CVR-SN-FM100-B2BFM100FlavorApr-Jun-3105-3105_TT_LSA_AUC_MIXED-P-1854-15C-AllDV-AllOS_2025_ROAS_CPC_AllPL_',
       'CVR-SN-FM100-B2BFM100FlavorApr-Jun-3105-3105_TT_LSA_AUC_MIXED-P-1854-13C-AllDV-AllOS_2025_ROAS_CPC_AllPL_',
       'CVR-SN-FM100-B2BFM100FlavorApr-Jun-2905-2905_TT_LSA_AUC_MIXED-P-1854-15C-AllDV-AllOS_2025_ROAS_CPC_AllPL_',
       'CVR-SN-FM100-B2BFM100FlavorApr-Jun-2905-2905_TT_LSA_AUC_MIXED-P-1854-13C-AllDV-AllOS_2025_ROAS_CPC_AllPL_',
       'CVR-SN-FM100-B2BFM100FlavorApr-Jun-2705-2705_TT_LSA_AUC_MIXED-P-1854-15C-AllDV-AllOS_2025_ROAS_CPC_AllPL_',
       'CVR-SN-FM100-B2BFM100FlavorApr-Jun-2505-2505_TT_LSA_AUC_MIXED-P-1854-15C-AllDV-AllOS_2025_ROAS_CPC_AllPL_',
       'CVR-SN-FM100-B2BFM100FlavorApr-Jun-2705-2705_TT_LSA_AUC_MIXED-P-1854-13C-AllDV-AllOS_2025_ROAS_CPC_AllPL_',
       'CVR-SN-FM100-B2BFM100FlavorApr-Jun-2505-2505_TT_LSA_AUC_MIXED-P-1854-13C-AllDV-AllOS_2025_ROAS_CPC_AllPL_',
       'CVR-SN-GreenFarm-AWO-Livestream-1004-3112_TT_LSA_AUC_MIXED-P-185

In [348]:
df_yt

Unnamed: 0,report_date,customer,customer_id,campaign_id,ad_group,ad_group_id,ad_id,currency_code,impressions,clicks,...,engagements,video_25,video_50,video_75,video_100,conversions,view_through_conv,all_conv_value,campaign_name,ad_name
0,2025-01-01,VNM_D2C_ALL,6909791966,22139891518,Ad group 1,176460456671,729420008371,USD,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025_GOOGLE_ECOM_AllCate_AllBrand_D2C_DSA_AUCT...,
1,2025-01-01,VNM_D2C_ALL,6909791966,22139891518,All Cate,174967321323,734071089974,USD,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025_GOOGLE_ECOM_AllCate_AllBrand_D2C_DSA_AUCT...,
2,2025-01-01,VNM_D2C_ALL,6909791966,22139891518,BDD,178875007434,735913918223,USD,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025_GOOGLE_ECOM_AllCate_AllBrand_D2C_DSA_AUCT...,
3,2025-01-01,VNM_D2C_ALL,6909791966,22139891518,Dielac Grow Plus_SKU,175357483163,735279754255,USD,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025_GOOGLE_ECOM_AllCate_AllBrand_D2C_DSA_AUCT...,
4,2025-01-01,VNM_D2C_ALL,6909791966,22139891518,Green Farm,172204298381,733957810719,USD,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025_GOOGLE_ECOM_AllCate_AllBrand_D2C_DSA_AUCT...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176808,2025-06-04,VNM_SCUMS_Probi_2025_VND,9310672042,22477933642,SCU cách dùng,177701619319,748041428804,VND,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CSD-SCUMS-Probi-SustainAprJun-2304-1506_GG_SEM...,
176809,2025-06-04,VNM_SCUMS_Probi_2025_VND,9310672042,22477933642,SCU Chung,177704615439,747949895442,VND,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CSD-SCUMS-Probi-SustainAprJun-2304-1506_GG_SEM...,
176810,2025-06-04,VNM_SCUMS_Probi_2025_VND,9310672042,22477933642,SCU cho bé,178471521277,747969615826,VND,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CSD-SCUMS-Probi-SustainAprJun-2304-1506_GG_SEM...,
176811,2025-06-04,VNM_SCUMS_Probi_2025_VND,9310672042,22477933642,SCU tác dụng,178954129616,748061223587,VND,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CSD-SCUMS-Probi-SustainAprJun-2304-1506_GG_SEM...,


In [349]:
df_fb

Unnamed: 0,report_date,account_id,account_name,campaign_id,ad_group_id,ad_group_name,ad_id,cost,currency,impression,...,leads,on_facebook_leads,outbound_clicks,thruplays,post_save,purchase,video_played_2,custom_conversion,campaign_name,ad_name
0,2025-03-26,1575370433370329,VNM_SN_2025,120218678452170524,120218678452370524,OptREACH_PRO_F_25-44_NAT_Affinity_Group1-Exclu...,120218678452190524,541634.0,VND,46835,...,0,0,78,0,1,1,0,2,AW_SN-Fino-FinoPromoSustain-1203-3103_FB_REA_A...,VID_SQUARE_6S_MIXED-F-2544-28C-AllDV-AllOS_AW-...
1,2025-03-26,1575370433370329,VNM_SN_2025,120218678452170524,120218678452370524,OptREACH_PRO_F_25-44_NAT_Affinity_Group1-Exclu...,120218678985360524,762800.0,VND,63381,...,0,0,45,557,0,0,0,0,AW_SN-Fino-FinoPromoSustain-1203-3103_FB_REA_A...,video_VNM Sữa Dinh Dưỡng Mua 1 thùng tặng 2 bị...
2,2025-03-26,1575370433370329,VNM_SN_2025,120218704586520524,120218704586490524,OptREACH_PRO_F_25-44_NAT_Affinity_Group2-Exclu...,120218704586500524,20947.0,VND,2388,...,0,0,1,11,0,0,0,0,AW_SN-Fino-FinoPromoSustain-1203-3103_FB_REA_A...,video_VNM Sữa Dinh Dưỡng Mua 1 thùng tặng 2 bị...
3,2025-03-26,1575370433370329,VNM_SN_2025,120218704586520524,120218704586490524,OptREACH_PRO_F_25-44_NAT_Affinity_Group2-Exclu...,120218704586510524,25721.0,VND,3039,...,0,0,1,0,0,0,0,0,AW_SN-Fino-FinoPromoSustain-1203-3103_FB_REA_A...,photo_VNM Sữa Dinh Dưỡng Mua 1 thùng tặng 2 bị...
4,2025-03-26,1575370433370329,VNM_SN_2025,120218704800400524,120218704800410524,OptREACH_PRO_F_25-44_NAT_Affinity_Group2-Exclu...,120218704800420524,0.0,VND,0,...,0,0,0,0,0,0,0,0,AW_SN-Fino-FinoPromoSustain-1203-3103_FB_REA_A...,photo_VNM Sữa Dinh Dưỡng Mua 1 thùng tặng 2 bị...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24703,2025-06-10,896194135934321,VNM_D2C_ALL,120223624192080292,120223624192400292,PRODUCTS_All_New Customer_June,120224221646500292,1897.0,VND,22,...,0,0,0,1,0,0,0,0,2025_FACEBOOK_ECOM_AllCate_AllBrand_D2C_FBConv...,PRODUCTS_SCUGF_VIDEO_KOL_VNMLYHEALTHY1
24704,2025-06-10,896194135934321,VNM_D2C_ALL,120223624192080292,120223624192400292,PRODUCTS_All_New Customer_June,120224222114370292,2087.0,VND,38,...,0,0,2,3,0,0,0,1,2025_FACEBOOK_ECOM_AllCate_AllBrand_D2C_FBConv...,PRODUCTS_SCUGF_VIDEO_KOL_VNMLINHTHI1
24705,2025-06-10,896194135934321,VNM_D2C_ALL,120223624192080292,120223624192400292,PRODUCTS_All_New Customer_June,120224222354090292,1849.0,VND,37,...,0,0,0,1,0,0,0,0,2025_FACEBOOK_ECOM_AllCate_AllBrand_D2C_FBConv...,PRODUCTS_SCA_VIDEO_KOL_VNMQUYREVIEW1
24706,2025-06-10,896194135934321,VNM_D2C_ALL,120223624192080292,120223624192400292,PRODUCTS_All_New Customer_June,120224222748180292,380627.0,VND,8246,...,0,0,123,428,0,6,0,8,2025_FACEBOOK_ECOM_AllCate_AllBrand_D2C_FBConv...,PRODUCTS_FM100Banana_VIDEO_KOL_VNMTHUONGOI1
