# Library Setup

In [None]:
!pip install duckdb --upgrade



In [None]:
!pip install polars --upgrade

Collecting polars
  Downloading polars-1.10.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Downloading polars-1.10.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.2/33.2 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: polars
  Attempting uninstall: polars
    Found existing installation: polars 1.7.1
    Uninstalling polars-1.7.1:
      Successfully uninstalled polars-1.7.1
Successfully installed polars-1.10.0


In [None]:
import duckdb
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import HTML

In [None]:
# Mount Google Drive under /content/drive/ so the CSV files stored there can be
# read; force_remount=True requests a fresh mount even if one already exists.
from google.colab import drive

drive.mount(mountpoint='/content/drive/', force_remount=True)

Mounted at /content/drive/


# Table Preparation

In [None]:
# Connect to DuckDB (no path given, so the database lives in memory only)
conn = duckdb.connect()

# Metric columns are declared DOUBLE (8-byte) instead of FLOAT (4-byte single
# precision). Single precision keeps only about 7 significant digits, which
# rounds decimal spend values and makes sums over thousands of rows inexact.

# Temporary table for the Main data: one row per date / campaign / adset / ad /
# publisher platform / platform position, with the delivery metrics.
create_main_table_query = """
CREATE TEMPORARY TABLE main (
    date DATE,
    campaign_id VARCHAR,
    campaign_name VARCHAR,
    adset_id VARCHAR,
    adset_name VARCHAR,
    ad_id VARCHAR,
    ad_name VARCHAR,
    objective VARCHAR,
    adset_optimization_goal VARCHAR,
    publisher_platform VARCHAR,
    platform_position VARCHAR,
    impressions DOUBLE,
    clicks DOUBLE,
    video_view DOUBLE,
    spend DOUBLE
);
"""

# Temporary table for the Omni data: CPAS funnel metrics per date / campaign /
# adset / ad.
create_omni_table_query = """
CREATE TEMPORARY TABLE omni (
    date DATE,
    campaign_id VARCHAR,
    campaign_name VARCHAR,
    adset_id VARCHAR,
    adset_name VARCHAR,
    ad_id VARCHAR,
    ad_name VARCHAR,
    cpas_view_content DOUBLE,
    cpas_add_to_cart DOUBLE,
    cpas_purchase DOUBLE,
    cpas_purchase_value DOUBLE
);
"""

# Temporary table for the Reach data: weekly reach per date / campaign.
create_reach_table_query = """
CREATE TEMPORARY TABLE reach (
    date DATE,
    campaign_id VARCHAR,
    campaign_name VARCHAR,
    weekly_reach DOUBLE
);
"""

# Execute the queries to create the three temporary tables
conn.execute(create_main_table_query)
conn.execute(create_omni_table_query)
conn.execute(create_reach_table_query)

<duckdb.duckdb.DuckDBPyConnection at 0x7b4646471db0>

In [None]:
# Path of each source CSV file on the mounted Drive
main_csv_file_path = "/content/drive/MyDrive/GroupM Technical Test/data/Main.csv"
omni_csv_file_path = "/content/drive/MyDrive/GroupM Technical Test/data/Omni.csv"
reach_csv_file_path = "/content/drive/MyDrive/GroupM Technical Test/data/Reach.csv"

# One COPY statement per table to load the data from its CSV file
load_main_table_query = f"COPY main FROM '{main_csv_file_path}';"
load_omni_table_query = f"COPY omni FROM '{omni_csv_file_path}';"
load_reach_table_query = f"COPY reach FROM '{reach_csv_file_path}';"

# COPY appends to the target table, so running this cell a second time would
# load every row twice. Empty each table first so the cell can be re-run
# safely and always leaves exactly one copy of the CSV contents.
for table_name, load_query in (
    ("main", load_main_table_query),
    ("omni", load_omni_table_query),
    ("reach", load_reach_table_query),
):
    conn.execute(f"DELETE FROM {table_name};")
    conn.execute(load_query)

<duckdb.duckdb.DuckDBPyConnection at 0x7b4646471db0>

In [None]:
# Sanity-check the load: print the first five rows of `main`
main_preview = conn.sql("SELECT * FROM main LIMIT 5;")
main_preview.show()

┌────────────┬────────────────────┬───────────────┬────────────────────┬────────────┬────────────────────┬─────────┬───────────────────────┬─────────────────────────┬────────────────────┬───────────────────┬─────────────┬────────┬────────────┬─────────┐
│    date    │    campaign_id     │ campaign_name │      adset_id      │ adset_name │       ad_id        │ ad_name │       objective       │ adset_optimization_goal │ publisher_platform │ platform_position │ impressions │ clicks │ video_view │  spend  │
│    date    │      varchar       │    varchar    │      varchar       │  varchar   │      varchar       │ varchar │        varchar        │         varchar         │      varchar       │      varchar      │    float    │ float  │   float    │  float  │
├────────────┼────────────────────┼───────────────┼────────────────────┼────────────┼────────────────────┼─────────┼───────────────────────┼─────────────────────────┼────────────────────┼───────────────────┼─────────────┼────────┼────────

In [None]:
# Sanity-check the load: print the first five rows of `omni`
omni_preview = conn.sql("SELECT * FROM omni LIMIT 5;")
omni_preview.show()

┌────────────┬────────────────────┬───────────────┬────────────────────┬────────────┬────────────────────┬─────────┬───────────────────┬──────────────────┬───────────────┬─────────────────────┐
│    date    │    campaign_id     │ campaign_name │      adset_id      │ adset_name │       ad_id        │ ad_name │ cpas_view_content │ cpas_add_to_cart │ cpas_purchase │ cpas_purchase_value │
│    date    │      varchar       │    varchar    │      varchar       │  varchar   │      varchar       │ varchar │       float       │      float       │     float     │        float        │
├────────────┼────────────────────┼───────────────┼────────────────────┼────────────┼────────────────────┼─────────┼───────────────────┼──────────────────┼───────────────┼─────────────────────┤
│ 2021-09-20 │ TYXM59637062913172 │ Campaign 6    │ URGG05824517958902 │ Adset 27   │ RUTI52564008529907 │ Ad 71   │              NULL │             NULL │          NULL │                NULL │
│ 2021-09-20 │ TYXM59637062913

In [None]:
# Sanity-check the load: print the first five rows of `reach`
reach_preview = conn.sql("SELECT * FROM reach LIMIT 5;")
reach_preview.show()

┌────────────┬────────────────────┬───────────────┬──────────────┐
│    date    │    campaign_id     │ campaign_name │ weekly_reach │
│    date    │      varchar       │    varchar    │    float     │
├────────────┼────────────────────┼───────────────┼──────────────┤
│ 2021-06-14 │ FHBM76189796639257 │ Campaign 2    │     420932.0 │
│ 2021-09-27 │ OUGE58946647411374 │ Campaign 7    │      90288.0 │
│ 2021-09-27 │ TYXM59637062913172 │ Campaign 6    │     116609.0 │
│ 2021-04-05 │ YCQJ24855263004453 │ Campaign 8    │     372033.0 │
│ 2021-12-06 │ BDXX17521802016941 │ Campaign 4    │     101008.0 │
└────────────┴────────────────────┴───────────────┴──────────────┘



# Join the Tables

In [None]:
# Build the joined dataset: every row of Main, enriched with the Omni (CPAS)
# metrics and the weekly reach value. No aggregation is performed.
# - Both joins are LEFT JOINs, so Main rows without a match are kept and get
#   NULL in the added columns.
# - Omni is matched on (date, campaign_id, adset_id, ad_id).
# - Reach is matched on (date, campaign_id, campaign_name), i.e. on the exact
#   date. NOTE(review): the column is called weekly_reach, so presumably only
#   Main rows dated on a reach date receive a value — confirm this is intended.
# - DISTINCT is applied in both CTEs and in the final SELECT, so exact
#   duplicate rows are collapsed at each of those steps.
# - The string carries an f-prefix but contains no placeholders.
perform_join_tables_query = f"""
-- Perform the joins without aggregation
WITH OmniRaw AS (
  SELECT DISTINCT
    date,
    campaign_id,
    adset_id,
    ad_id,
    cpas_view_content,
    cpas_add_to_cart,
    cpas_purchase,
    cpas_purchase_value
  FROM
    Omni
),

ReachRaw AS (
  SELECT DISTINCT
    date,
    campaign_id,
    campaign_name,
    weekly_reach
  FROM
    Reach
)

-- Perform the left joins
SELECT DISTINCT
  Main.*,
  OA.cpas_view_content,
  OA.cpas_add_to_cart,
  OA.cpas_purchase,
  OA.cpas_purchase_value,
  RA.weekly_reach
FROM
  Main
LEFT JOIN
  OmniRaw AS OA
ON
  Main.date = OA.date AND
  Main.campaign_id = OA.campaign_id AND
  Main.adset_id = OA.adset_id AND
  Main.ad_id = OA.ad_id
LEFT JOIN
  ReachRaw AS RA
ON
  Main.date = RA.date AND
  Main.campaign_id = RA.campaign_id AND
  Main.campaign_name = RA.campaign_name;
"""

# Run the join on the DuckDB connection and convert the result to a Polars DataFrame
joined_data = conn.sql(perform_join_tables_query).pl()

In [None]:
# Display the joined DataFrame to inspect the result of the join
joined_data

date,campaign_id,campaign_name,adset_id,adset_name,ad_id,ad_name,objective,adset_optimization_goal,publisher_platform,platform_position,impressions,clicks,video_view,spend,cpas_view_content,cpas_add_to_cart,cpas_purchase,cpas_purchase_value,weekly_reach
date,str,str,str,str,str,str,str,str,str,str,f32,f32,f32,f32,f32,f32,f32,f32,f32
2021-07-19,"""ZIKL52506461217209""","""Campaign 1""","""FYBK58684545832072""","""Adset 3""","""ZEYD59069770078328""","""Ad 1""","""PRODUCT_CATALOG_SALES""",,"""facebook""","""feed""",429.0,10.0,,4770.0,30.0,2.0,2.0,197790.0,337856.0
2021-07-19,"""ZIKL52506461217209""","""Campaign 1""","""FYBK58684545832072""","""Adset 3""","""ZEYD59069770078328""","""Ad 1""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""instagram_explore""",9.0,0.0,,57.0,30.0,2.0,2.0,197790.0,337856.0
2021-07-19,"""ZIKL52506461217209""","""Campaign 1""","""FYBK58684545832072""","""Adset 3""","""ZEYD59069770078328""","""Ad 1""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""instagram_stories""",94.0,4.0,,1389.0,30.0,2.0,2.0,197790.0,337856.0
2021-07-19,"""ZIKL52506461217209""","""Campaign 1""","""CKLV63235280102580""","""Adset 4""","""YPJV82477698871091""","""Ad 4""","""PRODUCT_CATALOG_SALES""",,"""facebook""","""feed""",9838.0,298.0,,60012.0,91.0,15.0,3.0,382200.0,337856.0
2021-07-26,"""ZIKL52506461217209""","""Campaign 1""","""FYBK58684545832072""","""Adset 3""","""ZEYD59069770078328""","""Ad 1""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""instagram_stories""",28.0,1.0,,550.0,9.0,2.0,,,178718.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2021-04-14,"""YCQJ24855263004453""","""Campaign 8""","""GCZB40399311306933""","""Adset 42""","""TBXM33017164372430""","""Ad 90""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""instagram_explore""",3.0,0.0,,38.0,,,,,
2021-04-27,"""YCQJ24855263004453""","""Campaign 8""","""FUQD10328858240955""","""Adset 43""","""JPLK42063488061912""","""Ad 87""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""feed""",163.0,0.0,,2067.0,2.0,,,,
2021-04-27,"""YCQJ24855263004453""","""Campaign 8""","""FUQD10328858240955""","""Adset 43""","""JPLK42063488061912""","""Ad 87""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""instagram_explore""",98.0,0.0,,501.0,2.0,,,,
2021-04-27,"""YCQJ24855263004453""","""Campaign 8""","""FUQD10328858240955""","""Adset 43""","""JPLK42063488061912""","""Ad 87""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""instagram_stories""",137.0,0.0,,2873.0,2.0,,,,


# Perform the Data Transformation

**Basic Data Inspection**

In [None]:
# Preview the leading rows of the joined frame
print(joined_data.head())

# Report the frame's dimensions (shape is a (rows, columns) pair)
row_count, column_count = joined_data.shape
print(f"Rows: {row_count}, Columns: {column_count}")

# List the data type of every column, in column order
print(joined_data.dtypes)

shape: (5, 20)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ date      ┆ campaign_ ┆ campaign_ ┆ adset_id  ┆ … ┆ cpas_add_ ┆ cpas_purc ┆ cpas_purc ┆ weekly_r │
│ ---       ┆ id        ┆ name      ┆ ---       ┆   ┆ to_cart   ┆ hase      ┆ hase_valu ┆ each     │
│ date      ┆ ---       ┆ ---       ┆ str       ┆   ┆ ---       ┆ ---       ┆ e         ┆ ---      │
│           ┆ str       ┆ str       ┆           ┆   ┆ f32       ┆ f32       ┆ ---       ┆ f32      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆ f32       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 2021-07-1 ┆ ZIKL52506 ┆ Campaign  ┆ FYBK58684 ┆ … ┆ 2.0       ┆ 2.0       ┆ 197790.0  ┆ 337856.0 │
│ 9         ┆ 461217209 ┆ 1         ┆ 545832072 ┆   ┆           ┆           ┆           ┆          │
│ 2021-07-1 ┆ ZIKL52506 ┆ Campaign  ┆ FYBK58684 ┆ … ┆ 2.0       ┆ 2.0       

**Summary Statistics**

In [None]:
# Summary statistics (count, null_count, mean, std, min, quartiles, max) for
# every column of the joined frame
summary = joined_data.describe()

# Hand the DataFrame to display() and let IPython pick its rich representation,
# instead of calling the `_repr_html_` hook manually and wrapping it in HTML.
display(summary)

statistic,date,campaign_id,campaign_name,adset_id,adset_name,ad_id,ad_name,objective,adset_optimization_goal,publisher_platform,platform_position,impressions,clicks,video_view,spend,cpas_view_content,cpas_add_to_cart,cpas_purchase,cpas_purchase_value,weekly_reach
str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""8994""","""8994""","""8994""","""8994""","""8994""","""8994""","""8994""","""8994""","""0""","""8994""","""8994""",8980.0,8980.0,800.0,8994.0,7489.0,4003.0,2152.0,2152.0,1256.0
"""null_count""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""8994""","""0""","""0""",14.0,14.0,8194.0,0.0,1505.0,4991.0,6842.0,6842.0,7738.0
"""mean""","""2021-08-01 13:33:49.352000""",,,,,,,,,,,1766.947998,14.208909,20.626249,15906.790039,25.775537,5.170872,2.238383,374773.78125,234227.046875
"""std""",,,,,,,,,,,,7873.345215,51.811802,51.162472,55238.222656,46.507904,8.454617,2.742024,605805.75,123747.546875
"""min""","""2021-04-01""","""ALSE30456130108025""","""Campaign 1""","""AXBA07175406606962""","""Adset 1""","""ABIC80379539086373""","""Ad 1""","""PRODUCT_CATALOG_SALES""",,"""audience_network""","""an_classic""",0.0,0.0,1.0,0.0,1.0,1.0,1.0,500.0,68445.0
"""25%""","""2021-06-12""",,,,,,,,,,,7.0,0.0,2.0,102.0,4.0,1.0,1.0,112200.0,136544.0
"""50%""","""2021-08-01""",,,,,,,,,,,61.0,0.0,5.0,803.0,11.0,2.0,1.0,192876.0,207736.0
"""75%""","""2021-09-11""",,,,,,,,,,,480.0,4.0,18.0,7033.620117,27.0,5.0,2.0,397705.0,337856.0
"""max""","""2022-01-03""","""ZIKL52506461217209""","""Campaign 8""","""ZULD17256236552135""","""Adset 9""","""ZZTZ78780854405458""","""Ad 92""","""PRODUCT_CATALOG_SALES""",,"""unknown""","""video_feeds""",163740.0,685.0,524.0,1069400.0,455.0,87.0,27.0,7882248.0,553008.0


**Check for Missing Values**

In [None]:
# Count of null values in every column of the joined frame
null_counts = joined_data.null_count()

# Hand the DataFrame to display() and let IPython pick its rich representation,
# instead of calling the `_repr_html_` hook manually and wrapping it in HTML.
display(null_counts)

date,campaign_id,campaign_name,adset_id,adset_name,ad_id,ad_name,objective,adset_optimization_goal,publisher_platform,platform_position,impressions,clicks,video_view,spend,cpas_view_content,cpas_add_to_cart,cpas_purchase,cpas_purchase_value,weekly_reach
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,8994,0,0,14,14,8194,0,1505,4991,6842,6842,7738


In [None]:
# Columns whose null counts are reported here.
# NOTE(review): `impressions` and `clicks` also show nulls in the full
# null-count output above but are not part of this list — confirm that is
# deliberate.
null_check_columns = [
    'adset_optimization_goal',
    'video_view',
    'cpas_view_content',
    'cpas_add_to_cart',
    'cpas_purchase',
    'cpas_purchase_value',
    'weekly_reach',
]

# One "<column>_nulls" output column per entry, holding that column's null count
null_counts = joined_data.select([
    pl.col(column_name).is_null().sum().alias(f'{column_name}_nulls')
    for column_name in null_check_columns
])

# Hand the DataFrame to display() and let IPython pick its rich representation,
# instead of calling the `_repr_html_` hook manually and wrapping it in HTML.
display(null_counts)

adset_optimization_goal_nulls,video_view_nulls,cpas_view_content_nulls,cpas_add_to_cart_nulls,cpas_purchase_nulls,cpas_purchase_value_nulls,weekly_reach_nulls
u32,u32,u32,u32,u32,u32,u32
8994,8194,1505,4991,6842,6842,7738


**Data Transformation**

In [None]:
# Metric columns whose missing values are replaced with zero.
# NOTE(review): `impressions` and `clicks` are not in this list, so any nulls
# they contain are left as null — confirm that is intended.
zero_filled_columns = [
    'video_view',
    'cpas_view_content',
    'cpas_add_to_cart',
    'cpas_purchase',
    'cpas_purchase_value',
    'weekly_reach',
]

# Produce a new frame (joined_data is left untouched) with the nulls in those
# columns filled with 0; all other columns are carried over unchanged.
transformed_data = joined_data.with_columns(
    pl.col(zero_filled_columns).fill_null(0)
)

In [None]:
# Display the transformed DataFrame to check the zero-filled columns
transformed_data

date,campaign_id,campaign_name,adset_id,adset_name,ad_id,ad_name,objective,adset_optimization_goal,publisher_platform,platform_position,impressions,clicks,video_view,spend,cpas_view_content,cpas_add_to_cart,cpas_purchase,cpas_purchase_value,weekly_reach
date,str,str,str,str,str,str,str,str,str,str,f32,f32,f32,f32,f32,f32,f32,f32,f32
2021-07-19,"""ZIKL52506461217209""","""Campaign 1""","""FYBK58684545832072""","""Adset 3""","""ZEYD59069770078328""","""Ad 1""","""PRODUCT_CATALOG_SALES""",,"""facebook""","""feed""",429.0,10.0,0.0,4770.0,30.0,2.0,2.0,197790.0,337856.0
2021-07-19,"""ZIKL52506461217209""","""Campaign 1""","""FYBK58684545832072""","""Adset 3""","""ZEYD59069770078328""","""Ad 1""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""instagram_explore""",9.0,0.0,0.0,57.0,30.0,2.0,2.0,197790.0,337856.0
2021-07-19,"""ZIKL52506461217209""","""Campaign 1""","""FYBK58684545832072""","""Adset 3""","""ZEYD59069770078328""","""Ad 1""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""instagram_stories""",94.0,4.0,0.0,1389.0,30.0,2.0,2.0,197790.0,337856.0
2021-07-19,"""ZIKL52506461217209""","""Campaign 1""","""CKLV63235280102580""","""Adset 4""","""YPJV82477698871091""","""Ad 4""","""PRODUCT_CATALOG_SALES""",,"""facebook""","""feed""",9838.0,298.0,0.0,60012.0,91.0,15.0,3.0,382200.0,337856.0
2021-07-26,"""ZIKL52506461217209""","""Campaign 1""","""FYBK58684545832072""","""Adset 3""","""ZEYD59069770078328""","""Ad 1""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""instagram_stories""",28.0,1.0,0.0,550.0,9.0,2.0,0.0,0.0,178718.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2021-04-14,"""YCQJ24855263004453""","""Campaign 8""","""GCZB40399311306933""","""Adset 42""","""TBXM33017164372430""","""Ad 90""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""instagram_explore""",3.0,0.0,0.0,38.0,0.0,0.0,0.0,0.0,0.0
2021-04-27,"""YCQJ24855263004453""","""Campaign 8""","""FUQD10328858240955""","""Adset 43""","""JPLK42063488061912""","""Ad 87""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""feed""",163.0,0.0,0.0,2067.0,2.0,0.0,0.0,0.0,0.0
2021-04-27,"""YCQJ24855263004453""","""Campaign 8""","""FUQD10328858240955""","""Adset 43""","""JPLK42063488061912""","""Ad 87""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""instagram_explore""",98.0,0.0,0.0,501.0,2.0,0.0,0.0,0.0,0.0
2021-04-27,"""YCQJ24855263004453""","""Campaign 8""","""FUQD10328858240955""","""Adset 43""","""JPLK42063488061912""","""Ad 87""","""PRODUCT_CATALOG_SALES""",,"""instagram""","""instagram_stories""",137.0,0.0,0.0,2873.0,2.0,0.0,0.0,0.0,0.0


**Save the transformed data**

In [None]:
# Destination of the exported CSV on the mounted Drive
transformed_csv_path = '/content/drive/MyDrive/GroupM Technical Test/data/transformed_data.csv'

# Write the transformed frame to that file, then confirm where it went
transformed_data.write_csv(transformed_csv_path)
print(f"Data has been saved to '{transformed_csv_path}'")

Data has been saved to '/content/drive/MyDrive/GroupM Technical Test/data/transformed_data.csv'
