# Spreadsheet columns
- Publisher - org displaying the ad
- Advertiser - brand paying for ad
- Campaign - marketing initiative
- Imps - number of times displayed regardless of views
- Viewable imps - times ad is visible on screen
- Clicks - number of clicks on the ad
- Dsp total cost USD - cost for advertiser to place ads along with platform fees etc
- Dsp media cost USD - cost for advertiser to place ads
- Ssp media cost USD - fee (%) taken by publisher on any revenue generated by ads
    - is this separate from the demand-side platform (DSP) costs?
- Pc convs - post-click conversions (conversions that happen after a click)
- Total convs - conversions
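- Adstxt verified imps - impressions verified against the publisher's ads.txt file (i.e. sold by an authorized seller); note this does not by itself confirm the ad was seen by a real person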

In [173]:
import numpy as np
import pandas as pd
import catboost as cb
import sklearn as sk


In [185]:
# Path of the CSV this notebook both reads from and later writes back to.
dataset = "../data/dataset.csv"

# thousands="," makes pandas parse figures such as "1,234" as numbers
# instead of leaving them as strings.
# NOTE(review): the output below shows an "Unnamed: 0" column, which looks
# like a previously saved index being read back as data - confirm whether
# it should be loaded with index_col=0 or dropped.
df = pd.read_csv(dataset, thousands=",")

# Display the inferred dtype of every column.
df.dtypes

Unnamed: 0,Datetime,Publisher,Advertiser,Campaign,Imps,Viewable Imps,Clicks,Dsp Total Cost USD,Dsp Media Cost USD,Ssp Media Cost USD,...,Day of week,Hour of day,VerifiedImpRate,ConvPerVerImp,SspCostPerConv,SspCostPerVerImp,DspTotalCostPerConv,DspTotalCostPerVerImp,OverallCostPerConv,OverallCostPerVerImp
0,2021-04-18 05:00:00,Undisclosed publisher,Brand B,B7,234,0,0,17.89,15.88,11.45,...,6,5,0.713675,0.0,,0.068563,,0.107126,,0.175689
1,2021-04-18 05:00:00,Ushodaya Enterprises Pvt Ltd,Brand B,B5,27,0,0,1.65,1.47,1.12,...,6,5,0.222222,0.0,,0.186667,,0.275,,0.461667
2,2021-04-18 05:00:00,Network18,Brand B,B7,27,0,0,0.51,0.42,0.31,...,6,5,1.0,0.0,,0.011481,,0.018889,,0.03037
3,2021-04-18 05:00:00,Mail Metro Media,Brand B,B6,17,0,0,0.46,0.41,0.32,...,6,5,0.764706,0.0,,0.024615,,0.035385,,0.06
4,2021-04-18 05:00:00,Undisclosed publisher,Brand B,B5,107,0,1,3.26,2.82,2.05,...,6,5,0.738318,0.0,,0.025949,,0.041266,,0.067215


# Set dtypes

In [175]:
# Parse the timestamp strings into datetime64 values so that the `.dt`
# accessor can be used on the column.
df["Datetime"] = pd.to_datetime(df["Datetime"])

# Display the column dtypes to confirm the conversion took effect.
df.dtypes

# Convert datetime to day of week and hour of day

In [176]:
# Pull the calendar components out of the parsed timestamp column.
# pandas numbers weekdays Monday=0 ... Sunday=6; hours run 0-23.
timestamp_parts = df["Datetime"].dt
day = timestamp_parts.dayofweek
hour = timestamp_parts.hour

# Store both as plain integer feature columns.
df["Day of week"] = day.astype(int)
df["Hour of day"] = hour.astype(int)

# Add verified impression rate

In [177]:
# Fraction of all impressions that were ads.txt-verified.
# Rows with zero impressions produce inf/NaN here rather than raising;
# the infinities are turned into NaN before the frame is saved.
verified_imps = df["Adstxt Verified Imps"].astype(int)
total_imps = df["Imps"].astype(int)
df["VerifiedImpRate"] = verified_imps / total_imps

# Add conversion / verified impression rate

In [178]:
# Conversions obtained per ads.txt-verified impression.
# Rows with zero verified impressions yield inf/NaN instead of an error.
conversions = df["Total Convs"].astype(int)
verified_imps = df["Adstxt Verified Imps"].astype(int)
df["ConvPerVerImp"] = conversions / verified_imps

### Ssp rates

In [179]:
# Ssp media cost, cast once and reused for both ratios below.
ssp_cost = df["Ssp Media Cost USD"].astype(float)

# Ssp media cost per conversion (inf/NaN where there were no conversions).
df["SspCostPerConv"] = ssp_cost / df["Total Convs"].astype(int)

# Ssp media cost per ads.txt-verified impression.
df["SspCostPerVerImp"] = ssp_cost / df["Adstxt Verified Imps"].astype(int)

### Dsp rates

In [180]:
# Dsp total cost, cast once and reused for both ratios below.
dsp_total_cost = df["Dsp Total Cost USD"].astype(float)

# Dsp total cost per conversion (inf/NaN where there were no conversions).
df["DspTotalCostPerConv"] = dsp_total_cost / df["Total Convs"].astype(int)

# Dsp total cost per ads.txt-verified impression.
df["DspTotalCostPerVerImp"] = dsp_total_cost / df["Adstxt Verified Imps"].astype(int)

# Overall rates

In [181]:
# Overall cost per conversion: Dsp total cost and Ssp media cost combined.
# A NaN (or inf) in either component propagates into the sum.
dsp_per_conv = df["DspTotalCostPerConv"].astype(float)
ssp_per_conv = df["SspCostPerConv"].astype(float)
df["OverallCostPerConv"] = dsp_per_conv + ssp_per_conv

# Overall cost per ads.txt-verified impression, combined the same way.
dsp_per_ver_imp = df["DspTotalCostPerVerImp"].astype(float)
ssp_per_ver_imp = df["SspCostPerVerImp"].astype(float)
df["OverallCostPerVerImp"] = dsp_per_ver_imp + ssp_per_ver_imp

# Handle infinite values and save csv

In [182]:
# The ratio columns contain +/-inf wherever a non-zero value was divided by
# zero; normalise those to NaN so they are treated as missing values.
infinities = [np.inf, -np.inf]
df.replace(to_replace=infinities, value=np.nan, inplace=True)

# NOTE(review): this writes to the same path the data was loaded from, so
# the source CSV is overwritten with the derived columns added - confirm
# that is intended. index=False keeps the DataFrame index out of the file.
df.to_csv(dataset, index=False)

In [183]:
# Preview the first five rows, including the newly derived columns.
df.head(n=5)

Unnamed: 0,Datetime,Publisher,Advertiser,Campaign,Imps,Viewable Imps,Clicks,Dsp Total Cost USD,Dsp Media Cost USD,Ssp Media Cost USD,...,Day of week,Hour of day,VerifiedImpRate,ConvPerVerImp,SspCostPerConv,SspCostPerVerImp,DspTotalCostPerConv,DspTotalCostPerVerImp,OverallCostPerConv,OverallCostPerVerImp
0,2021-04-18 05:00:00,Undisclosed publisher,Brand B,B7,234,0,0,17.89,15.88,11.45,...,6,5,0.713675,0.0,,0.068563,,0.107126,,0.175689
1,2021-04-18 05:00:00,Ushodaya Enterprises Pvt Ltd,Brand B,B5,27,0,0,1.65,1.47,1.12,...,6,5,0.222222,0.0,,0.186667,,0.275,,0.461667
2,2021-04-18 05:00:00,Network18,Brand B,B7,27,0,0,0.51,0.42,0.31,...,6,5,1.0,0.0,,0.011481,,0.018889,,0.03037
3,2021-04-18 05:00:00,Mail Metro Media,Brand B,B6,17,0,0,0.46,0.41,0.32,...,6,5,0.764706,0.0,,0.024615,,0.035385,,0.06
4,2021-04-18 05:00:00,Undisclosed publisher,Brand B,B5,107,0,1,3.26,2.82,2.05,...,6,5,0.738318,0.0,,0.025949,,0.041266,,0.067215
