# TikTokers (TikTok Hackathon 2023)
## Problem Set 1: Optimise Advertisement Moderation

## Import Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

In [2]:
from ad_scorer import compute_risk_scores
from preprocessing import clean_ads
from simulator import simulate

## Preprocessing step

In [3]:

path = "../data/data.xlsx"
ads_table = pd.read_excel(path, sheet_name = 0, header = 1)
mod_table = pd.read_excel(path, sheet_name = 1)

ads_table = clean_ads(ads_table)

ads_table.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ads['punish_num'] = ads['punish_num'].fillna(mode_by_group)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ads['start_time'] = ads['start_time'].fillna(datetime.datetime(2000, 1, 1, 0))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ads['days_since_last_penalty'] = (
A value is trying to be set on 

Unnamed: 0,ad_id,baseline_st,punish_num,ad_revenue,avg_ad_revenue,revenue_ratio,days_since_last_penalty,days_since_start_time
count,28119.0,28119.0,28119.0,28119.0,28119.0,28119.0,28119.0,28119.0
mean,1772808000000000.0,1.273209,1.009566,78.868235,27.205505,7.225628,8166.264056,7.25641
std,2791198000000.0,0.629597,0.218338,505.321665,173.458473,266.623935,1919.437127,30.750441
min,1714641000000000.0,0.54,1.0,0.0001,0.0001,5e-06,0.0,-1.0
25%,1773293000000000.0,0.54,1.0,1.81,1.25685,0.861865,8619.0,-1.0
50%,1773511000000000.0,1.53,1.0,6.68,3.3912,1.869176,8619.0,0.0
75%,1773558000000000.0,1.78,1.0,21.63325,10.808,3.978838,8619.0,2.0
max,1773616000000000.0,7.59,16.0,27100.62,17144.4986,43645.985507,8619.0,649.0


## Scale features

In [4]:
ad_cols = ['baseline_st', 'punish_num', 'ad_revenue', 'avg_ad_revenue', 'revenue_ratio', 'days_since_last_penalty', 'days_since_start_time']
scaler_ads = StandardScaler()
X_ads = scaler_ads.fit_transform(
    ads_table[ad_cols].values)

mod_cols = ['handling time'] # define relevant columns for moderators
scaler_mods = StandardScaler()
X_mods = scaler_mods.fit_transform(mod_table[mod_cols].values)

## Define the Loss Function

In [5]:
print(ads_table)

                  ad_id delivery_country queue_market  baseline_st  \
0      1747578422390810               US         USCA         1.78   
1      1758543228094480               SA         MENA         1.67   
2      1738303151826990               AU           AU         1.84   
4      1738304016410620               NZ           NZ         1.58   
5      1738304421208090               NZ           NZ         1.58   
...                 ...              ...          ...          ...   
33574  1773614002018300               ID          NaN         0.54   
33576  1773613941479420               ID          NaN         0.54   
33580  1773550979715070               VN          NaN         0.54   
33585  1773599824579580               US          NaN         0.54   
33591  1773551094679610               VN          NaN         0.54   

       punish_num  ad_revenue  avg_ad_revenue  revenue_ratio  \
0             1.0   5132.1700       4795.2500       1.070261   
1             1.0   1976.2500  

In [6]:
print(mod_table)

             moderator                                             market  \
0     1689841547143170  ["SA", "OM", "BH", "QA", "JO", "IQ", "KW", "EG...   
1     1686755036370945                                             ["RO"]   
2     1741913197768705                                             ["VN"]   
3     1743415203890193   ["MX", "UY", "CL", "AR", "CO", "EC", "PE", "BR"]   
4     1710346282427393                                             ["PH"]   
...                ...                                                ...   
1409  1734911746733057                                       ["US", "CA"]   
1410  1694737747315714                                             ["TH"]   
1411  1730993346971650                                             ["RO"]   
1412  1707706649725953                                             ["GB"]   
1413  1772302280493057                                             ["GB"]   

      Productivity  Utilisation %  handling time              accuracy  
0 

In [7]:
def loss(ads: pd.DataFrame, mods: pd.DataFrame):
    # Loss is determined by combination of results obtained from the simulation.
    mismatch, utilization = simulate(ads, mods)
    return mismatch

## Run the Simulation
Monte Carlo simulation of the advertisement allocation process. From the simulations we will take the pair of models that arrived at the best possible result based on our loss function.

In [8]:
max_iterations = 10000
best_loss = float('inf')  # or -float('inf') if you're maximizing
best_gmms = None # will be a tuple of the best models for ads and moderators after running all simulations
tolerance = 1e-5
n_components = 3 # use if using GaussianMixture instead of BayesianGaussianMixture

previous_loss = best_loss

for i in range(max_iterations):
    gmm_ads = GaussianMixture(n_components=n_components)
    gmm_ads.fit(X_ads)

    gmm_mods = GaussianMixture(n_components=n_components)
    gmm_mods.fit(X_mods)

    ad_risk_scores = compute_risk_scores(X_ads, gmm_ads)
    mod_scores = compute_risk_scores(X_mods, gmm_mods)
    
    ads_table['score'] = ad_risk_scores
    mod_table['score'] = mod_scores
    current_loss = loss(ads_table, mod_table) # run simulation based on the dataframe of ads and moderators with the assigned scores
    print(f"Iteration: {i}    Loss: {current_loss}")

    # if abs(current_loss - previous_loss) < tolerance:
    #     break

    if current_loss < best_loss:  # Use > if you're maximizing the loss
        best_loss = current_loss
        best_gmms = (gmm_ads, gmm_mods)

    previous_loss = current_loss

# best_gmms now contains the models with the lowest observed loss
print(f"Best GMM models achieved loss of: {best_loss}")

Iteration: 0    Loss: 0.003599027052743421
Iteration: 1    Loss: 0.003977046600137968
Iteration: 2    Loss: 0.0020876946155807072
Iteration: 3    Loss: 0.0029053604239132696
Iteration: 4    Loss: 0.0026733703045880054
Iteration: 5    Loss: 0.0036235127928024007
Iteration: 6    Loss: 0.0013409645889018201
Iteration: 7    Loss: 0.0036235127928024007
Iteration: 8    Loss: 0.0020876946155807072
Iteration: 9    Loss: 0.005700346908566388
Iteration: 10    Loss: 0.0029053604239132696
