# TikTokers (TikTok Hackathon 2023)
## Problem Set 1: Optimise Advertisement Moderation

## Import Libraries

In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

In [10]:
from ad_scorer import compute_risk_scores
from preprocessing import clean_ads
from simulator import simulate

## Preprocessing step

In [11]:

# Both tables come from one workbook: the sheet at index 0 is read as the ads
# table and the sheet at index 1 as the moderator table.
path = "../data/data.xlsx"

# header=1 takes the column names from the second spreadsheet row of the ads
# sheet; the raw frame is handed straight to the project's cleaning step.
ads_table = clean_ads(pd.read_excel(path, header=1, sheet_name=0))
mod_table = pd.read_excel(path, sheet_name=1)

# Summary statistics of the cleaned ads table (displayed as the cell output).
ads_table.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ads['punish_num'] = ads['punish_num'].fillna(mode_by_group)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ads['start_time'] = ads['start_time'].fillna(datetime.datetime(2000, 1, 1, 0))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ads['days_since_last_penalty'] = (
A value is trying to be set on 

Unnamed: 0,ad_id,baseline_st,punish_num,ad_revenue,avg_ad_revenue,revenue_ratio,days_since_last_penalty,days_since_start_time
count,28119.0,28119.0,28119.0,28119.0,28119.0,28119.0,28119.0,28119.0
mean,1772808000000000.0,1.273209,1.009566,78.868235,27.205505,7.225628,8166.264056,7.25641
std,2791198000000.0,0.629597,0.218338,505.321665,173.458473,266.623935,1919.437127,30.750441
min,1714641000000000.0,0.54,1.0,0.0001,0.0001,5e-06,0.0,-1.0
25%,1773293000000000.0,0.54,1.0,1.81,1.25685,0.861865,8619.0,-1.0
50%,1773511000000000.0,1.53,1.0,6.68,3.3912,1.869176,8619.0,0.0
75%,1773558000000000.0,1.78,1.0,21.63325,10.808,3.978838,8619.0,2.0
max,1773616000000000.0,7.59,16.0,27100.62,17144.4986,43645.985507,8619.0,649.0


## Scale features

In [12]:
# Columns of each table that are standardised and passed on to the models.
ad_cols = [
    'baseline_st',
    'punish_num',
    'ad_revenue',
    'avg_ad_revenue',
    'revenue_ratio',
    'days_since_last_penalty',
    'days_since_start_time',
]
mod_cols = ['handling time']  # relevant columns for moderators

# One scaler per table, so the ads and moderator features are standardised
# independently and each fitted scaler stays available for later reuse.
scaler_ads = StandardScaler()
scaler_mods = StandardScaler()

X_ads = scaler_ads.fit_transform(ads_table[ad_cols].values)
X_mods = scaler_mods.fit_transform(mod_table[mod_cols].values)

## Define the Loss Function

In [13]:
def loss(ads: pd.DataFrame, mods: pd.DataFrame):
    """Return the loss for one scored ads/moderators pairing.

    Calls ``simulate(ads, mods)``, which yields a ``(mismatch, utilization)``
    pair, and returns only the mismatch component.

    Args:
        ads: Ads table, passed unchanged to ``simulate``.
        mods: Moderator table, passed unchanged to ``simulate``.

    Returns:
        The first element of the pair returned by ``simulate`` (the mismatch).
    """
    # Utilization is deliberately discarded: per the original authors' note, a
    # composite of both metrics weakened the optimisation, so mismatch alone
    # is used as the loss.
    mismatch, _utilization = simulate(ads, mods)
    return mismatch

## Run the Simulation
We run a Monte Carlo simulation of the advertisement allocation process. Across all simulation runs, we keep the pair of models (one for ads, one for moderators) that achieves the lowest value of our loss function.

In [14]:
# --- Search configuration ---
max_iterations = 50  # number of random restarts; ideally at least 10,000 iterations
n_components = 3  # use if using GaussianMixture instead of BayesianGaussianMixture
random_seed = 42  # base seed; restart i is fitted with random_seed + i

best_loss = float('inf')  # or -float('inf') if you're maximizing
best_gmms = None  # (ad model, moderator model) pair with the lowest loss seen so far
best_scores = None  # (ad scores, moderator scores) that produced best_loss

for i in range(max_iterations):
    # Each restart gets a distinct but fixed seed: the initialisations still
    # differ from one iteration to the next, while a fresh Restart & Run All
    # fits the same sequence of mixtures again.
    gmm_ads = GaussianMixture(n_components=n_components, random_state=random_seed + i)
    gmm_ads.fit(X_ads)

    gmm_mods = GaussianMixture(n_components=n_components, random_state=random_seed + i)
    gmm_mods.fit(X_mods)

    ad_risk_scores = compute_risk_scores(X_ads, gmm_ads)
    mod_scores = compute_risk_scores(X_mods, gmm_mods)

    # The simulation reads this restart's scores from the 'score' columns.
    ads_table['score'] = ad_risk_scores
    mod_table['score'] = mod_scores
    current_loss = loss(ads_table, mod_table)
    print(f"Iteration: {i}    Loss: {current_loss}")

    if current_loss < best_loss:  # Use > if you're maximizing the loss
        best_loss = current_loss
        best_gmms = (gmm_ads, gmm_mods)
        best_scores = (ad_risk_scores, mod_scores)

# The loop leaves the *last* restart's scores in the tables. Restore the
# winning restart's scores so the 'score' columns agree with best_gmms.
# best_scores stays None only if no restart produced a comparable loss
# (e.g. every loss was NaN), in which case the tables are left as they are.
if best_scores is not None:
    ad_risk_scores, mod_scores = best_scores
    ads_table['score'] = ad_risk_scores
    mod_table['score'] = mod_scores

# best_gmms now contains the models with the lowest observed loss
print(f"Best GMM models achieved loss of: {best_loss}")

Iteration: 0    Loss: 0.0020876946155807077
Iteration: 1    Loss: 0.003952888521125185
Iteration: 2    Loss: 0.0034449802642830918
Iteration: 3    Loss: 0.002460364007339565
Iteration: 4    Loss: 0.004485756222468353
Iteration: 5    Loss: 0.0020876946155807094
Iteration: 6    Loss: 0.004196048953070052
Iteration: 7    Loss: 0.003599027052743421
Iteration: 8    Loss: 0.005700346908566392
Iteration: 9    Loss: 0.0026518002175135225
Iteration: 10    Loss: 0.00528011031169566
Iteration: 11    Loss: 0.003977046600137968
Iteration: 12    Loss: 0.0013409645889018201
Iteration: 13    Loss: 0.003645369989557914
Iteration: 14    Loss: 0.0020876946155807077
Iteration: 15    Loss: 0.003649309064282152
Iteration: 16    Loss: 0.0036235127928024003
Iteration: 17    Loss: 0.0036235127928024007
Iteration: 18    Loss: 0.0029053604239132696
Iteration: 19    Loss: 0.0020876946155807094
Iteration: 20    Loss: 0.0026733703045880054
Iteration: 21    Loss: 0.0041030883184501625
Iteration: 22    Loss: 0.004948

In [22]:
# best risk scores for ads
# best_gmms[0] is the ad-side model: the first element of the
# (gmm_ads, gmm_mods) pair kept by the search loop.
# NOTE(review): the saved output shows 1.0 for every displayed entry — confirm
# the ad scores are not saturating before relying on them.
compute_risk_scores(X_ads, best_gmms[0])

array([1., 1., 1., ..., 1., 1., 1.])

In [24]:
# best risk scores for mods
# best_gmms[1] is the moderator-side model: the second element of the
# (gmm_ads, gmm_mods) pair kept by the search loop.
# NOTE(review): every displayed value in the saved output is above 0.998 —
# confirm the scores spread enough to tell moderators apart.
compute_risk_scores(X_mods, best_gmms[1])

array([0.99945387, 0.99950569, 0.99972071, ..., 0.99976193, 0.99973558,
       0.99865813])