In [14]:
import pandas as pd
import numpy as np
import random
import warnings
from sklearn.mixture import BayesianGaussianMixture
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")
pd.set_option('expand_frame_repr', False)

In [8]:
path = "C:/Users/chial/Downloads/data.xlsx" 
ads_table = pd.read_excel(path, sheet_name = 0, header = 1)
mod_table = pd.read_excel(path, sheet_name = 1)

In [9]:
mod_table = mod_table[~(mod_table['Productivity'].isna() & mod_table['Utilisation %'].isna() & (mod_table['handling time'] == 0))]
mod_table['accuracy'] = pd.to_numeric(mod_table['accuracy'], errors='coerce')
mod_table = mod_table.dropna(subset=['accuracy'])
mod_table = mod_table.reset_index(drop=True)

Unnamed: 0,moderator,market,Productivity,Utilisation %,handling time,accuracy
0,1686755036370945,"[""RO""]",174.560000,0.704833,25547,0.959
1,1741913197768705,"[""VN""]",334.376000,0.819167,50794,1.000
2,1743415203890193,"[""MX"", ""UY"", ""CL"", ""AR"", ""CO"", ""EC"", ""PE"", ""BR""]",452.277500,0.775474,89626,0.941
3,1710346282427393,"[""PH""]",612.312500,0.843229,98066,0.887
4,1748544247125010,"[""MX"", ""UY"", ""CO"", ""CL"", ""AR"", ""EC"", ""PE"", ""BR""]",292.330000,0.678776,80656,0.739
...,...,...,...,...,...,...
1270,1694281660376066,"[""TR""]",273.432381,0.861444,64710,0.943
1271,1734911746733057,"[""US"", ""CA""]",180.750000,0.685469,79210,0.860
1272,1694737747315714,"[""TH""]",542.360000,0.890927,49664,0.958
1273,1730993346971650,"[""RO""]",320.960000,0.843042,73769,0.967


In [22]:
def compute_moderator_scores(X, gmm):
    responsibilities = gmm.predict_proba(X)
    moderator_scores = np.max(responsibilities, axis=1)
    return moderator_scores

# Your custom loss function
def loss(risk_scores):
    # Implement your loss function here based on the given risk scores
    return sum([risk_scores[i] for i in range(1000)])

scaler = StandardScaler()
X = scaler.fit_transform(mod_table.drop(columns=['moderator', 'market']).values)

best_loss = -float('inf')  # or -float('inf') if you're maximizing
best_gmm = None
tolerance = 1e-5
max_iterations = 10000
previous_loss = best_loss
# n_components = 3

for iteration in range(max_iterations):
    gmm = BayesianGaussianMixture()
    gmm.fit(X)
    
    current_risk_scores = compute_moderator_scores(X, gmm)
    current_loss = loss(current_risk_scores)

    if abs(current_loss - previous_loss) < tolerance:
        break
    
    if current_loss > best_loss:  # Use > if you're maximizing the loss
        best_loss = current_loss
        best_gmm = gmm

    previous_loss = current_loss

# best_gmm now contains the model with the lowest observed loss
final_moderator_scores = compute_moderator_scores(X, best_gmm)
final_moderator_scores

array([1., 1., 1., ..., 1., 1., 1.])

In [23]:
mod_table['Moderator Score'] = final_moderator_scores
mod_table

Unnamed: 0,moderator,market,Productivity,Utilisation %,handling time,accuracy,Moderator Score
0,1686755036370945,"[""RO""]",174.560000,0.704833,25547,0.959,1.0
1,1741913197768705,"[""VN""]",334.376000,0.819167,50794,1.000,1.0
2,1743415203890193,"[""MX"", ""UY"", ""CL"", ""AR"", ""CO"", ""EC"", ""PE"", ""BR""]",452.277500,0.775474,89626,0.941,1.0
3,1710346282427393,"[""PH""]",612.312500,0.843229,98066,0.887,1.0
4,1748544247125010,"[""MX"", ""UY"", ""CO"", ""CL"", ""AR"", ""EC"", ""PE"", ""BR""]",292.330000,0.678776,80656,0.739,1.0
...,...,...,...,...,...,...,...
1270,1694281660376066,"[""TR""]",273.432381,0.861444,64710,0.943,1.0
1271,1734911746733057,"[""US"", ""CA""]",180.750000,0.685469,79210,0.860,1.0
1272,1694737747315714,"[""TH""]",542.360000,0.890927,49664,0.958,1.0
1273,1730993346971650,"[""RO""]",320.960000,0.843042,73769,0.967,1.0


## Functions

In [10]:
# Generate score as in Problem A
# TO BE CHANGED
def fake_score():
    return random.random()

# Get list of Moderators operating in the targeted market 
def check_moderator_market(target_market):
    return mod_table[mod_table['market'].apply(lambda x: target_market in x)]

# Allocate risk score to the job
def tag_job_with_risk (job):
    if job['score']>0.75:
        return 2
    elif job['score']>0.25:
        return 1
    else: return 0

# Allocate moderator accuracy 
def tag_moderator_with_score(moderator_pool):
    high_score_cutoff = moderator_pool['Moderator Score'].quantile(0.75)
    mid_score_cutoff = moderator_pool['Moderator Score'].quantile(0.25)

    moderator_pool['Score Segment'] = np.where(moderator_pool['Moderator Score'] >= high_score_cutoff, 2,
                             np.where(moderator_pool['Moderator Score'] >= mid_score_cutoff, 1, 0))
    
# Shortlist moderators who are accurate enough for the review
def match_moderator_score_to_job(moderator_pool, job):
    moderator_pool.drop(moderator_pool[moderator_pool['Score Segment'] != job['Risk Segment']].index, inplace=True)
    
# Function to return number of jobs given ID of moderator
# TO BE CHANGED
def number_of_jobs(id):
    return 1

# Function to get number of jobs each moderator in the moderator pool has 
def number_of_jobs_for_each_moderator(moderator_pool):
    moderator_pool['Number of Jobs'] = 0
    moderator_pool['Number of Jobs'] = moderator_pool['Number of Jobs'].apply(lambda x: number_of_jobs(moderator_pool['moderator']))

def least_job_policy(moderator_pool):
    least_jobs_value = moderator_pool['Number of Jobs'].min()
    moderator = moderator_pool.loc[moderator_pool['Number of Jobs'] == least_jobs_value, 'moderator'].values[0]
    return moderator

def least_job_hour_policy(moderator_pool):
    moderator_pool['Number of Job Hours'] = 0
    moderator_pool['Number of Job Hours'] = moderator_pool['Number of Jobs']*moderator_pool['handling time']
    least_job_hour_value = moderator_pool['Number of Job Hours'].min()
    moderator = moderator_pool.loc[moderator_pool['Number of Job Hours'] == least_job_hour_value, 'moderator'].values[0]
    print(moderator_pool)
    return moderator

def assign_job_to_moderator(moderator_pool, job):
    job['Risk Segment'] = tag_job_with_risk(job)
    moderator_pool = check_moderator_market(job['delivery_country'])
    tag_moderator_with_score(moderator_pool)
    match_moderator_score_to_job(moderator_pool, job)
    number_of_jobs_for_each_moderator(moderator_pool)
    # return least_job_policy(moderator_pool)
    return least_job_hour_policy(moderator_pool)

In [11]:
# Get a job as a dataframe of 1 row 
ads_table['score'] = 0
ads_table['score'] = ads_table['score'].apply(lambda x: fake_score())
job = ads_table.iloc[0]

In [12]:
moderator_pool = mod_table
assign_job_to_moderator(moderator_pool, job)

             moderator        market  Productivity  Utilisation %  handling time  accuracy  Accuracy Segment  Number of Jobs  Number of Job Hours
50    1672502123346950  ["US", "CA"]    133.226667       0.537611          79835     0.789                 1               1                79835
53    1752289495219201  ["US", "CA"]    383.448000       0.805644          97686     0.911                 1               1                97686
64    1759961674413106        ["US"]    261.977500       0.808214          89468     0.792                 1               1                89468
74    1729842322829314  ["US", "CA"]    172.503333       0.744542          61756     0.832                 1               1                61756
84    1672130496087046  ["US", "CA"]    320.902500       0.646458          79979     0.847                 1               1                79979
...                ...           ...           ...            ...            ...       ...               ...             ...

1598527