In [1]:
import pandas as pd
import gensim
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string
from geopy import distance

import sklearn
from sklearn.preprocessing import OneHotEncoder, normalize
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import train_test_split
import sklearn.metrics
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, RocCurveDisplay, f1_score, recall_score, precision_score
from sklearn import preprocessing
import xlsxwriter
from sklearn.model_selection import GridSearchCV, KFold
import xgboost as xgb
import multiprocessing

from joblib import dump, load

In [2]:
hcp_df = pd.read_csv('hcp_segment_37webs.csv')

In [3]:
def derive_topics(text: str) -> set:
    words = gensim.utils.simple_preprocess(text)
    words_prep = [preprocess_string(remove_stopwords(word)) for word in words]
    return set([w[0] for w in words_prep if w])

In [39]:
webinar = {'webinar_name_original': ['Management of Hypothyroidism in Primary Health Care Workshop - Hybrid event'], 'webinar_latitude': [21.3866249], 'webinar_longitude': [51.6560391], 'day_of_year': [341], 'day_of_week': [4], 'webinar_duration': [240], 'topics': ['To discuss in general the primary Hypothyroidism.To discuss the classification & diagnosis of hypothyroidism and its management in primary health care. To discuss the Management of thyroid disease in pregnancy.'], '_segment_key': ['CM&E']}

In [38]:
webinar = {'webinar_name_original': ['Merck FERTI Cases'], 'webinar_latitude': [28.0], 'webinar_longitude': [3.0], 'day_of_year': [341], 'day_of_week': [4], 'webinar_duration': [90], 'topics': ['The objective of this program is to explore with an international expert in fertility, complex clinical cases of Medically Assisted Procreation in a unique interactive format as close as possible to reality.'], '_segment_key': ['Fertility']}

In [5]:
webinar_df = pd.DataFrame.from_dict(webinar)

In [6]:
webinar_df

Unnamed: 0,webinar_name_original,webinar_latitude,webinar_longitude,day_of_year,day_of_week,webinar_duration,topics
0,Management of Hypothyroidism in Primary Health...,21.386625,51.656039,341,4,240,To discuss in general the primary Hypothyroidi...


In [7]:
webinar_df['topics'] = webinar_df['topics'].apply(lambda x: derive_topics(x + webinar['webinar_name_original'][0]))

Bring both DFs together

In [8]:
# list of all hcps
hcps = hcp_df['_account_key'].drop_duplicates()

In [9]:
# duplicate webiar row for each hcp
webinar_df = pd.concat([webinar_df]*len(hcps), ignore_index=True)

In [10]:
webinar_df['_account_key'] = hcps

In [35]:
webinar_df['_segment_key'] = webinar['_segment_key']

In [36]:
train_data = hcp_df.merge(webinar_df, on='_account_key', how='inner')

In [37]:
train_data.shape

(215278, 30)

### Derive needed features

In [248]:
# to make after merge
# distance
# segment_intersection
# main_interests_score
# similar_interests_score

#### Distance

In [14]:
train_data.columns

Index(['_account_key', 'account_country', 'account_latitude',
       'account_longitude', 'account_status', 'account_id', 'label_x',
       'label_y', 'num_webs', 'num_key_messages', 'main_interests',
       'similar_interests', 'Conventionalists', 'Digitally Engaged',
       'Engagers', 'Low Engagers', 'Not Segmented', 'CM&E', 'Fertility',
       'Multi-Franchise', 'Neurology & Immunology', 'Oncology',
       'webinar_name_original', 'webinar_latitude', 'webinar_longitude',
       'day_of_year', 'day_of_week', 'webinar_duration', 'topics',
       '_segment_key'],
      dtype='object')

In [15]:
dist_cache = {}
def make_distance(c1: tuple, c2: tuple) -> float:
    d = dist_cache.get(str(c1 + c2))
    if d is None:
        d = round(float(distance.distance(c1, c2).km), 2)
        dist_cache[str(c1 + c2)] = d
    return d

In [16]:
train_data['distance'] = train_data.apply(lambda row: make_distance((row['account_latitude'], row['account_longitude']), (row['webinar_latitude'], row['webinar_longitude'])), axis=1)

#### segment_intersection

In [17]:
train_data['segment_intersection'] = train_data[webinar['_segment_key']]

#### main_interests_score

In [18]:
def calc_score(v1: set, v2: set) -> int:
    return sum([1 for v in v2 if v in str(v1)])

In [19]:
train_data['main_interests_score'] = [calc_score(i, j) for i, j in zip(train_data['main_interests'], train_data['topics'])]

In [20]:
train_data['similar_interests_score'] = [calc_score(i, j) for i, j in zip(train_data['similar_interests'], train_data['topics'])]

In [21]:
len(train_data.columns)

34

#### Modeling

In [22]:
def scale(x: pd.Series):
    scaler = preprocessing.MinMaxScaler()
    arr = np.array(x).reshape(-1, 1)
    scaler.fit(arr)
    x = scaler.transform(arr)
    return x.reshape(1,-1).tolist()[0]

In [23]:
def fall_out_score(y_test, y_predict):
    confusion = confusion_matrix(y_test, y_predict)
    tn = confusion[0][0]
    fp = confusion[0][1]
    return fp/(fp+tn)

In [24]:
cols_to_drop = [
    '_account_key',
    'account_latitude',
    'account_longitude',
    'webinar_latitude',
    'webinar_longitude',
    'account_country',
    'account_id',
    '_segment_key',
    'main_interests',
    'similar_interests',
    'topics',
    'num_webs',
    'webinar_name_original',
    'account_id',
    'account_country',
    'Fertility', 'CM&E', 'Multi-Franchise', 'Neurology & Immunology', 'Oncology',
]

n_kfolds = 5
max_depth = 20
growth_policy = 'depthwise'
eval_metric = fall_out_score
test_size = 0.3

In [25]:
franchises = ['Fertility', 'CM&E', 'Multi-Franchise', 'Neurology & Immunology', 'Oncology']

train_dfs = {}

for f in franchises:
    train_dfs[f] = train_data[train_data[f] == 1]
train_dfs['whole_df'] = train_data

In [32]:
y_predict = {}
for name, df in train_dfs.items():
    X = df.drop(columns=cols_to_drop)
    X.fillna(0, inplace=True)
    X = X[X['webinar_duration'] != 0]
    
    #scaler = preprocessing.MinMaxScaler()
    #scaler.fit(X)
    #X = scaler.transform(X)
    
    model_path = f'XGB_RF_37Webs_distance_fallout.joblib'
    model = load(model_path)
    y_predict[name] = model.predict(X)

In [33]:
num_predicts = {k: sum(v) for k,v in y_predict.items()}

In [34]:
num_predicts

{'Fertility': 733,
 'CM&E': 1182,
 'Multi-Franchise': 0,
 'Neurology & Immunology': 8,
 'Oncology': 16,
 'whole_df': 1939}

In [179]:
sum(y_predict['CM&E_4'])

0

In [226]:
prediction = train_dfs['Fertility'][[True if i == 1 else False for i in y_predict['Fertility_1']]]

In [227]:
prediction['account_country'].value_counts()

Algeria         381
Saudi Arabia    244
Germany         156
Name: account_country, dtype: int64

In [117]:
train_dfs['Fertility']['account_country'].value_counts()

Germany         31389
Algeria          5739
Saudi Arabia     2436
Name: account_country, dtype: int64

In [228]:
predicted_hcps = prediction['_account_key'].drop_duplicates()

Check

In [229]:
train_data_dtypes = {'_account_key': 'string',
 'account': 'string',
 'account_city': 'string',
 'account_country': 'string',
 'account_country_code': 'string',
 'account_email': 'string',
 'account_inkl_onekey': 'string',
 'account_latitude': 'Float64',
 'account_longitude': 'Float64',
 'account_state': 'Int64',
 'account_status': 'string',
 'account_type': 'string',
 'account_id': 'string',
 'digital_affinity': 'string',
 'digital_segmentation': 'string',
 'gender': 'Int64',
 'hcp_franchise': 'string',
 'hcp_therapeutic_area': 'string',
 'ispersonaccount': 'boolean',
 'onekey_id': 'string',
 'specialty_1': 'string',
 'specialty_2': 'string',
 'veeva_id': 'string',
 'account_brick': 'string',
 'account_territories': 'string',
 'account_territories_per_bu': 'string',
 '_account_key_fdweb': 'string',
 'account_key_original': 'string',
 'account_id_na': 'boolean',
 '_webinar_key': 'Int64',
 '_calendar_key': 'Int64',
 '_product_key': 'Int64',
 '_segment_key': 'string',
 '_country_key': 'string',
 '_channel_key': 'string',
 '_campaign_key': 'string',
 'account_cnt': 'string',
 'webinar_participants_cnt': 'Int64',
 'event_allchannels_cnt': 'string',
 '_fact_allchannels_event': 'Int64',
 '_has_campaign_id': 'Int64',
 '_webinar_has_web_campaign': 'Int64',
 'webinar_cnt': 'Int64',
 '_timeline_key': 'string',
 'attendence_type': 'string',
 '_attended': 'Int64',
 '_fulltimeattended': 'Int64',
 'stream_time': 'Float64',
 'user_rating': 'string',
 'user_benefit_rate': 'Int64',
 'source_system': 'string',
 '_mslteam': 'Int64',
 '_datasource_f_webinars': 'Int64',
 'fact_source_': 'string',
 '_fact_actual': 'Int64',
 '_testaccount': 'Int64',
 '_individualcustomer': 'Int64',
 '_function_key': 'string',
 'fact_function': 'string',
 '_franchiseviewtimeline_key': 'string',
 '_remoteevent': 'Int64',
 '_f2fevent': 'Int64',
 'fact_country': 'string',
 'webinar_fileextension': 'string',
 'webinar_filename': 'string',
 'webinar_duration': 'Int64',
 'webinar_name': 'string',
 'webinar_platform': 'string',
 'account_id_fdweb': 'string',
 'date': 'string',
 'webinar_latitude': 'Float64',
 'webinar_longitude': 'Float64',
 'key_message': 'string'}

In [230]:
train_data_full = pd.read_csv('fd_dacc_km_28-11-2022.csv', low_memory=True, dtype=train_data_dtypes)

In [231]:
predicted_hcps = train_data_full[train_data_full['_account_key'].isin(predicted_hcps)]

In [236]:
predicted_hcps['hcp_franchise'].unique()

<StringArray>
['Fertility']
Length: 1, dtype: string

In [247]:
predicted_hcps.to_csv('Prediction_Merck_FERTI\'Cases', index=False)