In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, RocCurveDisplay, f1_score, recall_score, precision_score
from sklearn import preprocessing

import time
import gensim
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string
from geopy import distance
from functools import cache

from joblib import dump, load

Functionality for setting up the data

In [98]:
hcp_df = pd.read_csv('hcp_segment_37webs.csv')

In [99]:
def derive_topics(text: str) -> set:
    words = gensim.utils.simple_preprocess(text)
    words_prep = [preprocess_string(remove_stopwords(word)) for word in words]
    return set([w[0] for w in words_prep if w])

@cache
def make_distance(c1: tuple, c2: tuple) -> float:
    d = round(float(distance.distance(c1, c2).km), 2)
    return d

def calc_score(v1: set, v2: set) -> int:
    return sum([1 for v in v2 if v in str(v1)])

def compile_dataset(hcp_df: pd.DataFrame, webinar: dict) -> pd.DataFrame:
    # create webinar_df
    webinar_df = pd.DataFrame.from_dict(webinar)
    webinar_df['topics'] = webinar_df['topics'].apply(lambda x: derive_topics(x + webinar['webinar_name_original'][0]))
    
    hcp_df = hcp_df[hcp_df[webinar['_segment_key'][0]] == 1]

    hcp_df.drop_duplicates(subset=['_account_key'], inplace=True)
    hcps = hcp_df['_account_key']

    webinar_df = pd.concat([webinar_df]*hcps.shape[0], ignore_index=True)
    webinar_df['_account_key'] = hcps
    del hcps

    # merge hcp_df and webinar_df
    result = hcp_df.merge(webinar_df, on='_account_key', how='inner')
    del hcp_df
    del webinar_df

    # Derive needed features
    start = time.time()
    result['distance'] = result.apply(lambda row: make_distance((row['account_latitude'], row['account_longitude']), (row['webinar_latitude'], row['webinar_longitude'])), axis=1)
    end = time.time()
    print(f'Distances: ', round(end - start, 2))
    result['segment_intersection'] = result[webinar['_segment_key'][0]]
    result['main_interests_score'] = [calc_score(i, j) for i, j in zip(result['main_interests'], result['topics'])]
    result['similar_interests_score'] = [calc_score(i, j) for i, j in zip(result['similar_interests'], result['topics'])]

    return result


Functionality for modeling

In [100]:
def scale(x: pd.Series):
    scaler = preprocessing.MinMaxScaler()
    arr = np.array(x).reshape(-1, 1)
    scaler.fit(arr)
    x = scaler.transform(arr)
    return x.reshape(1,-1).tolist()[0]

def fall_out_score(y_test, y_predict):
    confusion = confusion_matrix(y_test, y_predict)
    tn = confusion[0][0]
    fp = confusion[0][1]
    return fp/(fp+tn)

Settings for modeling

In [101]:
n_kfolds = 5
max_depth = 20
growth_policy = 'depthwise'
eval_metric = fall_out_score
test_size = 0.3
model_path = 'XGB_RF_37Webs_distance_fallout.joblib'

Get prediction

In [102]:
def get_prediction(df: pd.DataFrame, model_path: str) -> np.array:
    model = load(model_path)
    feature_names = model.feature_names
    # cols_to_drop = set(df.columns).intersection(set(feature_names))

    X = df[feature_names]
    if X.shape[1] != len(feature_names):
        raise Exception(f'Dimension missmatch: df - {X.shape[1]} columns, model - {len(feature_names)} columns')
    X.fillna(0, inplace=True)
    X = X[X['webinar_duration'] != 0]
    
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    
    
    return model.predict(X)

In [78]:
webinar_oman = {'webinar_name_original': ['Management of Hypothyroidism in Primary Health Care Workshop - Hybrid event'], 'webinar_latitude': [21.3866249], 'webinar_longitude': [51.6560391], 'day_of_year': [341], 'day_of_week': [4], 'webinar_duration': [240], 'topics': ['To discuss in general the primary Hypothyroidism.To discuss the classification & diagnosis of hypothyroidism and its management in primary health care. To discuss the Management of thyroid disease in pregnancy.'], '_segment_key': ['CM&E']}

In [103]:
webinar_algeria = {'webinar_name_original': ['Merck FERTI Cases'], 'webinar_latitude': [28.0], 'webinar_longitude': [3.0], 'day_of_year': [341], 'day_of_week': [4], 'webinar_duration': [90], 'topics': ['The objective of this program is to explore with an international expert in fertility, complex clinical cases of Medically Assisted Procreation in a unique interactive format as close as possible to reality.'], '_segment_key': ['Fertility']}

In [104]:
df = compile_dataset(hcp_df=hcp_df, webinar=webinar_algeria)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hcp_df.drop_duplicates(subset=['_account_key'], inplace=True)


Distances:  1.87


In [105]:
prediction = get_prediction(df, model_path=model_path)
print('Prediction: ', sum(prediction))

Prediction:  958


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)


Check results

In [2]:
train_data_dtypes = {'_account_key': 'string',
 'account': 'string',
 'account_city': 'string',
 'account_country': 'string',
 'account_country_code': 'string',
 'account_email': 'string',
 'account_inkl_onekey': 'string',
 'account_latitude': 'Float64',
 'account_longitude': 'Float64',
 'account_state': 'Int64',
 'account_status': 'string',
 'account_type': 'string',
 'account_id': 'string',
 'digital_affinity': 'string',
 'digital_segmentation': 'string',
 'gender': 'Int64',
 'hcp_franchise': 'string',
 'hcp_therapeutic_area': 'string',
 'ispersonaccount': 'boolean',
 'onekey_id': 'string',
 'specialty_1': 'string',
 'specialty_2': 'string',
 'veeva_id': 'string',
 'account_brick': 'string',
 'account_territories': 'string',
 'account_territories_per_bu': 'string',
 '_account_key_fdweb': 'string',
 'account_key_original': 'string',
 'account_id_na': 'boolean',
 '_webinar_key': 'Int64',
 '_calendar_key': 'Int64',
 '_product_key': 'Int64',
 '_segment_key': 'string',
 '_country_key': 'string',
 '_channel_key': 'string',
 '_campaign_key': 'string',
 'account_cnt': 'string',
 'webinar_participants_cnt': 'Int64',
 'event_allchannels_cnt': 'string',
 '_fact_allchannels_event': 'Int64',
 '_has_campaign_id': 'Int64',
 '_webinar_has_web_campaign': 'Int64',
 'webinar_cnt': 'Int64',
 '_timeline_key': 'string',
 'attendence_type': 'string',
 '_attended': 'Int64',
 '_fulltimeattended': 'Int64',
 'stream_time': 'Float64',
 'user_rating': 'string',
 'user_benefit_rate': 'Int64',
 'source_system': 'string',
 '_mslteam': 'Int64',
 '_datasource_f_webinars': 'Int64',
 'fact_source_': 'string',
 '_fact_actual': 'Int64',
 '_testaccount': 'Int64',
 '_individualcustomer': 'Int64',
 '_function_key': 'string',
 'fact_function': 'string',
 '_franchiseviewtimeline_key': 'string',
 '_remoteevent': 'Int64',
 '_f2fevent': 'Int64',
 'fact_country': 'string',
 'webinar_fileextension': 'string',
 'webinar_filename': 'string',
 'webinar_duration': 'Int64',
 'webinar_name': 'string',
 'webinar_platform': 'string',
 'account_id_fdweb': 'string',
 'date': 'string',
 'webinar_latitude': 'Float64',
 'webinar_longitude': 'Float64',
 'key_message': 'string'}

In [51]:
train_data_full = pd.read_csv('data\d_account_all.csv', low_memory=False)

In [107]:
train_data_full['Account_Id']

0          LATAM_Veeva|001f400000GROU8AAP
1          LATAM_Veeva|001f400000GQtnFAAT
2          LATAM_Veeva|001f400000GQI0QAAX
3          LATAM_Veeva|001f400000GRBS8AAP
4          LATAM_Veeva|001f400000GROWeAAP
                        ...              
3440920    LATAM_Veeva|001f400000GRvKdAAL
3440921    LATAM_Veeva|001f400000GRvTnAAL
3440922    LATAM_Veeva|001f400000GRvMQAA1
3440923    LATAM_Veeva|001f400000GRvKmAAL
3440924    LATAM_Veeva|001f400000GRvJAAA1
Name: Account_Id, Length: 3440925, dtype: object

In [80]:
train_data_full[train_data_full['Account'].str.lower().str.contains('belhout')]

Unnamed: 0,%Account_Key,Account,Account City,Account Country,Account Email,Account Email2,Account External Id,Account inkl OneKey,Account Latitude,Account Longitude,...,Digital Affinity,Digital Segmentation,Gender,HCP Franchise,HCP Therapeutic Area,IsPersonAccount,OneKey Id,Specialty 1,Specialty 2,Veeva_Id
218150,469634,Nassira Belhout,Bayada,Algeria,,,,Nassira Belhout - WDZM00018881,,,...,,Not Segmented,,CM&E,,True,WDZM00018881,General Medicine,,0012o00002zzB2RAAU
284206,588015,Saliha Belhout,El Achour,Algeria,,,,Saliha Belhout - WDZM00056017,,,...,,Not Segmented,,CM&E,,True,WDZM00056017,Endocrinology,Diabetology,0012o00002zzKWeAAM
1291606,3214374,MOHAMED BELHOUT,SALOUEL,France,,0pfozjcry1@gmail.com,,MOHAMED BELHOUT - WFRM00715991,49.876648,2.255836,...,,,,Neurology & Immunology,,True,WFRM00715991,Hospital Pharmacy,,0012o00002Xrer3AAB
1291607,3214374,MOHAMED BELHOUT,SALOUEL,France,,0pfozjcry1@gmail.com,,MOHAMED BELHOUT - WFRM00715991,49.876648,2.255836,...,,,,Neurology & Immunology,,True,WFRM00715991,Hospital Pharmacy,,0012o00002Xrer3AAB
1291608,3214374,MOHAMED BELHOUT,SALOUEL,France,,0pfozjcry1@gmail.com,,MOHAMED BELHOUT - WFRM00715991,49.876648,2.255836,...,,,,Neurology & Immunology,,True,WFRM00715991,Hospital Pharmacy,,0012o00002Xrer3AAB
1291609,3214374,MOHAMED BELHOUT,SALOUEL,France,mohamed.belhout@chu-amiens.fr,0pfozjcry1@gmail.com,,MOHAMED BELHOUT - WFRM00715991,49.876648,2.255836,...,,,,Neurology & Immunology,,True,WFRM00715991,Hospital Pharmacy,,0012o00002Xrer3AAB
1309847,3362593,Mohamed BELHOUT,,France,mohamed.belhout@chu-amiens.fr,,,Mohamed BELHOUT -,,,...,,,,Multi-Franchise,,True,,,,
1771126,4464440,Cabinet Dr Belhout,Bayada,Algeria,,,,Cabinet Dr Belhout - WDZE00002765,,,...,,Not Segmented,,Multi-Franchise,,False,WDZE00002765,,,0012o00002zyfiIAAQ
1893161,4786093,Abir Belhout,Ain Azel,Algeria,,,,Abir Belhout - WDZM00085291,,,...,,Not Segmented,,CM&E,,True,WDZM00085291,General Medicine,,0012o000031IrUwAAK
1948678,4878372,AICHA BELHOUT,,France,,,,AICHA BELHOUT - WFRR02453589,,,...,,,,Multi-Franchise,,True,WFRR02453589,Nurse,,0012o00002a60GdAAI


In [52]:
train_data_full['Account Country'].value_counts()

France          1258638
Germany         1098306
Brazil           921087
Algeria           86753
Saudi Arabia      76141
Name: Account Country, dtype: int64

In [4]:
found = pd.read_csv('Prediction_Merck FERTI_Cases.csv')

In [168]:
df[prediction.astype(bool)]['account_country'].value_counts()

Algeria    946
Germany     12
Name: account_country, dtype: int64

In [None]:
df[prediction.astype(bool)]

In [51]:
hcp_df['account_country'].value_counts()

Algeria         176541
Brazil           37462
Germany          31389
France           10037
Saudi Arabia      4493
Name: account_country, dtype: int64

In [58]:
df[prediction.astype(bool)].drop_duplicates(subset=['_account_key']).to_csv('Prediction_Management_of_Hypothyroidism.csv', index=False)

In [5]:
found = train_data_full[train_data_full['_account_key'].isin(found['_account_key'])].drop_duplicates(subset=['_account_key'])

In [8]:
found[found['_webinar_key'].notnull()]['account_country'].value_counts()

Algeria    652
Name: account_country, dtype: Int64

In [186]:
set(['a', 'b']).intersection(set(['a', 'b', 'c']))

{'a', 'b'}

In [181]:
isinstance(model, sklearn.ensemble.RandomForestClassifier)

False

In [11]:
found[found['account'] == 'Nassima Djerroud']

Unnamed: 0,_account_key,account,account_city,account_country,account_country_code,account_email,account_inkl_onekey,account_latitude,account_longitude,account_state,...,webinar_fileextension,webinar_filename,webinar_duration,webinar_name,webinar_platform,account_id_fdweb,date,webinar_latitude,webinar_longitude,key_message


In [15]:
'Hind' in found['account'].str.lower()

False

In [120]:
with open('participatns.txt') as p:
    participants = p.read()

In [121]:
participants = participants.split('\n')

In [122]:
participants = set(map(str.strip, participants))

In [91]:
found['account'] = found['account'].str.lower()

In [124]:
found['account'] = found['account'].str.lower()
f = 0
for p in participants:
    match = found[found['account'].str.contains(p.lower())]
    print(f'Name: {p}')
    print(found[found['account'].str.contains(p.lower())].shape)
    if match.shape[0] > 0:
        f += 1
print('Sum matches: ', f)

Name: 
(655, 73)
Name: Messali
(1, 73)
Name: Taoutaou
(1, 73)
Name: Taieb chehaima
(0, 73)
Name: Derradji
(1, 73)
Name: Kermad
(0, 73)
Name: Hadjeres
(1, 73)
Name: Ouldhamouda
(0, 73)
Name: Nedjwa
(0, 73)
Name: Beldjbel
(0, 73)
Name: Djerroud
(1, 73)
Name: zerrad chahrazed
(0, 73)
Name: Degaichia
(0, 73)
Name: Menhour
(0, 73)
Name: Bensalem
(1, 73)
Name: Mekhou
(0, 73)
Name: S
(261, 73)
Name: marrak
(0, 73)
Name: Negadi
(0, 73)
Name: chebhi
(1, 73)
Name: Meftah
(1, 73)
Name: radia
(8, 73)
Name: NEDJARI
(1, 73)
Name: Tahir
(1, 73)
Name: Kouar
(0, 73)
Name: Ghozlane
(0, 73)
Name: Selselet Attou
(1, 73)
Name: Marref
(0, 73)
Name: MEKAOUCHE
(0, 73)
Name: Alkhalili
(0, 73)
Name: Sahraoui
(1, 73)
Name: Meliani
(1, 73)
Name: Habes
(0, 73)
Name: Bekhalfa
(0, 73)
Sum matches:  16


In [125]:
found[found['account'].str.contains('ghoul')]

Unnamed: 0,_account_key,account,account_city,account_country,account_country_code,account_email,account_inkl_onekey,account_latitude,account_longitude,account_state,...,webinar_fileextension,webinar_filename,webinar_duration,webinar_name,webinar_platform,account_id_fdweb,date,webinar_latitude,webinar_longitude,key_message
380105,EMEA_Veeva|0012o00002zzMMqAAM,lila belghoula,Ain Tedeles,Algeria,DZ,belghoula123@gmail.com,Lila Belghoula - WDZM00062177,,,,...,xlsx,458757.xlsx,60,l’infertilité masculine,Intrado,,2020-10-02,51.163818,10.447831,
392791,EMEA_Veeva|0012o00002zzOFgAAM,ahlem ghoul,Bordj Bou Arreridj,Algeria,DZ,ahlemgyn@gmail.com,Ahlem Ghoul - WDZM00067993,,,,...,csv,97255591688_2021-11-05-09-45-37.csv,182,"The past, present and future in luteal phase m...",Zoom,,2021-11-05,51.163818,10.447831,EADV_Gonal_Trust Matter DZ_


In [126]:
len(participants)

34

In [50]:
train_data_full['Account Country'].value_counts()['DZ']

8

In [68]:
first = train_data_full['Account'].apply(lambda x: x.lower().split()[0] if isinstance(x, str) else str(x))

In [69]:
sorted(first)

['"die',
 '"die',
 '"mitten',
 '"pflege',
 '"wir',
 '#dedoc°',
 '&',
 "'",
 "''drogaria",
 "''farmacia",
 "''farmacia",
 "'dose",
 "'droga",
 "'nos-nucleo",
 '(coracao',
 '(drogaria',
 '(fias)',
 '(g.e.d.i.i.b.)',
 '(unidade',
 '(usf)',
 '(usf)',
 '(・_・)',
 '*',
 '*',
 '+49',
 '-activ-physiotherapie',
 '-melanie',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '002',
 '01',
 '01',
 '01bou.pharmarket.com',
 '02',
 '02',
 '02',
 '02',
 '02lao.pharmarket.com',
 '02squ.pharmarket.com',
 '03',
 '04',
 '04',
 '05',
 '06',
 '07',
 '07.tou.pharmarket.com',
 '08',
 '09',
 '09',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1-apo',
 '1.',
 '1.',
 '1.',
 '1.',
 '10',
 '10',
 '10',
 '10',
 '10',
 '10',
 '10',
 '10',
 '100',
 '100',
 '100',
 '100',
 '100',
 '100',
 '100',
 '1000',
 '1000',
 '1000fa

In [131]:
import dask.dataframe as dd
import dask

In [129]:
hcp_df = dd.read_csv('hcp_segment_37webs.csv')

In [130]:
type(hcp_df)

dask.dataframe.core.DataFrame

In [134]:
isinstance(hcp_df, dd.DataFrame)

True

In [170]:
first = dask.delayed(hcp_df.assign(a = hcp_df.apply(lambda row: row['account_latitude'] + row['account_longitude'], axis=1, meta=('a', 'float'))))

In [171]:
first.compute().columns

Index(['_account_key', 'account_country', 'account_latitude',
       'account_longitude', 'account_status', 'account_id', 'label_x',
       'label_y', 'num_webs', 'num_key_messages', 'main_interests',
       'similar_interests', 'Conventionalists', 'Digitally Engaged',
       'Engagers', 'Low Engagers', 'Not Segmented', 'CM&E', 'Fertility',
       'Multi-Franchise', 'Neurology & Immunology', 'Oncology', 'a'],
      dtype='object')