In [1]:
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()
import pickle as pk
import numpy as np
from pathlib import Path
base_dir = os.getenv('BASEDIR')

from tqdm import tqdm
tqdm.pandas()

# from lightgbm import LGBMClassifier, early_stopping
from flaml.default import LGBMClassifier
from itertools import chain, combinations

In [2]:
import pandas as pd
import os 
import re
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, brier_score_loss, precision_score, recall_score
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek

import seaborn as sns
import matplotlib.pyplot as plt

import os
from collections import defaultdict

In [3]:
lr_groundtruths =['HASHTAG','URL_LR','POLITICIAN_LR','POLITICIAN_1H_LR', 'PARTY_FOLLOWER_LR']

In [4]:
features = ['use', 'ht', 'rt']
# features = ['use']

In [5]:
def all_subsets(ss):
    return chain(*map(lambda x: combinations(ss, x), range(1, len(ss)+1)))

In [6]:
performance_validation = {score_type : {gt: {} for gt in lr_groundtruths} for score_type in ['f1', 'roc_auc', 'overlap_f1', 'calibrated_f1', 'brier', 'calibrated_precision', 'calibrated_recall']}

In [49]:
dataset='qanda'
granularity = '_per_user'
for feature_set in tqdm(all_subsets(features)):

    #Check if the features exist for this dataset + granularity + feature combination
    feature_paths = [os.path.join(base_dir,'data','03_processed',dataset,'features', dataset+'_'+feature+'_'+granularity+'.pk') for feature in feature_set]
    if not all([os.path.exists(fp) for fp in feature_paths]):
        continue
    #Check if the ground truth exists
    for gt in lr_groundtruths:
        # print(dataset,granularity,feature_set,gt)
        gt_path = os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_'+gt+'_'+granularity+'.pk')
        if not os.path.exists(gt_path):
            print(gt_path, ": Does not exist")
            continue
            
        manual_val_path = os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_'+'MANUAL_VALIDATION_LR'+'_'+granularity+'.pk')
        if not os.path.exists(manual_val_path):
            print(manual_val_path, ": Does not exist")
            continue

        def load_vector(fp):
            with open(fp, 'rb') as rf:
                vec = pk.load(rf)
            return vec
        
        X_orig = np.asarray(np.hstack([load_vector(fp) for fp in feature_paths]))
        
        # The manual validation set has no 0 class items
        y_orig = load_vector(gt_path).values
        mask_gt = y_orig > 0
        
        y_manual_val= load_vector(manual_val_path).values
        mask_manual_val = y_manual_val > 0
        
        mask_train = mask_gt & ~mask_manual_val
        mask_test = mask_manual_val
        
        mask = mask_gt & mask_manual_val 
        # print("Training Size:", sum(mask_train))
        # print("Overlap:", sum(mask))
        # print("Overlap F1:", f1_score(y_manual_val[mask] -1, y_orig[mask] -1))
        performance_validation['overlap_f1'][gt][feature_set] = f1_score(y_manual_val[mask] -1, y_orig[mask] -1)
        assert any(mask_train) and any(mask_test)
        
        ## Rebalancing the dataset
#         y_imbalanced = y_orig[mask_train] -1
#         X_imbalanced = X_orig[mask_train]
        
#         # balancer = SMOTE()
#         # balancer = ADASYN()
#         # balancer = SMOTEENN(random_state=123)
#         balancer = SMOTETomek(random_state=123)
#         X_train, y_train = balancer.fit_resample(X_imbalanced, y_imbalanced)
        
        # Standard split (effectively no validation)
        y_train = y_orig[mask_train] -1
        X_train = X_orig[mask_train]
        
        y_test = y_manual_val[mask_test] -1
        X_test = X_orig[mask_test]
        
        # y_val = y_test
        # X_val = X_test
        
        #Using test set validation
        X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=123)
        
        #Using train set validation
        # X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)
        
        # et = LGBMClassifier(is_unbalance=True,feature_fraction=0.8, n_jobs=-1, verbose=-1, seed=123)
        et = LGBMClassifier(is_unbalance=True,n_estimators=200, n_jobs=-1, verbose=-1, seed=41)
        # et = MLPClassifier(random_state=123,  max_iter=1000)
        # et = LGBMClassifier(n_jobs=-1, verbose=-1, seed=123)
        # hyperparams, estimator_name, X_transformed, y_transformed = et.suggest_hyperparams(X_train, y_train)
        # print(feature_set)
        # print(hyperparams)
        # Using standard splot
        et.fit(X_train, y_train)
        
        #Using explicit validation set
        # et.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[early_stopping(5)])
        preds = et.predict(X_test)
        f1 = f1_score(y_test,preds, average="macro")
        
        probs = et.predict_proba(X_test)
        
        #find a better threhold with the val set
        probs_val = et.predict_proba(X_val)
#         print("Example f1",f1_score(y_test,probs[:,1] > 0.1))
        
        precision, recall, thresholds = precision_recall_curve(y_val, probs_val[:,1])
        # f1s = 2 * (precision * recall) / (precision + recall)
        # print(f1s)
        f1s = [f1_score(y_val, probs_val[:,1] > t, average='macro') for t in thresholds]
        precisions = [precision_score(y_val, probs_val[:,1] > t, average='macro', zero_division=0) for t in thresholds]
        recalls = [recall_score(y_val, probs_val[:,1] > t, average='macro') for t in thresholds]
        # print(thresholds)
        # print(f1s)
        # break
        best_threshold_index = np.argmax(f1s)
        performance_validation['calibrated_precision'][gt][feature_set] = precisions[best_threshold_index]
        performance_validation['calibrated_recall'][gt][feature_set] = recalls[best_threshold_index]

        # best_threshold = thresholds[best_threshold_index]

#         fig = plt.figure()
#         sns.scatterplot(x=thresholds, y=f1s[:-1])
#         fig.show()
        
        performance_validation['calibrated_f1'][gt][feature_set] = np.nanmax(f1s)
        
        roc_auc = roc_auc_score(y_test,probs[:,1])
        brier = brier_score_loss(y_test,probs[:,1])
        
        performance_validation['f1'][gt][feature_set] = f1
        performance_validation['roc_auc'][gt][feature_set] = roc_auc
        performance_validation['brier'][gt][feature_set] = brier

7it [03:56, 33.82s/it]


In [50]:
performance_manual_val_f1 = pd.DataFrame({k : {'+'.join(list(k1)) : v1 for k1,v1 in v.items()} for k,v in performance_validation['f1'].items()})
performance_manual_val_cal_f1 = pd.DataFrame({k : {'+'.join(list(k1)) : v1 for k1,v1 in v.items()} for k,v in performance_validation['calibrated_f1'].items()})

performance_manual_val_cal_precision = pd.DataFrame({k : {'+'.join(list(k1)) : v1 for k1,v1 in v.items()} for k,v in performance_validation['calibrated_precision'].items()})
performance_manual_val_cal_recall = pd.DataFrame({k : {'+'.join(list(k1)) : v1 for k1,v1 in v.items()} for k,v in performance_validation['calibrated_recall'].items()})

performance_manual_val_roc_auc = pd.DataFrame({k : {'+'.join(list(k1)) : v1 for k1,v1 in v.items()} for k,v in performance_validation['roc_auc'].items()})
performance_manual_val_brier = pd.DataFrame({k : {'+'.join(list(k1)) : v1 for k1,v1 in v.items()} for k,v in performance_validation['brier'].items()})

In [1]:
performance_manual_val_cal_f1

NameError: name 'performance_manual_val_cal_f1' is not defined

In [51]:
performance_manual_val_cal_precision

Unnamed: 0,HASHTAG,URL_LR,POLITICIAN_LR,POLITICIAN_1H_LR,PARTY_FOLLOWER_LR
use,0.886859,0.903077,0.743333,0.743622,0.916569
ht,0.804359,0.79899,0.776515,0.758718,0.804359
rt,0.78842,0.875253,0.72619,0.731004,0.782621
use+ht,0.973684,0.926715,0.632963,0.916143,0.902985
use+rt,0.909091,0.903077,0.733163,0.751524,0.901404
ht+rt,0.894062,0.919519,0.303371,0.776974,0.897059
use+ht+rt,0.967112,0.914603,0.632963,0.903586,0.902985


In [52]:
performance_manual_val_cal_recall

Unnamed: 0,HASHTAG,URL_LR,POLITICIAN_LR,POLITICIAN_1H_LR,PARTY_FOLLOWER_LR
use,0.819312,0.915873,0.751058,0.752646,0.876455
ht,0.814021,0.813228,0.789683,0.766931,0.814021
rt,0.782011,0.893122,0.670899,0.680159,0.772751
use+ht,0.957143,0.934392,0.639153,0.920106,0.814286
use+rt,0.828571,0.915873,0.741799,0.761905,0.847884
ht+rt,0.833598,0.915079,0.5,0.763492,0.8
use+ht+rt,0.962169,0.925132,0.639153,0.910847,0.814286


In [53]:
performance_manual_val_cal_f1

Unnamed: 0,HASHTAG,URL_LR,POLITICIAN_LR,POLITICIAN_1H_LR,PARTY_FOLLOWER_LR
use,0.835209,0.907484,0.745582,0.729483,0.889441
ht,0.786085,0.795663,0.772959,0.74105,0.786085
rt,0.741442,0.874824,0.596764,0.609649,0.73003
use+ht,0.964098,0.930031,0.626193,0.917994,0.832246
use+rt,0.846552,0.907484,0.735016,0.740393,0.862828
ht+rt,0.849153,0.917165,0.377622,0.718533,0.817623
use+ht+rt,0.964499,0.918721,0.626193,0.906709,0.832246


In [54]:
performance_manual_val_roc_auc

Unnamed: 0,HASHTAG,URL_LR,POLITICIAN_LR,POLITICIAN_1H_LR,PARTY_FOLLOWER_LR
use,0.873626,0.955259,0.699372,0.79147,0.869963
ht,0.876766,0.847985,0.847462,0.81214,0.87572
rt,0.840659,0.889325,0.721873,0.751701,0.8663
use+ht,0.955259,0.943223,0.601779,0.858974,0.869702
use+rt,0.877813,0.959445,0.75641,0.784668,0.844846
ht+rt,0.90293,0.937991,0.5,0.798796,0.914443
use+ht+rt,0.958922,0.928048,0.601779,0.857928,0.882261


In [45]:
performance_manual_val_cal_f1

Unnamed: 0,HASHTAG,URL_LR,POLITICIAN_LR,POLITICIAN_1H_LR,PARTY_FOLLOWER_LR
use,0.835209,0.90582,0.74742,0.741442,0.862828
ht,0.786085,0.795663,0.772959,0.74105,0.786085
rt,0.741442,0.863776,0.596764,0.609649,0.73003
use+ht,0.874294,0.930613,0.647232,0.90582,0.832246
use+rt,0.835209,0.917165,0.756164,0.741442,0.862828
ht+rt,0.849153,0.917165,0.377622,0.718533,0.817623
use+ht+rt,0.95184,0.930031,0.647232,0.896311,0.846552


In [46]:
performance_manual_val_f1

Unnamed: 0,HASHTAG,URL_LR,POLITICIAN_LR,POLITICIAN_1H_LR,PARTY_FOLLOWER_LR
use,0.792654,0.625987,0.357664,0.754223,0.734488
ht,0.817805,0.829347,0.357664,0.73779,0.829347
rt,0.550058,0.352941,0.357664,0.357664,0.529097
use+ht,0.357664,0.357664,0.357664,0.357664,0.357664
use+rt,0.675789,0.599686,0.357664,0.766826,0.765136
ht+rt,0.357664,0.357664,0.357664,0.357664,0.357664
use+ht+rt,0.357664,0.357664,0.357664,0.357664,0.357664


In [47]:
print(performance_manual_val_roc_auc[['HASHTAG', 'URL_LR', 'POLITICIAN_1H_LR', 'PARTY_FOLLOWER_LR']].to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &   HASHTAG &    URL\_LR &  POLITICIAN\_1H\_LR &  PARTY\_FOLLOWER\_LR \\
\midrule
use       &  0.877813 &  0.939560 &          0.789116 &           0.849555 \\
ht        &  0.876243 &  0.848509 &          0.812140 &           0.875720 \\
rt        &  0.840136 &  0.879383 &          0.751701 &           0.844061 \\
use+ht    &  0.923339 &  0.935897 &          0.868394 &           0.868132 \\
use+rt    &  0.875720 &  0.952643 &          0.790424 &           0.844846 \\
ht+rt     &  0.904500 &  0.937991 &          0.798796 &           0.914966 \\
use+ht+rt &  0.955782 &  0.928571 &          0.865777 &           0.869702 \\
\bottomrule
\end{tabular}



In [13]:
performance_manual_val_f1.to_csv(os.path.join(base_dir, 'data', '04_results', 'ablation_qanda_manual_val_f1.csv'))
performance_manual_val_cal_f1.to_csv(os.path.join(base_dir, 'data', '04_results', 'ablation_qanda_manual_val_cal_f1.csv'))
performance_manual_val_roc_auc.to_csv(os.path.join(base_dir, 'data', '04_results', 'ablation_qanda_manual_val_auc.csv'))

In [31]:
performance_manual_val_cal_f1

Unnamed: 0,HASHTAG,URL_LR,POLITICIAN_LR,POLITICIAN_1H_LR,PARTY_FOLLOWER_LR
use,0.835209,0.906709,0.74742,0.740393,0.889441
ht,0.786085,0.795663,0.772959,0.75127,0.786085
rt,0.741442,0.863776,0.596764,0.609649,0.73003
use+ht,0.964098,0.917994,0.613423,0.895499,0.832246
use+rt,0.835209,0.906709,0.74742,0.74105,0.776349
ht+rt,0.849153,0.908153,0.377622,0.718533,0.817623
use+ht+rt,0.95184,0.918721,0.613423,0.906709,0.846552


In [32]:
performance_manual_val_f1

Unnamed: 0,HASHTAG,URL_LR,POLITICIAN_LR,POLITICIAN_1H_LR,PARTY_FOLLOWER_LR
use,0.686337,0.625987,0.357664,0.743508,0.843352
ht,0.817805,0.829347,0.357664,0.73779,0.829347
rt,0.550058,0.352941,0.357664,0.357664,0.529097
use+ht,0.357664,0.357664,0.357664,0.357664,0.357664
use+rt,0.675789,0.617866,0.357664,0.754223,0.684001
ht+rt,0.357664,0.357664,0.357664,0.357664,0.357664
use+ht+rt,0.357664,0.357664,0.357664,0.357664,0.357664


In [33]:
performance_manual_val_roc_auc

Unnamed: 0,HASHTAG,URL_LR,POLITICIAN_LR,POLITICIAN_1H_LR,PARTY_FOLLOWER_LR
use,0.880952,0.946363,0.836473,0.788069,0.86787
ht,0.87258,0.848509,0.847462,0.81214,0.87572
rt,0.840136,0.879383,0.721873,0.751701,0.844061
use+ht,0.94898,0.939037,0.58922,0.869963,0.878598
use+rt,0.879906,0.952643,0.779435,0.784929,0.820774
ht+rt,0.903977,0.937467,0.5,0.798796,0.914443
use+ht+rt,0.949503,0.929095,0.58922,0.853741,0.874935


In [34]:
performance_manual_val_brier

Unnamed: 0,HASHTAG,URL_LR,POLITICIAN_LR,POLITICIAN_1H_LR,PARTY_FOLLOWER_LR
use,0.142972,0.182491,0.250984,0.18593,0.199056
ht,0.139793,0.202232,0.249526,0.163333,0.168314
rt,0.167572,0.211239,0.263586,0.197026,0.187466
use+ht,0.26982,0.329024,0.255441,0.251424,0.24463
use+rt,0.142088,0.184401,0.250448,0.188878,0.198779
ht+rt,0.257017,0.268994,0.264417,0.257647,0.239618
use+ht+rt,0.266952,0.328454,0.255441,0.258185,0.245353


In [1]:
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()
import pickle as pk
import numpy as np
from pathlib import Path
base_dir = os.getenv('BASEDIR')

from tqdm import tqdm
tqdm.pandas()

# from lightgbm import LGBMClassifier, early_stopping
from flaml.default import LGBMClassifier
from itertools import chain, combinations

In [8]:
import pandas as pd
import os 
import re
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, brier_score_loss,  precision_score, recall_score
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek

import seaborn as sns
import matplotlib.pyplot as plt

import os
from collections import defaultdict

In [9]:
fr_groundtruths =['USER_FR', 'URLa_FR', 'URLb_FR']

In [10]:
features = ['use', 'ht', 'rt']
# features = ['use']

In [11]:
def all_subsets(ss):
    return chain(*map(lambda x: combinations(ss, x), range(1, len(ss)+1)))

In [12]:
performance_validation = {score_type : {gt: {} for gt in fr_groundtruths} for score_type in ['f1', 'roc_auc', 'overlap_f1', 'calibrated_f1', 'brier','calibrated_precision', 'calibrated_recall']}

In [13]:
dataset='qanda'
granularity = '_per_user'
for feature_set in tqdm(all_subsets(features)):
    print(feature_set)
    #Check if the features exist for this dataset + granularity + feature combination
    feature_paths = [os.path.join(base_dir,'data','03_processed',dataset,'features', dataset+'_'+feature+'_'+granularity+'.pk') for feature in feature_set]
    if not all([os.path.exists(fp) for fp in feature_paths]):
        continue
    #Check if the ground truth exists
    for gt in fr_groundtruths:
        print(dataset,granularity,feature_set,gt)
        gt_path = os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_'+gt+'_'+granularity+'.pk')
        if not os.path.exists(gt_path):
            print(gt_path, ": Does not exist")
            continue
            
        manual_val_path = os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_'+'USER_FR'+'_'+granularity+'.pk')
        if not os.path.exists(manual_val_path):
            print(manual_val_path, ": Does not exist")
            continue

        def load_vector(fp):
            with open(fp, 'rb') as rf:
                vec = pk.load(rf)
            return vec
        
        X_orig = np.asarray(np.hstack([load_vector(fp) for fp in feature_paths]))
        
        # The manual validation set has no 0 class items
        y_orig = load_vector(gt_path).values
        # print(y_orig)
        mask_gt = y_orig >= 0
        
        y_manual_val= load_vector(manual_val_path).values
        # print(y_manual_val)
        mask_manual_val = y_manual_val >= 0
        
        mask_train = mask_gt 
        # & ~mask_manual_val
        mask_test = mask_manual_val
        
        # mask = mask_gt & mask_manual_val 
        mask = mask_train | mask_test
        # print("Training Size:", sum(mask_train))
        # print("Overlap:", sum(mask))
        # print("Overlap F1:", f1_score(y_manual_val[mask] -1, y_orig[mask] -1))
        # performance_validation['overlap_f1'][gt][feature_set] = f1_score(y_manual_val[mask], y_orig[mask])
        # assert any(mask_train) and any(mask_test)
        
        ## Rebalancing the dataset
#         y_imbalanced = y_orig[mask_train] -1
#         X_imbalanced = X_orig[mask_train]
        
#         # balancer = SMOTE()
#         # balancer = ADASYN()
#         # balancer = SMOTEENN(random_state=123)
#         balancer = SMOTETomek(random_state=123)
#         X_train, y_train = balancer.fit_resample(X_imbalanced, y_imbalanced)
        
        # Standard split (effectively no validation)
        y_train = y_orig[mask_train]
        X_train = X_orig[mask_train]
        
        y_test = y_manual_val[mask_test]
        X_test = X_orig[mask_test]
        
        # y_val = y_test
        # X_val = X_test
        
        #Using test set validation
        X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=123)
        
        #Using train set validation
        # X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)
        
        # et = LGBMClassifier(is_unbalance=True,feature_fraction=0.8, n_jobs=-1, verbose=-1, seed=123)
        et = LGBMClassifier(is_unbalance=True, n_jobs=-1, verbose=-1, seed=123)
        # et = LGBMClassifier(is_unbalance=True,n_estimators=200, n_jobs=-1, verbose=-1, seed=123)
        # et = MLPClassifier(random_state=123,  max_iter=1000)
        # et = LGBMClassifier(n_jobs=-1, verbose=-1, seed=123)
        # hyperparams, estimator_name, X_transformed, y_transformed = et.suggest_hyperparams(X_train, y_train)
        # print(feature_set)
        # print(hyperparams)
        # Using standard splot
        print("test a")
        et.fit(X_train, y_train)
        print("test b")
        #Using explicit validation set
        # et.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[early_stopping(5)])
        preds = et.predict(X_test)
        f1 = f1_score(y_test,preds, average="macro")
        print("test c")
        probs = et.predict_proba(X_test)
        print("test d")
        #find a better threhold with the val set
        probs_val = et.predict_proba(X_val)
#         print("Example f1",f1_score(y_test,probs[:,1] > 0.1))
        
        precision, recall, thresholds = precision_recall_curve(y_val, probs_val[:,1])
        # f1s = 2 * (precision * recall) / (precision + recall)
        # print(f1s)
        f1s = [f1_score(y_val, probs_val[:,1] > t, average='macro') for t in thresholds]
        precisions = [precision_score(y_val, probs_val[:,1] > t, average='macro', zero_division=0) for t in thresholds]
        recalls = [recall_score(y_val, probs_val[:,1] > t, average='macro') for t in thresholds]
        # print(thresholds)
        # print(f1s)
        # break
        best_threshold_index = np.argmax(f1s)
        performance_validation['calibrated_precision'][gt][feature_set] = precisions[best_threshold_index]
        performance_validation['calibrated_recall'][gt][feature_set] = recalls[best_threshold_index]

#         fig = plt.figure()
#         sns.scatterplot(x=thresholds, y=f1s[:-1])
#         fig.show()
        print("test f")
        # performance_validation['calibrated_f1'][gt][feature_set] = np.nanmax(f1s)
        
        roc_auc = roc_auc_score(y_test,probs[:,1])
        # brier = brier_score_loss(y_test,probs[:,1])
        
        # performance_validation['f1'][gt][feature_set] = f1
        performance_validation['roc_auc'][gt][feature_set] = roc_auc
        # performance_validation['brier'][gt][feature_set] = brier

0it [00:00, ?it/s]

('use',)
qanda _per_user ('use',) USER_FR
test a
test b
test c
test d
test f
qanda _per_user ('use',) URLa_FR
test a
test b
test c
test d
test f
qanda _per_user ('use',) URLb_FR
test a
test b
test c
test d


1it [1:10:14, 4214.42s/it]

test f
('ht',)
qanda _per_user ('ht',) USER_FR
test a
test b
test c
test d
test f
qanda _per_user ('ht',) URLa_FR
test a
test b
test c
test d
test f
qanda _per_user ('ht',) URLb_FR
test a
test b
test c
test d


2it [1:28:11, 2368.81s/it]

test f
('rt',)
qanda _per_user ('rt',) USER_FR
test a
test b
test c
test d
test f
qanda _per_user ('rt',) URLa_FR
test a
test b
test c
test d
test f
qanda _per_user ('rt',) URLb_FR
test a
test b
test c
test d


3it [1:38:30, 1569.96s/it]

test f
('use', 'ht')
qanda _per_user ('use', 'ht') USER_FR
test a
test b
test c
test d
test f
qanda _per_user ('use', 'ht') URLa_FR
test a
test b
test c
test d
test f
qanda _per_user ('use', 'ht') URLb_FR
test a
test b
test c
test d


4it [2:53:48, 2733.72s/it]

test f
('use', 'rt')
qanda _per_user ('use', 'rt') USER_FR
test a
test b
test c
test d
test f
qanda _per_user ('use', 'rt') URLa_FR
test a
test b
test c
test d
test f
qanda _per_user ('use', 'rt') URLb_FR
test a
test b
test c
test d


5it [4:07:39, 3345.93s/it]

test f
('ht', 'rt')
qanda _per_user ('ht', 'rt') USER_FR
test a
test b
test c
test d
test f
qanda _per_user ('ht', 'rt') URLa_FR
test a
test b
test c
test d
test f
qanda _per_user ('ht', 'rt') URLb_FR
test a
test b
test c
test d


6it [4:28:47, 2639.25s/it]

test f
('use', 'ht', 'rt')
qanda _per_user ('use', 'ht', 'rt') USER_FR
test a
test b
test c
test d
test f
qanda _per_user ('use', 'ht', 'rt') URLa_FR
test a
test b
test c
test d
test f
qanda _per_user ('use', 'ht', 'rt') URLb_FR
test a
test b
test c
test d


7it [5:45:13, 2959.04s/it]

test f





In [15]:
performance_manual_val_f1 = pd.DataFrame({k : {'+'.join(list(k1)) : v1 for k1,v1 in v.items()} for k,v in performance_validation['f1'].items()})
performance_manual_val_cal_f1 = pd.DataFrame({k : {'+'.join(list(k1)) : v1 for k1,v1 in v.items()} for k,v in performance_validation['calibrated_f1'].items()})

performance_manual_val_cal_precision = pd.DataFrame({k : {'+'.join(list(k1)) : v1 for k1,v1 in v.items()} for k,v in performance_validation['calibrated_precision'].items()})
performance_manual_val_cal_recall = pd.DataFrame({k : {'+'.join(list(k1)) : v1 for k1,v1 in v.items()} for k,v in performance_validation['calibrated_recall'].items()})

performance_manual_val_roc_auc = pd.DataFrame({k : {'+'.join(list(k1)) : v1 for k1,v1 in v.items()} for k,v in performance_validation['roc_auc'].items()})
performance_manual_val_brier = pd.DataFrame({k : {'+'.join(list(k1)) : v1 for k1,v1 in v.items()} for k,v in performance_validation['brier'].items()})

In [16]:
performance_manual_val_cal_precision

Unnamed: 0,USER_FR,URLa_FR,URLb_FR
use,0.989568,0.520882,0.525809
ht,0.665994,0.523484,0.51284
rt,0.569028,0.496672,0.542805
use+ht,0.995438,0.526271,0.535102
use+rt,0.99241,0.51881,0.525791
ht+rt,0.929319,0.506209,0.534479
use+ht+rt,0.993968,0.522681,0.532544


In [17]:
performance_manual_val_cal_recall

Unnamed: 0,USER_FR,URLa_FR,URLb_FR
use,0.983897,0.555555,0.581689
ht,0.640972,0.503856,0.522091
rt,0.592362,0.5,0.544911
use+ht,0.985393,0.570191,0.577643
use+rt,0.981001,0.555938,0.579096
ht+rt,0.692215,0.501909,0.510705
use+ht+rt,0.985384,0.590538,0.579758


In [18]:
performance_manual_val_cal_f1

Unnamed: 0,USER_FR,URLa_FR,URLb_FR


In [19]:
performance_manual_val_roc_auc

Unnamed: 0,USER_FR,URLa_FR,URLb_FR
use,0.999817,0.690548,0.773385
ht,0.855458,0.559088,0.632652
rt,0.73033,0.538484,0.668365
use+ht,0.999962,0.714673,0.784933
use+rt,0.99979,0.665831,0.762458
ht+rt,0.888194,0.570337,0.632258
use+ht+rt,0.999962,0.713392,0.784844


In [72]:
performance_manual_val_f1

Unnamed: 0,USER_FR,URLa_FR,URLb_FR


In [73]:
performance_manual_val_cal_f1

Unnamed: 0,USER_FR,URLa_FR,URLb_FR


In [74]:
print(performance_manual_val_roc_auc[['URLa_FR', 'URLb_FR']].to_latex())

\begin{tabular}{lrr}
\toprule
{} &   URLa\_FR &   URLb\_FR \\
\midrule
use       &  0.690548 &  0.773385 \\
ht        &  0.559088 &  0.632652 \\
rt        &  0.538484 &  0.668365 \\
use+ht    &  0.714673 &  0.784933 \\
use+rt    &  0.665831 &  0.762458 \\
ht+rt     &  0.570337 &  0.632258 \\
use+ht+rt &  0.713392 &  0.784844 \\
\bottomrule
\end{tabular}



In [54]:
dataset='qanda'
granularity = '_per_user'
for feature_set in tqdm(all_subsets(features)):
    print(feature_set)
    #Check if the features exist for this dataset + granularity + feature combination
    feature_paths = [os.path.join(base_dir,'data','03_processed',dataset,'features', dataset+'_'+feature+'_'+granularity+'.pk') for feature in feature_set]
    if not all([os.path.exists(fp) for fp in feature_paths]):
        continue
    #Check if the ground truth exists
    for gt in fr_groundtruths:
        print(dataset,granularity,feature_set,gt)
        gt_path = os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_'+gt+'_'+granularity+'.pk')
        if not os.path.exists(gt_path):
            print(gt_path, ": Does not exist")
            continue
            
        manual_val_path = os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_'+'USER_FR'+'_'+granularity+'.pk')
        if not os.path.exists(manual_val_path):
            print(manual_val_path, ": Does not exist")
            continue

        def load_vector(fp):
            with open(fp, 'rb') as rf:
                vec = pk.load(rf)
            return vec
        
        X_orig = np.asarray(np.hstack([load_vector(fp) for fp in feature_paths]))
        
        # The manual validation set has no 0 class items
        y_orig = load_vector(gt_path).values
        mask_gt = y_orig >= 0
        
        y_manual_val= load_vector(manual_val_path).values
        mask_manual_val = y_manual_val >= 0
        
        mask_train = mask_gt 
        mask_test = mask_manual_val
        
        mask = mask_train & mask_test
        assert any(mask_train) and any(mask_test)
        
        # Standard split (effectively no validation)
        y_train = y_orig[mask]
        X_train = X_orig[mask]
        
        y_test = y_manual_val[mask]
        X_test = X_orig[mask]
        
        et = LGBMClassifier(is_unbalance=True, n_jobs=-1, verbose=-1, seed=123)
        skf = StratifiedKFold(n_splits=5, shuffle=True)
        preds = cross_val_predict(et, X_train, y_train, cv=skf, method='predict_proba')
        roc_auc = roc_auc_score(y_test,preds[:,1])
        performance_validation['roc_auc'][gt][feature_set] = roc_auc

0it [00:00, ?it/s]

('use',)
qanda _per_user ('use',) USER_FR
qanda _per_user ('use',) URLa_FR
qanda _per_user ('use',) URLb_FR


1it [01:00, 61.00s/it]

('ht',)
qanda _per_user ('ht',) USER_FR
qanda _per_user ('ht',) URLa_FR
qanda _per_user ('ht',) URLb_FR


2it [02:16, 69.58s/it]

('rt',)
qanda _per_user ('rt',) USER_FR
qanda _per_user ('rt',) URLa_FR
qanda _per_user ('rt',) URLb_FR


3it [02:46, 51.29s/it]

('use', 'ht')
qanda _per_user ('use', 'ht') USER_FR
qanda _per_user ('use', 'ht') URLa_FR
qanda _per_user ('use', 'ht') URLb_FR


4it [04:47, 78.98s/it]

('use', 'rt')
qanda _per_user ('use', 'rt') USER_FR
qanda _per_user ('use', 'rt') URLa_FR
qanda _per_user ('use', 'rt') URLb_FR


5it [06:46, 93.42s/it]

('ht', 'rt')
qanda _per_user ('ht', 'rt') USER_FR
qanda _per_user ('ht', 'rt') URLa_FR
qanda _per_user ('ht', 'rt') URLb_FR


6it [08:32, 97.54s/it]

('use', 'ht', 'rt')
qanda _per_user ('use', 'ht', 'rt') USER_FR
qanda _per_user ('use', 'ht', 'rt') URLa_FR
qanda _per_user ('use', 'ht', 'rt') URLb_FR


7it [11:12, 96.09s/it] 


In [55]:
performance_manual_val_roc_auc = pd.DataFrame({k : {'+'.join(list(k1)) : v1 for k1,v1 in v.items()} for k,v in performance_validation['roc_auc'].items()})

In [56]:
performance_manual_val_roc_auc

Unnamed: 0,USER_FR,URLa_FR,URLb_FR
use,0.623023,0.557731,0.578264
ht,0.612537,0.513966,0.458663
rt,0.677869,0.500317,0.517129
use+ht,0.574144,0.534339,0.598519
use+rt,0.644884,0.551523,0.584612
ht+rt,0.67265,0.573962,0.467205
use+ht+rt,0.613219,0.613351,0.580463
