In [53]:
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()
import pickle as pk
import numpy as np
from pathlib import Path
# Project root folder, taken from the BASEDIR environment variable
# (None when the variable is not set).
base_dir = os.environ.get('BASEDIR')

from tqdm import tqdm
tqdm.pandas()

from lightgbm import LGBMClassifier
from itertools import chain, combinations

In [54]:
import pandas as pd
import os 
import re
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from sklearn.datasets import make_classification

import os
from collections import defaultdict

In [55]:
# Ground-truth identifiers; each one is interpolated into the name of a
# ground-truth pickle file by the ablation loop.
lr_groundtruths = [
    'HASHTAG',
    'URL_LR',
    'POLITICIAN_LR',
    'POLITICIAN_1H_LR',
    'PARTY_FOLLOWER_LR',
]

# Feature identifiers; every non-empty combination of these is evaluated.
features = [
    'use',
    'ht',
    'rt',
]

In [56]:
def all_subsets(ss):
    """Iterate over every non-empty combination of the items in ``ss``.

    Combinations are produced as tuples, grouped by size from 1 up to
    ``len(ss)``; within one size they follow ``itertools.combinations`` order.

    Args:
        ss: A sized sequence of items (``len(ss)`` must be defined).

    Returns:
        An ``itertools.chain`` iterator over the combination tuples.
    """
    # Build one combinations iterator per subset size up front, then chain them.
    per_size = [combinations(ss, size) for size in range(1, len(ss) + 1)]
    return chain.from_iterable(per_size)

In [57]:
# Result store: score type -> ground truth -> {feature subset: score}.
# The innermost dicts start empty and are filled by the ablation loop.
performance_validation = {
    metric_name: {truth_name: {} for truth_name in lr_groundtruths}
    for metric_name in ['f1', 'roc_auc']
}

In [None]:
dataset = 'qanda'
granularity = '_per_user'


def load_vector(fp):
    """Unpickle and return the object stored in the file at ``fp``.

    NOTE: ``pickle`` can execute arbitrary code while loading, so this must
    only be pointed at files produced by a trusted pipeline.
    """
    with open(fp, 'rb') as rf:
        return pk.load(rf)


# The manual-validation labels come from one file that does not depend on the
# feature subset or the ground truth, so its path is built once and its
# contents are loaded lazily on first use instead of on every inner iteration.
manual_val_path = os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_'+'MANUAL_VALIDATION_LR'+'_'+granularity+'.pk')
y_manual_val = None
mask_manual_val = None

for feature_set in all_subsets(features):

    # Check if the features exist for this dataset + granularity + feature combination
    feature_paths = [os.path.join(base_dir,'data','03_processed',dataset,'features', dataset+'_'+feature+'_'+granularity+'.pk') for feature in feature_set]
    if not all(os.path.exists(fp) for fp in feature_paths):
        continue

    # The stacked feature matrix only depends on the feature subset; it is
    # loaded at most once per subset (on the first ground truth that needs it).
    X_orig = None

    for gt in lr_groundtruths:
        print(dataset,granularity,feature_set,gt)

        # Check if the ground truth exists
        gt_path = os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_'+gt+'_'+granularity+'.pk')
        if not os.path.exists(gt_path):
            print(gt_path, ": Does not exist")
            continue

        if not os.path.exists(manual_val_path):
            print(manual_val_path, ": Does not exist")
            continue

        if X_orig is None:
            X_orig = np.hstack([load_vector(fp) for fp in feature_paths])

        # Rows with a label >= 0 are kept; negative labels are excluded
        # (presumably "unlabelled" markers - TODO confirm).
        y_orig = load_vector(gt_path).values
        mask_gt = y_orig >= 0

        # Original note: the manual validation set has no 0 class items.
        if y_manual_val is None:
            y_manual_val = load_vector(manual_val_path).values
            mask_manual_val = y_manual_val >= 0

        # Features and both label vectors are indexed with the same boolean
        # masks, so they must describe the same rows.
        assert X_orig.shape[0] == len(y_orig) == len(y_manual_val), (
            "Row count mismatch: features=%d, %s=%d, manual validation=%d"
            % (X_orig.shape[0], gt, len(y_orig), len(y_manual_val))
        )

        # Train on rows labelled by this ground truth that are NOT part of the
        # manual validation set; test on the manually validated rows.
        mask_train = mask_gt & ~mask_manual_val
        mask_test = mask_manual_val

        assert mask_train.any() and mask_test.any(), (
            "Empty train or test split for %s / %s" % (feature_set, gt)
        )

        y_train = y_orig[mask_train]
        X_train = X_orig[mask_train]

        y_test = y_manual_val[mask_test]
        X_test = X_orig[mask_test]

        et = LGBMClassifier(n_estimators=100, min_data_in_leaf=500,colsample_bytree=0.8, class_weight='balanced', n_jobs=-1, verbose=-1)
        et.fit(X_train, y_train)

        # Macro-averaged F1 on the hard predictions.
        preds = et.predict(X_test)
        f1 = f1_score(y_test,preds, average='macro')

        # One-vs-one ROC AUC on the predicted class probabilities.
        probs = et.predict_proba(X_test)
        roc_auc = roc_auc_score(y_test,probs, multi_class='ovo')

        # Scores are keyed by the feature-subset tuple.
        performance_validation['f1'][gt][feature_set] = f1
        performance_validation['roc_auc'][gt][feature_set] = roc_auc

qanda _per_user ('use',) HASHTAG
qanda _per_user ('use',) URL_LR
qanda _per_user ('use',) POLITICIAN_LR
qanda _per_user ('use',) POLITICIAN_1H_LR
qanda _per_user ('use',) PARTY_FOLLOWER_LR
qanda _per_user ('ht',) HASHTAG








qanda _per_user ('ht',) URL_LR








qanda _per_user ('ht',) POLITICIAN_LR




qanda _per_user ('ht',) POLITICIAN_1H_LR




qanda _per_user ('ht',) PARTY_FOLLOWER_LR








qanda _per_user ('rt',) HASHTAG
qanda _per_user ('rt',) URL_LR
qanda _per_user ('rt',) POLITICIAN_LR
qanda _per_user ('rt',) POLITICIAN_1H_LR
qanda _per_user ('rt',) PARTY_FOLLOWER_LR
qanda _per_user ('use', 'ht') HASHTAG








qanda _per_user ('use', 'ht') URL_LR








qanda _per_user ('use', 'ht') POLITICIAN_LR




qanda _per_user ('use', 'ht') POLITICIAN_1H_LR




qanda _per_user ('use', 'ht') PARTY_FOLLOWER_LR








qanda _per_user ('use', 'rt') HASHTAG
qanda _per_user ('use', 'rt') URL_LR
qanda _per_user ('use', 'rt') POLITICIAN_LR
qanda _per_user ('use', 'rt') POLITICIAN_1H_LR
qanda _per_user ('use', 'rt') PARTY_FOLLOWER_LR
qanda _per_user ('ht', 'rt') HASHTAG








qanda _per_user ('ht', 'rt') URL_LR








qanda _per_user ('ht', 'rt') POLITICIAN_LR




qanda _per_user ('ht', 'rt') POLITICIAN_1H_LR




qanda _per_user ('ht', 'rt') PARTY_FOLLOWER_LR








qanda _per_user ('use', 'ht', 'rt') HASHTAG








qanda _per_user ('use', 'ht', 'rt') URL_LR


In [None]:
def _scores_to_frame(scores_by_gt):
    """Turn ``{ground truth: {feature tuple: score}}`` into a DataFrame.

    Each ground truth becomes a column; each feature tuple becomes a row label
    made of its members joined with '+'.
    """
    table = {}
    for truth_name, per_subset in scores_by_gt.items():
        table[truth_name] = {
            '+'.join(subset): score for subset, score in per_subset.items()
        }
    return pd.DataFrame(table)


performance_manual_val_f1 = _scores_to_frame(performance_validation['f1'])
performance_manual_val_roc_auc = _scores_to_frame(performance_validation['roc_auc'])

In [None]:
# Export both score tables as CSV files under <base_dir>/data/04_results.
results_dir = os.path.join(base_dir, 'data', '04_results')
# Create the output folder first: to_csv does not create missing parent
# directories, and failing here would discard the results of the whole run.
os.makedirs(results_dir, exist_ok=True)
performance_manual_val_f1.to_csv(os.path.join(results_dir, 'ablation_qanda_manual_val_f1.csv'))
performance_manual_val_roc_auc.to_csv(os.path.join(results_dir, 'ablation_qanda_manual_val_auc.csv'))