# Generate the Manual Validation Set

In [121]:
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()
import pickle as pk
import numpy as np
from pathlib import Path
base_dir = os.getenv('BASEDIR')

from tqdm import tqdm
tqdm.pandas()

from lightgbm import LGBMClassifier
from itertools import chain, combinations

In [122]:
import pandas as pd
import os 
import re
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.datasets import make_classification

import os
from collections import defaultdict

In [123]:
# Names of the left/right ground-truth signals; one classifier is trained per entry.
lr_groundtruths = [
    'HASHTAG',
    'URL_LR',
    'POLITICIAN_1H_LR',
    'PARTY_FOLLOWER_LR',
]

In [124]:
def load_vector(fp):
    """Return the object unpickled from the file at ``fp``.

    ``fp`` is passed straight to ``open`` in binary read mode.
    NOTE: unpickling executes arbitrary code; only use on trusted files.
    """
    with open(fp, 'rb') as handle:
        return pk.load(handle)

In [125]:
# Maps each ground-truth name to the predict_proba output for all users.
predictions = dict()

In [126]:
# Train one LightGBM classifier per ground-truth signal on the labelled users
# and store its class probabilities for every user in `predictions`.
granularity = '_per_user'
feature = 'use'
dataset='qanda'
processed_dir = os.path.join(base_dir, 'data', '03_processed', dataset)
# NOTE(review): `granularity` already starts with '_', so the names built below
# contain a double underscore (e.g. 'qanda_use__per_user.pk') -- confirm this
# matches the files on disk.
feature_path = os.path.join(processed_dir, 'features', dataset+'_'+feature+'_'+granularity+'.pk')
# The feature matrix does not depend on the ground truth, so it is read at most
# once (lazily, on the first ground truth that has usable labels).
X_orig = None
for gt in lr_groundtruths:
    train_path = os.path.join(processed_dir, 'ground_truth', dataset+'_'+gt+'_'+granularity+'.pk')
    y_train = load_vector(train_path).values
    # Rows with a negative label are excluded from training.
    mask = y_train >= 0
    if not mask.any():
        # No usable labels: nothing is stored in `predictions` for this ground truth.
        continue
    if X_orig is None:
        X_orig = load_vector(feature_path)
    y = y_train[mask]
    X = X_orig[mask]
    # `random_state` is the documented constructor argument for the seed.
    et = LGBMClassifier(n_estimators=100, colsample_bytree=0.8, class_weight='balanced', n_jobs=-1, random_state=123)
    et.fit(X, y)
    # Score every user, labelled or not. Later cells read columns 1 and 2 of
    # this array -- presumably the labels take three values; TODO confirm.
    preds = et.predict_proba(X_orig)
    predictions[gt] = preds

In [127]:
# Load the raw per-user table and turn its index into a regular column.
raw_per_user_path = os.path.join(base_dir, 'data', '01_raw_data', dataset, dataset+'_per_user.pk')
with open(raw_per_user_path, 'rb') as raw_file:
    raw_per_user = pk.load(raw_file)
data = raw_per_user.reset_index(drop=False)

In [128]:
# One row per user: the uid plus probability column 1 of every classifier.
left_columns = {'uid': data.uid.values}
left_columns.update((name, proba[:, 1]) for name, proba in predictions.items())
left_scores = pd.DataFrame(left_columns)

In [129]:
# One row per user: the uid plus probability column 2 of every classifier.
right_columns = {'uid': data.uid.values}
right_columns.update((name, proba[:, 2]) for name, proba in predictions.items())
right_scores = pd.DataFrame(right_columns)

In [130]:
# Join both score tables on uid; shared column names get '_left' / '_right' suffixes.
predictions_df = left_scores.merge(
    right_scores,
    on='uid',
    suffixes=('_left', '_right'),
)

In [131]:
# Collect the union of the 100 highest-scoring users for every
# (ground truth, side) score column.
uids = set()
for gt in lr_groundtruths:
    for side in ['left', 'right']:
        score_col = '_'.join([gt, side])
        # A ground truth without any non-negative label is never trained, so
        # its score columns do not exist; skip it instead of raising KeyError.
        if score_col not in predictions_df.columns:
            continue
        top_users = predictions_df.sort_values(score_col, ascending=False).uid.iloc[:100]
        uids.update(top_users.values)

In [132]:
# Make sure the output folder for the manual validation list exists.
manual_validation_dir = Path(os.path.join(base_dir, 'data', '02_ground_truth_data', 'manual_validation'))
manual_validation_dir.mkdir(parents=True, exist_ok=True)

In [133]:
# Write the selected uids, one per line, in a shuffled order.
# Set iteration order is not guaranteed to be stable between runs (it is not
# for string ids), so the ids are sorted first; the fixed random_state then
# yields the same shuffled file every time.
to_validate_path = os.path.join(base_dir, 'data', '02_ground_truth_data', 'manual_validation', 'qanda_to_validate.txt')
shuffled_uids = pd.Series(sorted(uids)).sample(frac=1.0, random_state=123)
shuffled_uids.to_csv(to_validate_path, header=False, index=False)