In [7]:
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()
import pickle as pk
import numpy as np
from pathlib import Path
base_dir = os.getenv('BASEDIR')

from tqdm import tqdm
tqdm.pandas()

### Prep Hashtag Ground Truth

In [55]:
hashtag_labels = pd.read_csv(os.path.join(base_dir,'data','02_ground_truth_data','qanda_labelled_hashtags.csv'))
def assign_polarity(e):
    if(e in {'left', 'more left', 'left?', 'left_'}):
        return -1
    elif(e in ['right', 'more right', 'right?']):
        return 1
    else:
        return 0

hashtag_labels['polarity'] = hashtag_labels['label'].apply(assign_polarity)
hashtag_labels = hashtag_labels[~hashtag_labels['polarity'].isna()]

left_hashtags = hashtag_labels[hashtag_labels['polarity'] == -1]['hashtags']
right_hashtags = hashtag_labels[hashtag_labels['polarity'] == 1]['hashtags']

def ground_truth_hashtag(ht):
    if ht in left_hashtags.values:
        return -1
    elif ht in right_hashtags.values:
        return 1
    else:
        return 0
    
def handle_labels(s):
    if np.isnan(s):
        return -1
    elif s > 0:
        return 2
    elif s == 0:
        return 0
    elif s < 0:
        return 1

In [56]:
datasets = ['qanda']

In [58]:
for dataset in datasets:
    with open(os.path.join(base_dir,'data','01_raw_data',dataset, dataset+'_per_user'+'.pk'), 'rb') as rf:
        data = pk.load(rf)
        data_hashtags = data['hashtags']
        ht_gt = data_hashtags.apply(lambda l: np.nanmean(list(map(ground_truth_hashtag, l)))).apply(handle_labels)
        Path( os.path.join(base_dir,'data','03_processed',dataset,'ground_truth') ).mkdir( parents=True, exist_ok=True )
    with open(os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_HASHTAG__per_user.pk'), 'wb') as wf:
        pk.dump(ht_gt, wf)

  ht_gt = data_hashtags.apply(lambda l: np.nanmean(list(map(ground_truth_hashtag, l)))).apply(handle_labels)


### Prep URL Ground Truth

In [8]:
domain_lr = pd.read_csv(os.path.join(base_dir,'data','02_ground_truth_data','url_data','domain_allsides_reuters_lr.csv')).set_index('domain')

In [9]:
import tldextract
def extract_domain(url):
    ext = tldextract.extract(url)
    return('.'.join([ext.domain, ext.suffix]))

def handle_labels(s):
    if np.isnan(s):
        return -1
    elif s > 0:
        return 2
    elif s == 0:
        return 0
    elif s < -1*0:
        return 1
    else:
        return -1
    
def get_url_ideology(full_url):
    domain = extract_domain(full_url)
    try:
        return(domain_lr.loc[domain].stance)
    except Exception as e:
        return(np.nan)

In [10]:
datasets=['qanda']

In [11]:
# datasets = ['qanda', 'ausvotes', 'riot', 'parler', 'socialsense']

In [12]:
granularity = '_per_user'
for dataset in datasets:
    data_path = os.path.join(base_dir,'data','01_raw_data',dataset, dataset+granularity+'.pk')
    with open(data_path, 'rb') as rf:
        data = pk.load(rf)
        url_ideology = data['urls'].progress_apply(lambda l: np.nanmean([get_url_ideology(e) for e in l])).apply(handle_labels)
    Path( os.path.join(base_dir,'data','03_processed',dataset,'ground_truth') ).mkdir( parents=True, exist_ok=True )
    with open(os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_URL_LR_'+granularity+'.pk'), 'wb') as wf:
        pk.dump(url_ideology, wf)

  url_ideology = data['urls'].progress_apply(lambda l: np.nanmean([get_url_ideology(e) for e in l])).apply(handle_labels)
100%|███████████████████████████████████████████████████████| 103074/103074 [00:07<00:00, 13693.08it/s]


### Prep FR URL Ground Truth

In [9]:
domain_lr = pd.read_csv(os.path.join(base_dir,'data','02_ground_truth_data','url_data','domain_allsides_reuters_lr.csv')).set_index('domain')

In [10]:
import tldextract
def extract_domain(url):
    ext = tldextract.extract(url)
    return('.'.join([ext.domain, ext.suffix]))

def handle_labels(s):
    return s > 0.5
    
def get_url_ideology(full_url):
    domain = extract_domain(full_url)
    try:
        return(domain_lr.loc[domain].stance)
    except Exception as e:
        return(np.nan)

In [11]:
datasets = ['qanda', 'ausvotes', 'riot', 'parler', 'socialsense']

In [12]:
granularity = '_per_user'
for dataset in datasets:
    data_path = os.path.join(base_dir,'data','01_raw_data',dataset, dataset+granularity+'.pk')
    with open(data_path, 'rb') as rf:
        data = pk.load(rf)
        url_ideology = data['urls'].progress_apply(lambda l: np.nanmean([get_url_ideology(e) for e in l])).apply(handle_labels)
    Path( os.path.join(base_dir,'data','03_processed',dataset,'ground_truth') ).mkdir( parents=True, exist_ok=True )
    with open(os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_URLa_FR_'+granularity+'.pk'), 'wb') as wf:
        pk.dump(url_ideology, wf)

  url_ideology = data['urls'].progress_apply(lambda l: np.nanmean([get_url_ideology(e) for e in l])).apply(handle_labels)
100%|███████████████████████████████████████████████████████| 103074/103074 [00:07<00:00, 13949.00it/s]
100%|████████████████████████████████████████████████████████| 273874/273874 [01:48<00:00, 2534.07it/s]
100%|███████████████████████████████████████████████████████| 574281/574281 [00:43<00:00, 13305.19it/s]
100%|███████████████████████████████████████████████████████| 120048/120048 [00:04<00:00, 26270.61it/s]
100%|█████████████████████████████████████████████████████████| 49442/49442 [00:03<00:00, 12805.89it/s]


### Prep FR URL MBFC Ground Truth

In [85]:
mbfc = pd.read_csv(os.path.join(base_dir,'data','02_ground_truth_data','url_data','mbfc','mbfc_poltical.tsv'), delimiter='\t')

In [86]:
mbfc

Unnamed: 0,source_url,source_url_normalized,ref,fact,bias
0,https://crooked.com,crooked.com,https://mediabiasfactcheck.com/crooked-media/,high,left
1,http://deepleftfield.info,deepleftfield.info,https://mediabiasfactcheck.com/deep-left-field/,mixed,left
2,https://antifascistnews.net,antifascistnews.net,https://mediabiasfactcheck.com/anti-fascist-news/,high,left
3,http://www.cnn.com,cnn.com,http://mediabiasfactcheck.com/cnn/,mixed,left
4,http://www.allthatsfab.com,allthatsfab.com,http://mediabiasfactcheck.com/all-thats-fab/,mixed,left
...,...,...,...,...,...
854,http://www.unz.com,unz.com,https://mediabiasfactcheck.com/the-unz-report/,low,right
855,http://www.westernsentinel.com,westernsentinel.com,https://mediabiasfactcheck.com/western-sentinel/,low,right
856,http://www.uschronicle.com,uschronicle.com,http://mediabiasfactcheck.com/us-chronicle/,low,right
857,https://www.thepublicdiscourse.com,thepublicdiscourse.com,https://mediabiasfactcheck.com/witherspoon-ins...,low,right


In [13]:
mbfc = pd.read_csv(os.path.join(base_dir,'data','02_ground_truth_data','url_data','mbfc','mbfc_poltical.tsv'), delimiter='\t')
mbfc = mbfc[mbfc['bias'] == 'right']
mbfc['domain'] = mbfc['source_url_normalized']
mbfc['stance'] = 1.0
mbfc = mbfc.set_index('domain')

In [14]:
import tldextract
def extract_domain(url):
    ext = tldextract.extract(url)
    return('.'.join([ext.domain, ext.suffix]))

def handle_labels(s):
    return s > 0.5
    
def get_url_ideology(full_url):
    domain = extract_domain(full_url)
    try:
        return(mbfc.loc[domain].stance)
    except Exception as e:
        return(np.nan)

In [15]:
datasets = ['qanda', 'ausvotes', 'riot', 'parler', 'socialsense']

In [16]:
granularity = '_per_user'
for dataset in datasets:
    data_path = os.path.join(base_dir,'data','01_raw_data',dataset, dataset+granularity+'.pk')
    with open(data_path, 'rb') as rf:
        data = pk.load(rf)
        url_ideology = data['urls'].progress_apply(lambda l: np.nanmean([get_url_ideology(e) for e in l])).apply(handle_labels)
    Path( os.path.join(base_dir,'data','03_processed',dataset,'ground_truth') ).mkdir( parents=True, exist_ok=True )
    with open(os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_URLb_FR_'+granularity+'.pk'), 'wb') as wf:
        pk.dump(url_ideology, wf)

  url_ideology = data['urls'].progress_apply(lambda l: np.nanmean([get_url_ideology(e) for e in l])).apply(handle_labels)
100%|███████████████████████████████████████████████████████| 103074/103074 [00:05<00:00, 17251.44it/s]
100%|████████████████████████████████████████████████████████| 273874/273874 [01:14<00:00, 3689.48it/s]
100%|███████████████████████████████████████████████████████| 574281/574281 [00:39<00:00, 14624.19it/s]
100%|███████████████████████████████████████████████████████| 120048/120048 [00:04<00:00, 26234.93it/s]
100%|█████████████████████████████████████████████████████████| 49442/49442 [00:02<00:00, 18733.86it/s]


### Prep FR Seed User Ground Truth

In [17]:
far_right_df = pd.read_csv(os.path.join(base_dir,'data','02_ground_truth_data','far-right-users','far_right_coded_users.csv'), dtype=str)
recent_far_right_df = pd.read_csv(os.path.join(base_dir,'data','02_ground_truth_data','far-right-users','auspol-2022-accounts_to_monitor.csv'), dtype=str)
far_right_set = set(far_right_df['user_id']).union(recent_far_right_df['user_id'])

In [18]:
datasets = ['qanda', 'ausvotes', 'riot', 'parler', 'socialsense']

In [19]:
granularity = '_per_user'
for dataset in datasets:
    data_path = os.path.join(base_dir,'data','01_raw_data',dataset, dataset+granularity+'.pk')
    with open(data_path, 'rb') as rf:
        data = pk.load(rf).reset_index(drop=False)
        fr_seed_gt =  data['uid'].isin(far_right_set)
    Path( os.path.join(base_dir,'data','03_processed',dataset,'ground_truth') ).mkdir( parents=True, exist_ok=True )
    with open(os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_USER_FR_'+granularity+'.pk'), 'wb') as wf:
        pk.dump(fr_seed_gt, wf)

### Prep TPD Ground Truth

In [43]:
tpd_data = pd.read_csv(os.path.join(base_dir,'data','02_ground_truth_data','full_member_info.csv'),encoding = 'utf16', dtype=str)

In [44]:
tpd_data = tpd_data[tpd_data['country'] == 'Australia']
tpd_data = tpd_data[~tpd_data['uid'].isna()]

In [45]:
tpd_data = tpd_data[['name', 'party_id', 'party', 'uid']]

In [46]:
def get_ideology(pi):
    if pi == '464':
        return 2
    if pi == '465':
        return 1
    if pi == '467':
        return 2
    if pi == '468':
        return 0
    if pi == '469':
        return 1
    if pi == '471':
        return 0
    if pi == '475':
        return 2

In [47]:
tpd_data['stance'] = tpd_data['party_id'].apply(get_ideology)

In [48]:
tpd_data = tpd_data.set_index('uid')

In [49]:
datasets = ['qanda', 'ausvotes']

In [50]:
def get_politician_ideology(uid):
    try:
        return tpd_data.loc[uid].stance
    except Exception as e:
        return -1

In [51]:
granularity = '_per_user'
for dataset in datasets:
    data_path = os.path.join(base_dir,'data','01_raw_data',dataset, dataset+granularity+'.pk')
    with open(data_path, 'rb') as rf:
        data = pk.load(rf).reset_index(drop=False)
        tpd_gt =  data['uid'].apply(get_politician_ideology)
    Path( os.path.join(base_dir,'data','03_processed',dataset,'ground_truth') ).mkdir( parents=True, exist_ok=True )
    with open(os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_POLITICIAN_LR_'+granularity+'.pk'), 'wb') as wf:
        pk.dump(tpd_gt, wf)

In [53]:
from collections import Counter
granularity = '_per_user'
for dataset in datasets:
    data_path_per_user = os.path.join(base_dir,'data','01_raw_data',dataset, dataset+'_per_user'+'.pk')
    data_path_per_post = os.path.join(base_dir,'data','01_raw_data',dataset, dataset+'_per_post'+'.pk')
    with open(data_path_per_user, 'rb') as rf:
        data_per_user = pk.load(rf).reset_index(drop=False)
        tpd_gt =  data_per_user['uid'].apply(get_politician_ideology)
    with open(data_path_per_post, 'rb') as rf:
        data_per_post = pk.load(rf).reset_index(drop=False)
        data_per_post['stance'] = data_per_post['uid'].apply(get_politician_ideology)

        def most_common(lst):
            data = Counter(lst)
            return max(lst, key=data.get)

        rid_stance_index = data_per_post[~data_per_post['stance'].isna()].groupby('rid').apply(lambda d: most_common(d['stance']))
        def get_retweet_ideology(rid):
            try:
                return rid_stance_index.loc[rid]
            except Exception as e:
                return pd.NA
        data_per_post['stance'] = data_per_post['rid'].apply(get_retweet_ideology)

        uid_stance_index = data_per_post.groupby('uid').apply(lambda d: most_common(d['stance']))
        uid_stance_index = uid_stance_index[~uid_stance_index.isna()]
        def get_politician_1h_ideology(uid):
            try:
                return uid_stance_index.loc[uid]
            except Exception as e:
                return -1
        tpd_1h_gt =  data_per_user['uid'].apply(get_politician_1h_ideology)
        Path( os.path.join(base_dir,'data','03_processed',dataset,'ground_truth') ).mkdir( parents=True, exist_ok=True )
        with open(os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_POLITICIAN_1H_LR_'+'_per_user'+'.pk'), 'wb') as wf:
            pk.dump(tpd_1h_gt, wf)

### Prep Party Followers Ground Truth

In [29]:
party_followers = pd.read_csv(os.path.join(base_dir, 'data','02_ground_truth_data','party_followers','data.csv'), names=['party','follower'], dtype='str')
# .set_index('follower')

In [30]:
multiparty_followers = party_followers.groupby('follower')['party'].count()

In [31]:
single_party_followers = multiparty_followers[multiparty_followers ==1].index.values

In [32]:
party_followers = party_followers[party_followers.follower.isin(single_party_followers)].set_index('follower')

In [33]:
def get_ideology(party):
    if party == 'Climate200':
        return 1
    if party == 'Greens':
        return 1
    if party == 'AustralianLabor':
        return 1
    if party == 'centre_alliance':
        return 0
    if party == 'LambieNetwork':
        return 0
    if party == 'LiberalAus':
        return 2
    if party == 'The_Nationals':
        return 2
    if party == 'UnitedAusParty':
        return 2
    if party == 'KAPteam':
        return 2
    if party == 'OneNationAus':
        return 2

In [34]:
party_followers['stance'] = party_followers.party.apply(get_ideology)

In [35]:
datasets = ['qanda', 'ausvotes', 'riot', 'parler', 'socialsense']

In [36]:
def get_party_follower_ideology(uid):
    try:
        return party_followers.loc[uid].stance
    except Exception as e:
        return -1

In [37]:
granularity = '_per_user'
for dataset in datasets:
    data_path = os.path.join(base_dir,'data','01_raw_data',dataset, dataset+granularity+'.pk')
    with open(data_path, 'rb') as rf:
        data = pk.load(rf).reset_index(drop=False)
        merged_df = pd.merge(data, party_followers[['stance']],right_index=True,how='left', left_on='uid')
        # merged_df = merged_df[~merged_df.index.duplicated()]
        party_follower_gt = merged_df[~merged_df.index.duplicated()]['stance'].fillna(-1)
        # party_follower_gt =  data['uid'].progress_apply(get_party_follower_ideology)
        # party_follower_gt = np.array(Parallel(n_jobs=40, verbose=1)(delayed(get_party_follower_ideology)(uid) for uid in data['uid'].to_list()))
    Path( os.path.join(base_dir,'data','03_processed',dataset,'ground_truth') ).mkdir( parents=True, exist_ok=True )
    with open(os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_PARTY_FOLLOWER_LR_'+granularity+'.pk'), 'wb') as wf:
        pk.dump(party_follower_gt, wf)

### Prep Validation Ground Truth

In [21]:
manual_validation = pd.read_csv(os.path.join(base_dir,'data','02_ground_truth_data','manual_validation','qanda_validated_2.csv'), dtype=str)
manual_validation['stance'] = manual_validation.label.astype(int)
manual_validation['uid'] = manual_validation.UID

In [22]:
# manual_validation = pd.read_csv(os.path.join(base_dir,'data','02_ground_truth_data','qanda_manual_validation.csv'), dtype=str)
# manual_validation['stance'] = manual_validation.stance.astype(int)
# # manual_validation.loc[manual_validation['stance'] == -1,'stance'] = 0

In [23]:
manual_validation = manual_validation[manual_validation['stance'] > -1][['uid','stance']]

In [24]:
datasets = ['qanda']

In [25]:
# for dataset in datasets:
dataset='qanda'
with open(os.path.join(base_dir,'data','01_raw_data',dataset, dataset+'_per_user'+'.pk'), 'rb') as rf:
    data = pk.load(rf)
    manual_val = pd.merge(data,manual_validation, on='uid', how='left').stance.fillna(-1)
    Path( os.path.join(base_dir,'data','03_processed',dataset,'ground_truth') ).mkdir( parents=True, exist_ok=True )
    with open(os.path.join(base_dir,'data','03_processed',dataset,'ground_truth', dataset+'_MANUAL_VALIDATION_LR_'+'_per_user'+'.pk'), 'wb') as wf:
        pk.dump(manual_val, wf)