In [231]:
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()
from gdtm.helpers.common import load_flat_dataset

# Root directory of the project, read from the BASEDIR environment variable
# (load_dotenv() is called above); None if the variable is not set.
base_dir = os.getenv('BASEDIR')

# When True, posts are aggregated per (uid, episode) before labelling and embedding.
is_per_episode = True
# When True, the URL-derived ground truth is used as the label;
# otherwise the hashtag-derived ground truth is used.
is_url_gt = True

In [232]:
from tqdm import tqdm
tqdm.pandas()

In [233]:
import pandas as pd
import os 
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub

In [234]:
# Load the raw post-level data; every column is read as a string.
data = pd.read_csv(os.path.join(base_dir,'data','01_raw','qanda_episodes.csv'), dtype=str)


def _split_multi(cell, sep=';;;'):
    """Split a separator-joined cell into a list of values.

    An empty cell yields ``[]``. A plain ``''.split(sep)`` would yield ``['']``,
    i.e. one phantom empty hashtag/mention/URL for every post that has none.
    """
    return cell.split(sep) if cell else []


# Multi-valued columns are stored as ';;;'-joined strings; missing cells become [].
for _col in ('hashtags', 'mentions', 'urls'):
    data[_col] = data[_col].fillna('').apply(_split_multi)

# Copy of the text with URLs and hashtags replaced by placeholder tokens.
data['text_ht_censored'] = (
    data['text']
    .apply(lambda t: re.sub(r'http\S+', '<URL>', t))
    .apply(lambda t: re.sub("#[A-Za-z0-9_]+", "<HASHTAG>", t))
)
# The text column itself has URLs, hashtags and @mentions removed entirely.
data['text'] = (
    data['text']
    .apply(lambda t: re.sub(r'http\S+', '', t))
    .apply(lambda t: re.sub("#[A-Za-z0-9_]+", "", t))
    .apply(lambda t: re.sub(r'@\S+', '', t))
)

# Rows with a missing 'rid' take the value of their own 'tid'.
data['rid'] = data['rid'].combine_first(data['tid'])

In [235]:
# Keep a handle on the post-level frame before `data` is (optionally) rebound to the
# per-episode aggregate. NOTE: this is a second reference to the same object, not a copy.
post_level_data = data

In [236]:
if is_per_episode:
    def _collapse_episode(posts):
        """Merge all posts of one (uid, episode) group into a single-row frame."""
        return pd.DataFrame({
            # All post texts of the group, space-joined into one document.
            'text' : ' '.join(posts['text']),
            # Flattened, non-empty hashtags of the group.
            'hashtags': [[tag for tags in posts['hashtags'] for tag in tags if tag != '']],
            # Every rid of the group, in row order.
            'rid':[list(posts['rid'])],
            # Flattened, non-empty URLs of the group.
            'urls' : [[url for urls in posts['urls'] for url in urls if url != '']],
        })

    data = data.groupby(['uid','episode']).progress_apply(_collapse_episode)

100%|██████████| 275032/275032 [03:17<00:00, 1392.43it/s]


### Hashtag Ground Truth

In [237]:
hashtag_labels = pd.read_csv(os.path.join(base_dir,'data','01_raw','hashtags_labelled_100.csv'))
def assign_polarity(e):
    """Map a hashtag label to a polarity: -1 for left labels, 1 for right labels, else 0."""
    polarity_by_label = {
        'left': -1, 'more left': -1, 'left?': -1, 'left_': -1,
        'right': 1, 'more right': 1, 'right?': 1,
    }
    # Any label not listed above (including a missing one) is treated as neutral.
    return polarity_by_label.get(e, 0)

# Numeric polarity per labelled hashtag: -1 left, 1 right, 0 otherwise.
hashtag_labels['polarity'] = hashtag_labels['label'].apply(assign_polarity)
# NOTE(review): assign_polarity always returns -1, 0 or 1, so this NaN filter keeps every row.
hashtag_labels = hashtag_labels[~hashtag_labels['polarity'].isna()]

In [238]:
# Hashtags whose label maps to polarity -1 (left) / +1 (right).
left_hashtags = hashtag_labels[hashtag_labels['polarity'] == -1]['hashtags']
right_hashtags = hashtag_labels[hashtag_labels['polarity'] == 1]['hashtags']

# Built once so each lookup is a hash probe instead of a scan over the Series values.
_left_hashtag_set = set(left_hashtags)
_right_hashtag_set = set(right_hashtags)

def ground_truth_hashtag(ht):
    """Return the polarity of a single hashtag.

    Returns -1 if ``ht`` is one of the left-labelled hashtags, 1 if it is one of
    the right-labelled hashtags, and 0 otherwise. The left set is checked first.
    """
    if ht in _left_hashtag_set:
        return -1
    if ht in _right_hashtag_set:
        return 1
    return 0

In [239]:
data_hashtags = data['hashtags']


def _mean_hashtag_polarity(tags):
    """Mean ground-truth polarity of a list of hashtags; NaN when the list is empty."""
    scores = [ground_truth_hashtag(tag) for tag in tags]
    # np.nanmean([]) also yields NaN, but emits "RuntimeWarning: Mean of empty slice"
    # for every row without hashtags; returning NaN directly gives the same value silently.
    return np.mean(scores) if scores else np.nan


ground_truth_df = pd.DataFrame({'hashtags' : data_hashtags})
ground_truth_df['ht_gt'] = data_hashtags.apply(_mean_hashtag_polarity)
def handle_labels(s):
    """Turn a mean polarity score into a class id.

    NaN -> -1 (no ground truth), 0 -> 0, negative -> 1, positive -> 2.
    """
    if np.isnan(s):
        return -1
    if s == 0:
        return 0
    if s < 0:
        return 1
    if s > 0:
        return 2
# Recode the mean polarity into class ids: -1 no ground truth, 0 zero mean, 1 negative (left), 2 positive (right).
ground_truth_df['ht_gt'] = ground_truth_df['ht_gt'].apply(handle_labels)

  ground_truth_df['ht_gt'] = data_hashtags.apply(lambda l: np.nanmean(list(map(ground_truth_hashtag, l))))


### URL Ground Truth

In [240]:
import tldextract
def extract_domain(url):
    """Return '<domain>.<suffix>' for ``url``, using the parts found by tldextract."""
    parts = tldextract.extract(url)
    pieces = [parts.domain, parts.suffix]
    return '.'.join(pieces)

In [241]:
# For every row, the '<domain>.<suffix>' form of each URL in its 'urls' list.
data_domains = data['urls'].apply(lambda l: [extract_domain(e) for e in l])

In [242]:
## Is URL ideology correlated with Hashtag ideology?
# Per-domain table with an 'lr' column (presumably a left-right score for Australian
# news outlets, going by the file name — TODO confirm).
reuters_domain_ideology = pd.read_csv(os.path.join(base_dir,'data','01_raw','reuters_2020_au_news_lr.csv'))
# Drop rows without a domain and index by domain so scores can be looked up with .loc[domain].
reuters_domain_ideology = reuters_domain_ideology[~reuters_domain_ideology['domain'].isna()].set_index('domain')

In [243]:
# Re-centre the 'lr' scores so that abc.net.au sits at exactly 0.
reuters_domain_ideology['lr'] = reuters_domain_ideology['lr'] - reuters_domain_ideology.loc['abc.net.au','lr']

In [244]:
# def get_domain_ideology_reuters(domain):
#     try:
#         return(reuters_domain_ideology.loc[domain].lr)
#     except Exception as e:
#         return(np.nan)
    
# users_with_url_ideology_gt = data_domains.apply(lambda l: np.nanmean([get_domain_ideology_reuters(e) for e in l])).apply(handle_labels)

In [245]:
# AllSides media-bias table, restricted to the source name, its bias rating and its site URL.
allsides_df = pd.read_csv(os.path.join(base_dir,'data','02_processed','all_allsides_media_bias.csv'))[['News Source','AllSides Bias Rating','News Source Site']]

In [246]:
from urllib.parse import urlparse
def strip_scheme(url):
    """Return ``url`` without its leading ``<scheme>://`` prefix.

    The URL is first normalised through ``urlparse(...).geturl()``. If it has no
    scheme, or does not start with ``<scheme>://``, the normalised URL is returned
    unchanged. Only a leading prefix is removed, so a ``://`` that appears later in
    the URL (e.g. inside a query string) is never touched.
    """
    parsed = urlparse(url)
    normalised = parsed.geturl()
    if not parsed.scheme:
        # Without this guard the prefix would be the bare "://" and the first
        # occurrence anywhere in the URL would be deleted.
        return normalised
    prefix = "%s://" % parsed.scheme
    if normalised.startswith(prefix):
        return normalised[len(prefix):]
    return normalised

In [247]:
# Numeric stance assigned to each AllSides bias rating.
stance_by_rating = {'left center':-0.5, 'left':-1.0,'center':0.0,'right center':0.5, 'right':1.0}

# Keep rows whose rating is not the literal 'allsides' and that have a site URL.
rated = allsides_df['AllSides Bias Rating'] != 'allsides'
has_site = ~allsides_df['News Source Site'].isna()
allsides_df = allsides_df[rated & has_site]

allsides_df['stance'] = allsides_df['AllSides Bias Rating'].progress_apply(lambda rating: stance_by_rating[rating])
# Site URL with its scheme removed, used for prefix matching against shared URLs.
allsides_df['url'] = allsides_df['News Source Site'].apply(strip_scheme)

100%|██████████| 478/478 [00:00<00:00, 503105.98it/s]


In [248]:
def get_news_source(full_url):
    """Return the AllSides news source whose site URL is a prefix of ``full_url``.

    ``full_url`` is compared with its scheme stripped against ``allsides_df['url']``.
    Rows are tried in order and the first match wins; returns None when no row matches.
    """
    # Loop-invariant: strip the scheme once rather than once per AllSides row.
    stripped = strip_scheme(full_url)
    for start_url, news_source in zip(allsides_df['url'],allsides_df['News Source']):
        if stripped.startswith(start_url):
            return news_source
    return None

def get_allsides_stance(full_url):
    """Return the AllSides stance of the first source whose site URL prefixes ``full_url``.

    ``full_url`` is compared with its scheme stripped against ``allsides_df['url']``.

    Raises
    ------
    Exception
        If no row of ``allsides_df`` matches the URL.
    """
    # Loop-invariant: strip the scheme once rather than once per AllSides row.
    stripped = strip_scheme(full_url)
    for start_url, stance in zip(allsides_df['url'],allsides_df['stance']):
        if stripped.startswith(start_url):
            return stance
    raise Exception('URL not in allsides dataset')

In [249]:
def get_url_ideology(full_url):
    """Return an ideology score for a URL, or NaN when neither table knows it.

    The Reuters table is tried first (looked up by extracted domain); if that
    fails for any reason, the AllSides prefix match is tried; if that fails too,
    NaN is returned.
    """
    domain = extract_domain(full_url)
    try:
        score = reuters_domain_ideology.loc[domain].lr
    except Exception:
        try:
            score = get_allsides_stance(full_url)
        except Exception:
            score = np.nan
    return score

In [250]:
def get_ideology_source(full_url):
    """Tell which table supplies the ideology score for a URL.

    Returns 'Australian' when the Reuters lookup succeeds, 'American' when only
    the AllSides lookup succeeds, and NaN when both fail.
    """
    domain = extract_domain(full_url)
    try:
        # Only the success of the lookup matters here; the value is discarded.
        reuters_domain_ideology.loc[domain].lr
    except Exception:
        pass
    else:
        return 'Australian'

    try:
        get_allsides_stance(full_url)
    except Exception:
        return np.nan
    return 'American'

In [251]:
def _mean_url_ideology(urls):
    """Mean ideology over the URLs that have a score; NaN when none of them has one."""
    scores = np.asarray([get_url_ideology(u) for u in urls], dtype=float)
    # np.nanmean returns NaN for an empty or all-NaN input too, but emits a
    # RuntimeWarning for every such row; short-circuit to get the same value silently.
    if scores.size == 0 or np.isnan(scores).all():
        return np.nan
    return np.nanmean(scores)


url_ideology = data['urls'].progress_apply(_mean_url_ideology).apply(handle_labels)

  url_ideology = data['urls'].progress_apply(lambda l: np.nanmean([get_url_ideology(e) for e in l])).apply(handle_labels)
100%|██████████| 275032/275032 [01:16<00:00, 3598.97it/s]


In [252]:
# url_ideology = users_with_url_ideology_gt.combine_first(allside_ideology_gt)

In [253]:
# Hashtag-based and URL-based class ids side by side, one row per row of `data`.
ground_truths = pd.DataFrame({'ht_gt':ground_truth_df['ht_gt'],'url_gt': url_ideology})

In [254]:
# The label used downstream: URL ground truth when is_url_gt is set, hashtag ground truth otherwise.
ground_truths['gt_label'] = ground_truths[('url_gt' if is_url_gt else 'ht_gt')]

### Generate Embeddings

In [255]:
def dummy_fun(doc):
    """Identity function: hand ``doc`` back untouched.

    Passed to TfidfVectorizer as both tokenizer and preprocessor so that
    documents which are already lists of tokens go through unchanged.
    """
    return doc

# TF-IDF over pre-tokenised documents: tokenizer and preprocessor are identity
# functions, and tokens seen in fewer than 10 documents are dropped.
tfidf = TfidfVectorizer(
            analyzer='word',
            tokenizer=dummy_fun,
            preprocessor=dummy_fun,
            token_pattern=None,
            min_df = 10)

toy = data_hashtags

# .toarray() yields a plain ndarray. The previous .todense() returned an np.matrix,
# which NumPy discourages and which recent scikit-learn input validation does not accept.
ht_embeddings = tfidf.fit_transform(toy.apply(lambda l: [str(s) for s in l])).toarray()

In [256]:
# TF-Hub handle of the Universal Sentence Encoder, version 4.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Load the module; the returned object is called directly on text inputs below.
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
    """Run the loaded sentence-encoder ``model`` on ``input`` and return its output."""
    embeddings = model(input)
    return embeddings

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [257]:
def batched_embed(l_text, chunk_size=10):
    """Embed a sequence of texts chunk by chunk and stack the results row-wise.

    Parameters
    ----------
    l_text : sliceable sequence of texts (a list in this notebook)
        Texts to embed.
    chunk_size : int, default 10
        Number of texts passed to ``embed`` per call.

    Returns
    -------
    numpy.ndarray
        The per-chunk embeddings stacked vertically, in input order.

    Raises
    ------
    ValueError
        If ``l_text`` is empty (``np.vstack`` needs at least one array).
    """
    all_embeddings = []
    for start in tqdm(range(0, len(l_text), chunk_size)):
        # Slicing clamps at the end of the sequence, so the last chunk may be shorter.
        chunk = l_text[start:start + chunk_size]
        all_embeddings.append(embed(chunk).numpy())
    return np.vstack(all_embeddings)

In [258]:
# Sentence-encoder embedding of every row's text, computed in batches.
use_embeddings = batched_embed(list(data['text']))

100%|██████████| 27504/27504 [03:24<00:00, 134.32it/s]


In [259]:
# Size of the reshare vocabulary: the N most frequent 'rid' values at post level.
N_POPULAR_THREADS = 1000
most_popular_threads = post_level_data['rid'].value_counts().iloc[0:N_POPULAR_THREADS].index.values
# Thread id -> column position, so each rid costs one dict lookup instead of a scan
# over every popular thread.
_thread_position = {thread: i for i, thread in enumerate(most_popular_threads)}


def _threads_to_bow(rids):
    """Binary indicator vector of length N_POPULAR_THREADS for an iterable of rids.

    Position i is 1.0 when most_popular_threads[i] occurs in ``rids``, else 0.0.
    """
    res = np.zeros(N_POPULAR_THREADS)
    for rid in rids:
        pos = _thread_position.get(rid)
        if pos is not None:
            res[pos] = 1
    return res


if is_per_episode:
    def to_bow(rids):
        """Indicator vector for the list of rids of one (uid, episode) row."""
        return _threads_to_bow(rids)
else:
    def to_bow(rid):
        """Indicator vector for the single rid of one post."""
        return _threads_to_bow([rid])

retweet_embeddings = data['rid'].progress_apply(to_bow)

100%|██████████| 275032/275032 [00:33<00:00, 8192.37it/s]


In [260]:
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.datasets import make_classification

import os
from collections import defaultdict

In [261]:
lgbm_performance = {}

In [262]:
X_use = use_embeddings
use_embeddings.shape

(275032, 512)

In [263]:
rt_embeddings = np.vstack(retweet_embeddings.values)

In [264]:
X_rt = rt_embeddings
rt_embeddings.shape

(275032, 1000)

In [265]:
X_ht = ht_embeddings
ht_embeddings.shape

(275032, 2837)

In [266]:
# Class id per row as a NumPy array; -1 marks rows without a ground-truth label.
y_orig = ground_truths['gt_label'].values

In [267]:
# ground_truths['url_gt'].value_counts()

In [268]:
# Boolean mask of the rows that have a ground-truth label (class id 0, 1 or 2).
mask = y_orig >= 0

## USE

In [269]:
feature_set = 'use'

In [270]:
X = X_use[mask]

In [271]:
y=y_orig[mask]

In [272]:
from lightgbm import LGBMClassifier

et = LGBMClassifier(n_estimators=100, min_data_in_leaf=500,colsample_bytree=0.8, class_weight='balanced', n_jobs=-1)

# validation instance
# Seeded so the shuffled fold assignment, and hence the reported AUC, is reproducible.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# getting the model validation predictions (out-of-fold class probabilities)
preds = cross_val_predict(et, X, y, cv=skf, method='predict_proba')

# evaluating the model: multi-class one-vs-one AUC, computed once and reused
auc = roc_auc_score(y, preds, multi_class='ovo')
print('Area under the ROC Curve:', auc)

lgbm_performance[feature_set] = auc

Area under the ROC Curve: 0.9530983934701641


In [273]:
et.fit(X, y)



LGBMClassifier(class_weight='balanced', colsample_bytree=0.8,
               min_data_in_leaf=500)

In [274]:
predicted_labels = et.predict(X_use)

In [278]:
predicted_probs = et.predict_proba(X_use)

In [275]:
def handle_l_labels(l):
    """Recode a predicted class id to a signed label.

    2 -> 1, 1 -> -1, anything below 1 -> 0; any other value yields None.
    """
    if l < 1:
        return 0
    elif l == 1:
        return -1
    elif l == 2:
        return 1

In [276]:
# Store the predicted class ids, then recode them to signed labels
# (2 -> 1, 1 -> -1, anything below 1 -> 0).
data['label'] = predicted_labels
data['label'] = data['label'].apply(handle_l_labels)

In [279]:
# Weight the probability columns by [0, -1, 1] (assumes the columns follow class ids
# 0, 1, 2 — TODO confirm against et.classes_), then keep, per row, the weighted
# value with the largest magnitude.
signed_probs = predicted_probs * [0, -1, 1]
strongest_col = np.argmax(np.abs(signed_probs), axis=1)
row_idx = np.arange(len(predicted_probs))
data['stance'] = signed_probs[row_idx, strongest_col]

In [280]:
# Persist the frame, including the 'label' and 'stance' columns added above, under 02_processed.
data.to_csv(os.path.join(base_dir,'data','02_processed','qanda_episode_w_labels.csv'))

## Hashtags

In [212]:
feature_set = 'ht'

In [213]:
X = X_ht[mask]

In [214]:
y=y_orig[mask]

In [215]:
from lightgbm import LGBMClassifier

et = LGBMClassifier(n_estimators=100, min_data_in_leaf=500,colsample_bytree=0.8, class_weight='balanced', n_jobs=-1)

# validation instance
# Seeded so the shuffled fold assignment, and hence the reported AUC, is reproducible.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# getting the model validation predictions (out-of-fold class probabilities)
preds = cross_val_predict(et, X, y, cv=skf, method='predict_proba')

# evaluating the model: multi-class one-vs-one AUC, computed once and reused
auc = roc_auc_score(y, preds, multi_class='ovo')
print('Area under the ROC Curve:', auc)

lgbm_performance[feature_set] = auc















Area under the ROC Curve: 0.7112458416441328




## Reshare

In [216]:
feature_set = 'rt'

In [217]:
X = X_rt[mask]

In [218]:
y=y_orig[mask]

In [219]:
from lightgbm import LGBMClassifier

et = LGBMClassifier(n_estimators=100, min_data_in_leaf=500,colsample_bytree=0.8, class_weight='balanced', n_jobs=-1)

# validation instance
# Seeded so the shuffled fold assignment, and hence the reported AUC, is reproducible.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# getting the model validation predictions (out-of-fold class probabilities)
preds = cross_val_predict(et, X, y, cv=skf, method='predict_proba')

# evaluating the model: multi-class one-vs-one AUC, computed once and reused
auc = roc_auc_score(y, preds, multi_class='ovo')
print('Area under the ROC Curve:', auc)

lgbm_performance[feature_set] = auc

Area under the ROC Curve: 0.5005678868700713


## Hashtag + USE

In [220]:
feature_set = 'ht_use'

In [221]:
# Select the labelled rows before stacking so the full (all rows x all features)
# dense matrix is never materialised; the result is the same rows and columns.
X = np.hstack([X_ht[mask], X_use[mask]])

In [222]:
y=y_orig[mask]

In [223]:
from lightgbm import LGBMClassifier

et = LGBMClassifier(n_estimators=100, min_data_in_leaf=500,colsample_bytree=0.8, class_weight='balanced', n_jobs=-1)

# validation instance
# Seeded so the shuffled fold assignment, and hence the reported AUC, is reproducible.
# NOTE(review): 2 folds here versus 4 in the single-feature-set cells, so the scores
# are not strictly comparable across feature sets.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# getting the model validation predictions (out-of-fold class probabilities)
preds = cross_val_predict(et, X, y, cv=skf, method='predict_proba')

# evaluating the model: multi-class one-vs-one AUC, computed once and reused
auc = roc_auc_score(y, preds, multi_class='ovo')
print('Area under the ROC Curve:', auc)
lgbm_performance[feature_set] = auc







Area under the ROC Curve: 0.9327524344815968




## Retweet + Linguistic

In [224]:
feature_set = 'rt_use'

In [225]:
# Select the labelled rows before stacking so the full dense stack is never materialised.
X = np.hstack([X_rt[mask], X_use[mask]])
y = y_orig[mask]

from lightgbm import LGBMClassifier

et = LGBMClassifier(n_estimators=100, min_data_in_leaf=500,colsample_bytree=0.8, class_weight='balanced', n_jobs=-1)

# validation instance
# Seeded so the shuffled fold assignment, and hence the reported AUC, is reproducible.
# NOTE(review): 2 folds here versus 4 in the single-feature-set cells, so the scores
# are not strictly comparable across feature sets.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# getting the model validation predictions (out-of-fold class probabilities)
preds = cross_val_predict(et, X, y, cv=skf, method='predict_proba')

# evaluating the model: multi-class one-vs-one AUC, computed once and reused
auc = roc_auc_score(y, preds, multi_class='ovo')
print('Area under the ROC Curve:', auc)
lgbm_performance[feature_set] = auc

Area under the ROC Curve: 0.9313122589525502


## Retweet + Hashtag

In [226]:
feature_set = 'rt_ht'

In [227]:
# Select the labelled rows before stacking so the full dense stack is never materialised.
X = np.hstack([X_rt[mask], X_ht[mask]])
y = y_orig[mask]

from lightgbm import LGBMClassifier

et = LGBMClassifier(n_estimators=100, min_data_in_leaf=500,colsample_bytree=0.8, class_weight='balanced', n_jobs=-1)

# validation instance
# Seeded so the shuffled fold assignment, and hence the reported AUC, is reproducible.
# NOTE(review): 2 folds here versus 4 in the single-feature-set cells, so the scores
# are not strictly comparable across feature sets.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# getting the model validation predictions (out-of-fold class probabilities)
preds = cross_val_predict(et, X, y, cv=skf, method='predict_proba')

# evaluating the model: multi-class one-vs-one AUC, computed once and reused
auc = roc_auc_score(y, preds, multi_class='ovo')
print('Area under the ROC Curve:', auc)
lgbm_performance[feature_set] = auc







Area under the ROC Curve: 0.697399262925428




## Retweet + Linguistic + Hashtag

In [228]:
feature_set = 'rt_use_ht'

In [229]:
# Select the labelled rows before stacking so the full dense stack is never materialised.
X = np.hstack([X_use[mask], X_ht[mask], X_rt[mask]])
y = y_orig[mask]

from lightgbm import LGBMClassifier

et = LGBMClassifier(n_estimators=100, min_data_in_leaf=500,colsample_bytree=0.8, class_weight='balanced', n_jobs=-1)

# validation instance
# Seeded so the shuffled fold assignment, and hence the reported AUC, is reproducible.
# NOTE(review): 2 folds here versus 4 in the single-feature-set cells, so the scores
# are not strictly comparable across feature sets.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# getting the model validation predictions (out-of-fold class probabilities)
preds = cross_val_predict(et, X, y, cv=skf, method='predict_proba')

# evaluating the model: multi-class one-vs-one AUC, computed once and reused
auc = roc_auc_score(y, preds, multi_class='ovo')
print('Area under the ROC Curve:', auc)
lgbm_performance[feature_set] = auc







Area under the ROC Curve: 0.9346886692236618




In [230]:
lgbm_performance

{'use': 0.9530879124346621,
 'ht': 0.7112458416441328,
 'rt': 0.5005678868700713,
 'ht_use': 0.9327524344815968,
 'rt_use': 0.9313122589525502,
 'rt_ht': 0.697399262925428,
 'rt_use_ht': 0.9346886692236618}