In [1]:
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()
from gdtm.helpers.common import load_flat_dataset

# Project root used to build every data path below; read from the BASEDIR
# environment variable (load_dotenv() is called above). None when unset.
base_dir = os.getenv('BASEDIR')

# True -> run the pipeline on the Twitter frame, False -> on the Facebook frame.
is_twitter = True

In [2]:
from tqdm import tqdm
tqdm.pandas()

In [3]:
import pandas as pd
import os 
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub

KeyboardInterrupt: 

In [None]:
# Read the complete Facebook export with every column kept as a string.
# NOTE(review): the following cell re-reads the same file and overwrites
# fb_data, so this cell looks redundant -- confirm before removing.
fb_csv_path = os.path.join(base_dir,'data','00_pio','fb_for_stance.csv')
fb_data = pd.read_csv(fb_csv_path, dtype=str)

In [None]:
# Load only the columns needed for stance modelling from the Facebook export.
fb_data = pd.read_csv(os.path.join(base_dir,'data','00_pio','fb_for_stance.csv'), dtype=str)[['index','text','text_urls']]

# Comma-separated URL string -> list of URLs (a missing value becomes ['']).
fb_data['urls'] = fb_data['text_urls'].fillna('').apply(lambda s: s.strip().split(','))

# With dtype=str an empty text cell is still read as NaN (a float), which
# makes re.sub raise TypeError; replace it with '' before cleaning.
# Cleaning order: strip URLs, then hashtags, then @-mentions.
fb_data['text'] = (
    fb_data['text']
    .fillna('')
    .apply(lambda t: re.sub(r'http\S+', '', t))
    .apply(lambda t: re.sub("#[A-Za-z0-9_]+","", t))
    .apply(lambda t: re.sub(r'@\S+', '', t))
)

# Expose the 'index' column under the same 'id' name the Twitter frame uses.
fb_data['id'] = fb_data['index']

fb_data = fb_data[['id', 'text','urls']]

In [None]:
# Header-less lookup table with three columns (id, cid, uid); only the
# id -> uid pairs are kept.
id_uid_path = os.path.join(base_dir,'data','00_pio','socialsense_id_uid_index.csv')
id_uid_columns = ['id', 'cid','uid']
id_uid_index = pd.read_csv(id_uid_path, dtype=str, header=None, names=id_uid_columns)[['id','uid']]

In [None]:
# Load only the columns needed for stance modelling from the Twitter export.
tw_data = pd.read_csv(os.path.join(base_dir,'data','00_pio','tw_for_stance_unrolled.csv'), dtype=str)[['id', 'text','text_urls_unrolled']]

# Attach the uid for each id; a left join keeps rows without a match.
# NOTE(review): duplicate ids in id_uid_index would duplicate rows here -- confirm
# that the lookup table is unique on 'id'.
tw_data = pd.merge(tw_data,id_uid_index, how='left', on='id')

# Comma-separated URL string -> list of URLs (a missing value becomes ['']).
tw_data['urls'] = tw_data['text_urls_unrolled'].fillna('').apply(lambda s: s.strip().split(','))

# With dtype=str an empty text cell is still read as NaN (a float), which
# makes re.sub raise TypeError; replace it with '' before cleaning.
# Cleaning order: strip URLs, then hashtags, then @-mentions.
tw_data['text'] = (
    tw_data['text']
    .fillna('')
    .apply(lambda t: re.sub(r'http\S+', '', t))
    .apply(lambda t: re.sub("#[A-Za-z0-9_]+","", t))
    .apply(lambda t: re.sub(r'@\S+', '', t))
)

tw_data = tw_data[['id', 'text','urls', 'uid']]

In [None]:
# Pick the frame the rest of the notebook operates on.
# Note: this binds a second name to the same DataFrame, it does not copy it.
data = tw_data if is_twitter else fb_data

### URL Ground Truth

In [None]:
import tldextract
def extract_domain(url):
    """Return the ``domain.suffix`` part of ``url`` as parsed by tldextract.

    Empty components are skipped when joining, so a URL without a recognised
    suffix yields just its domain part and an empty string yields ``''``
    (instead of a dangling ``'.'``).

    Parameters
    ----------
    url : str
        URL or host name to parse.

    Returns
    -------
    str
        The non-empty ``domain`` and ``suffix`` components joined by ``'.'``.
    """
    ext = tldextract.extract(url)
    return '.'.join(part for part in (ext.domain, ext.suffix) if part)

In [None]:
# For every row, turn its list of URLs into the matching list of domains.
data_domains = data['urls'].apply(
    lambda url_list: list(map(extract_domain, url_list))
)

In [None]:
## Is URL ideology correlated with Hashtag ideology?
# Load the Reuters 2020 AU news table, drop rows that have no domain and
# index the remainder by domain so scores can be looked up with .loc.
reuters_path = os.path.join(base_dir,'data','01_raw','reuters_2020_au_news_lr.csv')
reuters_raw = pd.read_csv(reuters_path)
has_domain = ~reuters_raw['domain'].isna()
reuters_domain_ideology = reuters_raw[has_domain].set_index('domain')

In [None]:
# Re-centre the 'lr' column so that abc.net.au sits at exactly 0; every other
# domain is then expressed relative to it.
# NOTE(review): assumes 'abc.net.au' appears exactly once in the index -- confirm.
abc_lr = reuters_domain_ideology.loc['abc.net.au','lr']
reuters_domain_ideology['lr'] = reuters_domain_ideology['lr'] - abc_lr

In [None]:
# def get_domain_ideology_reuters(domain):
#     try:
#         return(reuters_domain_ideology.loc[domain].lr)
#     except Exception as e:
#         return(np.nan)
    
# users_with_url_ideology_gt = data_domains.apply(lambda l: np.nanmean([get_domain_ideology_reuters(e) for e in l])).apply(handle_labels)

In [None]:
# AllSides media-bias table, reduced to source name, bias rating and site URL.
allsides_path = os.path.join(base_dir,'data','02_processed','all_allsides_media_bias.csv')
allsides_columns = ['News Source','AllSides Bias Rating','News Source Site']
allsides_df = pd.read_csv(allsides_path)[allsides_columns]

In [None]:
from urllib.parse import urlparse
def strip_scheme(url):
    """Return ``url`` without its leading ``scheme://`` prefix.

    The URL is round-tripped through ``urlparse(...).geturl()`` first, so the
    scheme is lower-cased before it is removed. Only a prefix at the very
    start of the URL is stripped: a scheme-less URL (or one whose scheme is
    not followed by ``//``, such as ``mailto:``) is returned as-is, so a
    ``://`` that merely appears later -- e.g. inside a query string -- is never
    touched.

    Parameters
    ----------
    url : str
        Absolute or scheme-less URL.

    Returns
    -------
    str
        The URL with the leading ``scheme://`` removed when present.
    """
    parsed = urlparse(url)
    normalized = parsed.geturl()
    if not parsed.scheme:
        # No scheme was parsed, so there is no prefix to strip.
        return normalized
    prefix = "%s://" % parsed.scheme
    if normalized.startswith(prefix):
        return normalized[len(prefix):]
    return normalized

In [None]:
# Numeric stance assigned to each bias-rating label (negative = left,
# positive = right). A label missing from this table raises KeyError below.
rating_to_stance = {'left center':-0.5, 'left':-1.0,'center':0.0,'right center':0.5, 'right':1.0}

# Drop rows rated 'allsides' and rows without a site URL in a single mask.
keep_rows = (allsides_df['AllSides Bias Rating'] != 'allsides') & ~allsides_df['News Source Site'].isna()
allsides_df = allsides_df[keep_rows]

allsides_df['stance'] = allsides_df['AllSides Bias Rating'].progress_apply(lambda rating: rating_to_stance[rating])
# Scheme-less site URL, used later for prefix matching against full URLs.
allsides_df['url'] = allsides_df['News Source Site'].apply(strip_scheme)

In [None]:
def get_news_source(full_url):
    """Return the AllSides news-source name whose site URL prefixes ``full_url``.

    ``full_url`` is stripped of its scheme once and compared, in table order,
    against the scheme-less ``allsides_df['url']`` values; the first entry
    that is a prefix of it wins.

    Parameters
    ----------
    full_url : str
        URL to look up.

    Returns
    -------
    The matching ``'News Source'`` value, or ``None`` when nothing matches.
    """
    # Loop-invariant: strip the scheme once instead of once per table row.
    stripped = strip_scheme(full_url)
    for start_url, news_source in zip(allsides_df['url'],allsides_df['News Source']):
        if stripped.startswith(start_url):
            return news_source
    return None

def get_allsides_stance(full_url):
    """Return the AllSides stance of the first source whose URL prefixes ``full_url``.

    ``full_url`` is stripped of its scheme once and compared, in table order,
    against the scheme-less ``allsides_df['url']`` values.

    Parameters
    ----------
    full_url : str
        URL to look up.

    Returns
    -------
    The ``'stance'`` value of the first matching row.

    Raises
    ------
    Exception
        If no row of ``allsides_df`` matches (callers rely on catching this).
    """
    # Loop-invariant: strip the scheme once instead of once per table row.
    stripped = strip_scheme(full_url)
    for start_url, stance in zip(allsides_df['url'],allsides_df['stance']):
        if stripped.startswith(start_url):
            return stance
    raise Exception('URL not in allsides dataset')

In [None]:
def get_url_ideology(full_url):
    """Return an ideology score for ``full_url``, or NaN when none is known.

    Lookup order:
      1. the ``lr`` value of the URL's domain in ``reuters_domain_ideology``;
      2. the AllSides stance obtained from ``get_allsides_stance``.
    Any exception raised by a lookup is treated as "not found" and the next
    source is tried; ``np.nan`` is returned when both fail.
    """
    domain = extract_domain(full_url)

    # First choice: the Reuters table, keyed by domain.
    try:
        return reuters_domain_ideology.loc[domain].lr
    except Exception:
        pass

    # Fallback: prefix match against the AllSides table.
    try:
        return get_allsides_stance(full_url)
    except Exception:
        return np.nan

In [None]:
def get_ideology_source(full_url):
    """Report which table would supply the ideology score for ``full_url``.

    Returns ``'Australian'`` when the URL's domain is found in
    ``reuters_domain_ideology``, ``'American'`` when ``get_allsides_stance``
    succeeds instead, and ``np.nan`` when both lookups raise.
    """
    domain = extract_domain(full_url)

    # The looked-up values are discarded; only success or failure matters.
    try:
        reuters_domain_ideology.loc[domain].lr
    except Exception:
        pass
    else:
        return 'Australian'

    try:
        get_allsides_stance(full_url)
    except Exception:
        return np.nan
    return 'American'

In [None]:
def handle_labels(s):
    """Map a numeric score to an integer class label.

    NaN -> -1, exactly zero -> 0, negative -> 1, positive -> 2.
    """
    if np.isnan(s):
        return -1
    if s == 0:
        return 0
    return 2 if s > 0 else 1

In [None]:
# Per row: average the ideology scores of its URLs while ignoring NaNs, then
# convert that mean into a class label with handle_labels.
mean_url_ideology = data['urls'].progress_apply(
    lambda url_list: np.nanmean([get_url_ideology(url) for url in url_list])
)
url_ideology = mean_url_ideology.apply(handle_labels)

In [None]:
# url_ideology = users_with_url_ideology_gt.combine_first(allside_ideology_gt)

In [None]:
# Collect the ground-truth labels in a frame; 'url_gt' holds the URL-derived label.
ground_truth_columns = {'url_gt': url_ideology}
ground_truths = pd.DataFrame(ground_truth_columns)

In [None]:
# The training label is the URL-derived label, copied under a generic name.
ground_truths['gt_label'] = ground_truths['url_gt']

### Generate Embeddings

In [None]:
# Load the Universal Sentence Encoder (v4) module from TF Hub.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
    """Run the loaded TF Hub ``model`` on ``input`` and return its output."""
    embeddings = model(input)
    return embeddings

In [None]:
def batched_embed(l_text, chunk_size=10):
    """Embed ``l_text`` in consecutive batches and stack the results row-wise.

    Parameters
    ----------
    l_text : list
        Items to embed; consecutive slices of it are passed to ``embed``.
    chunk_size : int, default 10
        Number of items sent to ``embed`` per call.

    Returns
    -------
    numpy.ndarray
        The per-batch outputs (converted with ``.numpy()``) stacked with
        ``np.vstack``, in input order.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1, or (from ``np.vstack``) if
        ``l_text`` is empty and there is nothing to stack.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    all_embeddings = []
    for start in tqdm(range(0, len(l_text), chunk_size)):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        chunk = l_text[start:start + chunk_size]
        all_embeddings.append(embed(chunk).numpy())
    return np.vstack(all_embeddings)

In [None]:
# Embed every cleaned text of the selected dataset, in row order.
texts = list(data['text'])
use_embeddings = batched_embed(texts)

In [None]:
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.datasets import make_classification

import os
from collections import defaultdict

In [None]:
# Cross-validated ROC AUC per feature set, keyed by the feature-set name.
lgbm_performance = {}

In [None]:
# Feature matrix for all rows (labelled and unlabelled); same object as use_embeddings.
X_use = use_embeddings
# Displayed as the cell output to check the embedding matrix dimensions.
use_embeddings.shape

In [None]:
# Labels for all rows as produced by handle_labels: -1 (NaN score), 0, 1 or 2.
y_orig = ground_truths['gt_label'].values

In [None]:
# ground_truths['url_gt'].value_counts()

In [None]:
# Boolean mask of rows that have a usable label, i.e. everything except -1.
mask = y_orig >= 0

## USE (Universal Sentence Encoder) features

In [None]:
# Key under which this feature set's score is stored in lgbm_performance.
feature_set = 'use'

In [None]:
# Training features: embeddings of the labelled rows only.
X = X_use[mask]

In [None]:
# Training labels aligned with X (values >= 0 only).
y=y_orig[mask]

In [None]:
from lightgbm import LGBMClassifier

# Fixed seed so the shuffled folds -- and therefore the reported AUC -- are the
# same on every Restart & Run All.
CV_SEED = 42

et = LGBMClassifier(n_estimators=100, min_data_in_leaf=500,colsample_bytree=0.8, class_weight='balanced', n_jobs=-1, random_state=CV_SEED)

# validation instance
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=CV_SEED)

# getting the model validation predictions (out-of-fold class probabilities)
preds = cross_val_predict(et, X, y, cv=skf, method='predict_proba')

# evaluating the model: multi-class one-vs-one AUC, computed once and reused
# (a binary problem would instead score preds[:, 1])
cv_auc = roc_auc_score(y, preds, multi_class='ovo')
print('Area under the ROC Curve:', cv_auc)

lgbm_performance[feature_set] = cv_auc

In [None]:
# Refit the classifier on every labelled row before predicting for all rows.
et.fit(X, y)

In [None]:
# Score every row (labelled or not): class probabilities and hard labels.
predicted_probs = et.predict_proba(X_use)
predicted_labels = et.predict(X_use)

In [None]:
def handle_l_labels(l):
    """Convert a class label into a signed label.

    2 -> 1, 1 -> -1, anything below 1 -> 0. Any other value (for example 3
    or 1.5) falls through and yields ``None``.
    """
    signed_label = None
    if l == 2:
        signed_label = 1
    elif l == 1:
        signed_label = -1
    elif l < 1:
        signed_label = 0
    return signed_label

In [None]:
# Store the predicted class for every row, then recode it to a signed label
# with handle_l_labels (2 -> 1, 1 -> -1, values below 1 -> 0).
data['label'] = predicted_labels
data['label'] = data['label'].apply(handle_l_labels)

In [None]:
# Signed stance score per row: zero out column 0, negate column 1, keep
# column 2, then take whichever entry has the largest magnitude.
# NOTE(review): assumes the probability columns are ordered as classes
# 0, 1, 2 -- confirm against et.classes_.
signed_probs = predicted_probs*[0,-1,1]
row_idx = range(len(predicted_probs))
col_idx = list(np.argmax(np.abs(signed_probs), axis=1))
data['stance'] = signed_probs[row_idx, col_idx]

In [None]:
# Name the output after the dataset that was actually processed, so a Facebook
# run no longer writes to (or overwrites) the Twitter result file.
output_name = 'tw_w_stance.csv' if is_twitter else 'fb_w_stance.csv'
data.to_csv(os.path.join(base_dir,'data','00_pio',output_name))