# Features 

## Content-agnostic features
* User statistics: Days since join, favourites count, follower count, friends count, statuses count, user verified
* User profile description: SBERT encoding
* User following: Select top 50 accounts from each group, and get the union set of users to create a feature vector (number of dimensions is 95 in our case) 


## Content features
* Media: Using the URLs in users' tweets, create a vector (the number of dimensions is 14: 7 bias categories and 7 factuality categories — 'extremeleft','left','leftcenter','center','rightcenter','right','extremeright','very high','high','mostly factual','mixed','low','very low','questionable source')
* NELA: https://github.com/BenjaminDHorne/Language-Features-for-News
* LIWC: No code for LIWC as I used the LIWC software. 
* Tweets: SBERT encoding 


## SBERT encoding 

For each user, we randomly sampled N tweets, where N is the median number of tweets per user.

The input file needs to have at least two fields: 'user_id' and 'text' (tweet).

We encode each tweet with SBERT and get the mean values to represent each user. 

In [None]:
import pandas as pd 
from sentence_transformers import SentenceTransformer
# Load the pretrained SBERT checkpoint once; the encoding cell below calls `model.encode` on it.
sbert_checkpoint = 'paraphrase-distilroberta-base-v1'
model = SentenceTransformer(sbert_checkpoint)

In [None]:
# Header row of the SBERT output file: the user key followed by one column
# per embedding dimension, named s0 ... s767.
headers = ["userid", *(f"s{dim}" for dim in range(768))]

In [None]:
# NOTE: supply your own input file. It needs at least two columns: 'user_id' and
# 'text' (the tweet). Each user contributes a sample of tweets.
import os

inputfilename = "YOUR_SAMPLED_TWEETS.tsv"
out_dir = "user_features/sbert_precovid"
outfilename = f'{out_dir}/userid2sbert_emd_YOUR_SAMPLED_TWEETS.tsv'

df = pd.read_table(inputfilename, sep="\t")
print(df.shape)
user_ids = df['user_id'].unique()
print(len(user_ids))

# Create the output directory up front so open() below does not fail when it is missing.
os.makedirs(out_dir, exist_ok=True)

with open(outfilename, "w") as output:
    output.write("\t".join(headers)+"\n")

    # Group once instead of re-filtering the whole table for every user.
    # sort=False keeps users in order of first appearance (same order as `user_ids`);
    # rows whose user_id is missing are not part of any group and are skipped.
    for each_id, df_tmp in df.groupby('user_id', sort=False):
        # Encode every sampled tweet of this user, then average the tweet
        # embeddings column-wise to get a single vector for the user.
        sentence_embeddings = model.encode(list(df_tmp['text']))
        mean_embedding = sentence_embeddings.mean(axis=0)

        emb_feat = "\t".join([str(each) for each in mean_embedding])

        output.write(f'{each_id}\t{emb_feat}\n')

## NELA feature

In [None]:
from nela_features.nela_features import NELAFeatureExtractor
import pandas as pd
import numpy as np

In [None]:
# Shared NELA extractor instance; the cells below call its extract_* methods.
nela = NELAFeatureExtractor()

In [None]:
## Collect the column names for the header of the output feature file.
## A short sample article is pushed through each extractor only to read back
## the feature names it reports; the feature values themselves are not used here.
newsarticle = "Breaking News: Ireland Expected To Become World's First Country To Divest From Fossil Fuels ..."
all_feature_names = []
for extractor_name in ("extract_style", "extract_complexity", "extract_bias", "extract_affect", "extract_moral"):
    feature_vector, feature_names = getattr(nela, extractor_name)(newsarticle)
    all_feature_names.extend(feature_names)

In [None]:
def compute_nela(text):
    """Return the NELA feature values for ``text`` as one flat list.

    The style, complexity, bias, affect and moral extractors of the shared
    ``nela`` object are run in that order and their feature vectors are
    concatenated. The feature names each extractor also returns are ignored.
    """
    combined = []
    for group in ("style", "complexity", "bias", "affect", "moral"):
        values, _names = getattr(nela, "extract_" + group)(text)
        combined.extend(values)
    return combined

In [None]:
# NOTE: supply your own input file. It needs at least two columns: 'user_id' and
# 'text' (the tweet). Each user contributes a sample of tweets.
inputfilename = "YOUR_SAMPLED_TWEETS.tsv"

df = pd.read_table(inputfilename, sep="\t")
print(df.shape)
user_ids = df['user_id'].unique()
print(len(user_ids))

cnt_error = 0  # number of tweets for which feature extraction failed
outfilename = "userid2language_nela_features_YOUR_SAMPLED_TWEETS.tsv"
with open(outfilename, "w") as output:
    output.write("user_id\t%s\n" % ("\t".join(all_feature_names)))

    # Group once instead of re-filtering the whole table for every user.
    # sort=False keeps users in order of first appearance (same order as `user_ids`);
    # rows whose user_id is missing are not part of any group and are skipped.
    for each_id, df_tmp in df.groupby('user_id', sort=False):
        list_vecs = []
        for text in df_tmp['text']:
            try:
                myvec = compute_nela(text)
            except Exception:
                # Best effort: a tweet the extractor cannot handle is counted and
                # skipped. Catching Exception (not a bare except) keeps Ctrl-C working.
                cnt_error += 1
                continue
            list_vecs.append(np.array(myvec))

        if list_vecs:
            myvec_mean = np.array(list_vecs).mean(axis=0)
        else:
            # Every tweet of this user failed. Averaging an empty array gives a
            # scalar NaN that cannot be iterated below, so emit an explicit
            # all-NaN row to keep one output row per user.
            myvec_mean = np.full(len(all_feature_names), np.nan)

        mystr = "\t".join([str(each) for each in myvec_mean])
        output.write("%s\t%s\n" % (each_id, mystr))
print("cnt_error=", cnt_error)


## Media feature 


In [None]:
import pandas as pd
from tld import get_tld


#### Loading the media bias and factuality data

In [None]:
# Read the tab-separated media bias / factuality table and preview its size and first rows.
mbfc_path = "./mbfc_final_jisun_20200719.tsv"
df_mbfc = pd.read_table(mbfc_path)
print(df_mbfc.shape)
df_mbfc.head()

In [None]:
# Lookup tables from a source's domain (the `.fld` of its URL) to its bias
# label and its factuality label, built from the df_mbfc table.
domain2bias = {}
domain2factual = {}

# The columns are read outside the try block so that a missing or renamed
# column fails loudly instead of silently leaving both dictionaries empty.
for source_url, bias, factual in zip(df_mbfc['source_url'], df_mbfc['bias'], df_mbfc['factual']):
    try:
        domain = get_tld(source_url, as_object=True).fld
    except Exception:
        # Best effort: rows whose URL cannot be parsed into a domain are skipped.
        continue

    domain2bias[domain] = bias
    domain2factual[domain] = factual


#### Map URLs to bias and factuality
1. Input file: expanded_urls_YOUR_FILE.tsv - an input file with the following fields: user_screen_name, tweet_id, url, expanded_url
2. Output file: expanded_urls_domain_bias_factual_YOUR_FILE.tsv - user_screen_name, tweet_id, domain, bias, factual

In [None]:
# Annotate every expanded URL whose domain is known in domain2bias with that
# domain's bias and factuality labels; all other URLs are dropped.
cnt_malformed = 0  # input lines that do not have exactly four tab-separated fields
with open("expanded_urls_YOUR_FILE.tsv") as fi, open("expanded_urls_domain_bias_factual_YOUR_FILE.tsv", "w") as output:
    output.write("\t".join(['user_screen_name', 'tweet_id', 'domain', 'bias', 'factual'])+"\n")

    for line in fi:
        fields = [term.strip() for term in line.split("\t")]
        if len(fields) != 4:
            # A blank or truncated line would otherwise abort the whole run on unpacking.
            cnt_malformed += 1
            continue
        user_screen_name, tweet_id, url, expanded_url = fields

        if 'twitter.com' in expanded_url: ## it's easier to skip Twitter urls.
            continue

        try:
            domain = get_tld(expanded_url, as_object=True).fld
        except Exception:
            # Best effort: URLs that cannot be parsed into a domain are skipped.
            continue

        bias = domain2bias.get(domain)
        factual = domain2factual.get(domain)
        # Unknown domains and domains with a missing (non-string) label are
        # skipped; only fully labelled URLs are written.
        if not isinstance(bias, str) or not isinstance(factual, str):
            continue

        output.write("\t".join([user_screen_name, tweet_id, domain, bias, factual])+"\n")
print("cnt_malformed=", cnt_malformed)


#### Create a media feature vector per user

In [None]:
# Build one row per user: how many of the user's labelled URLs fall into each
# bias category, followed by how many fall into each factuality category.
bias_labels = ["extremeleft", "left", "leftcenter", "center", "rightcenter", "right", "extremeright"]
factual_labels = ["very high", "high", "mostly factual", "mixed", "low", "very low", "questionable source"]

df_url = pd.read_csv("expanded_urls_domain_bias_factual_YOUR_FILE.tsv", sep="\t")
print("df_url=", df_url.shape)

# The URL-mapping cell writes the account under 'user_screen_name', so a plain
# df_url['user_id'] lookup raises KeyError on that file. Use 'user_id' when the
# file has it and fall back to 'user_screen_name' otherwise.
id_column = "user_id" if "user_id" in df_url.columns else "user_screen_name"
user_ids = df_url[id_column].unique()
print(len(user_ids))

with open("urls_by_bias_precovid_YOUR_FILE.tsv", "w") as output:
    # The first output column keeps the name "user_id" whichever input column supplied it.
    output.write("\t".join(["user_id"] + bias_labels + factual_labels)+"\n")

    # Group once instead of re-filtering the whole table for every user.
    # sort=False keeps users in order of first appearance (same order as `user_ids`);
    # rows whose identifier is missing are not part of any group and are skipped.
    for each_id, df_url_tmp in df_url.groupby(id_column, sort=False):
        dict_bias = df_url_tmp['bias'].value_counts().to_dict()
        dict_factual = df_url_tmp['factual'].value_counts().to_dict()

        # Categories the user never shared get an explicit 0.
        result_bias = [str(dict_bias.get(each, 0)) for each in bias_labels]
        result_factual = [str(dict_factual.get(each, 0)) for each in factual_labels]

        myresult = [str(each_id)] + result_bias + result_factual
        output.write("\t".join(myresult)+"\n")
