In [11]:
from dotenv import load_dotenv
import os
import pandas as pd
# Load variables from a .env file into the process environment; this has to
# happen before BASEDIR is read below.
load_dotenv()
import pickle as pk
import numpy as np
from pathlib import Path
# Root directory that every data path in this notebook is joined onto.
# os.getenv returns None when BASEDIR is not set, in which case the
# os.path.join calls in later cells will fail.
base_dir = os.getenv('BASEDIR')

from tqdm import tqdm
# Registers `progress_apply` on pandas objects (used when building the
# retweet features).
tqdm.pandas()

In [12]:
import pandas as pd
import os 
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub

### Generate Hashtag Features

In [13]:
# Datasets for which hashtag TF-IDF features are generated.
datasets = [
    "qanda",
]

In [14]:
def dummy_fun(doc):
    """Identity function: hand back ``doc`` unchanged.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor``
    so that documents which are already lists of tokens are used as-is.
    """
    return doc

granularity = '_per_user'
for dataset in datasets:
    # Input: <BASEDIR>/data/01_raw_data/<dataset>/<dataset>_per_user.pk
    data_path = os.path.join(base_dir, 'data', '01_raw_data', dataset, dataset + granularity + '.pk')
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    with open(data_path, 'rb') as rf:
        data = pk.load(rf)
    # The input file is closed before the (potentially slow) TF-IDF fit.
    data_hashtags = data['hashtags']

    # Each entry of `data_hashtags` is iterated as a collection of tokens, so
    # tokenisation and preprocessing are bypassed with the identity function.
    # token_pattern=None because a custom tokenizer is supplied.
    tfidf = TfidfVectorizer(
        analyzer='word',
        tokenizer=dummy_fun,
        preprocessor=dummy_fun,
        token_pattern=None,
        min_df=10)  # ignore tokens that appear in fewer than 10 documents
    # Tokens are cast to str so mixed-type entries share one vocabulary.
    ht_embeddings = tfidf.fit_transform(
        data_hashtags.apply(lambda l: [str(s) for s in l])
    ).todense()

    # Write under the directory of the dataset being processed (previously
    # hard-coded to 'qanda', which would misplace output for any other dataset).
    features_dir = os.path.join(base_dir, 'data', '03_processed', dataset, 'features')
    Path(features_dir).mkdir(parents=True, exist_ok=True)
    # The filename keeps the existing '<dataset>_ht_' + granularity pattern
    # (note: this yields a double underscore, e.g. 'qanda_ht__per_user.pk').
    with open(os.path.join(features_dir, dataset + '_ht_' + granularity + '.pk'), 'wb') as wf:
        pk.dump(ht_embeddings, wf)

### Generate Retweet Features

In [15]:
# Datasets for which retweet bag-of-words features are generated.
datasets = [
    "qanda",
    "ausvotes",
    "riot",
]

In [17]:
granularity = '_per_user'
n_top_threads = 1000  # number of most frequent 'rid' values used as feature columns
for dataset in datasets:
    raw_dir = os.path.join(base_dir, 'data', '01_raw_data', dataset)
    data_path = os.path.join(raw_dir, dataset + granularity + '.pk')
    post_data_path = os.path.join(raw_dir, dataset + '_per_post' + '.pk')
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    with open(data_path, 'rb') as rf, open(post_data_path, 'rb') as prf:
        data = pk.load(rf)
        post_level_data = pk.load(prf)
    # Both inputs are closed before the long-running apply below.

    # value_counts() sorts by descending frequency, so this takes the (up to)
    # n_top_threads most frequent 'rid' values in the post-level data.
    most_popular_threads = post_level_data['rid'].value_counts().iloc[0:n_top_threads].index.values

    # Granularity strings carry a leading underscore ('_per_user', '_per_post');
    # the previous comparison against 'per_post' could never match, making the
    # per-post branch unreachable.
    if granularity != '_per_post':
        def to_bow(rids):
            """Binary indicator vector: 1 at position i if popular thread i is in `rids`."""
            res = np.zeros(n_top_threads)
            for i, thread in enumerate(most_popular_threads):
                if thread in rids:
                    res[i] += 1
            return res
    else:
        def to_bow(rid):
            """Binary indicator vector: 1 at position i if `rid` equals popular thread i."""
            res = np.zeros(n_top_threads)
            for i, thread in enumerate(most_popular_threads):
                if rid == thread:
                    res[i] += 1
            return res

    retweet_embeddings = data['rid'].progress_apply(to_bow)
    # Stack the per-row vectors into one (n_rows, n_top_threads) array.
    rt_embeddings = np.vstack(retweet_embeddings.values)

    features_dir = os.path.join(base_dir, 'data', '03_processed', dataset, 'features')
    Path(features_dir).mkdir(parents=True, exist_ok=True)
    with open(os.path.join(features_dir, dataset + '_rt_' + granularity + '.pk'), 'wb') as wf:
        pk.dump(rt_embeddings, wf)

100%|████████████████████████████████████████████████████████| 103074/103074 [00:23<00:00, 4407.28it/s]
100%|████████████████████████████████████████████████████████| 273874/273874 [02:30<00:00, 1817.63it/s]
100%|███████████████████████████████████████████████████████| 574281/574281 [00:53<00:00, 10773.75it/s]


### Generate Lexical USE Embeddings

In [18]:
# TF Hub handle of the Universal Sentence Encoder (version 4).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

# Fetch/load the module once; `model` is then called by `embed`.
model = hub.load(module_url)

print("module %s loaded" % module_url)
def embed(input):
    """Run the loaded sentence-encoder ``model`` on ``input`` and return its output.

    ``input`` is forwarded to the model untouched; callers in this notebook
    pass a list of strings.
    """
    embeddings = model(input)
    return embeddings

def batched_embed(l_text, chunk_size=10):
    """Embed a sequence of texts in fixed-size chunks and stack the results.

    Args:
        l_text: Sliceable sequence of texts (the notebook passes a list of str).
        chunk_size: Number of texts sent to ``embed`` per call. Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        numpy array produced by vertically stacking the per-chunk embeddings
        (presumably one row per input text, assuming ``embed`` returns one row
        per text — TODO confirm).

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1 or ``l_text`` is empty
            (nothing to stack).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be >= 1, got %r" % (chunk_size,))

    all_embeddings = []
    for i in tqdm(range(0, len(l_text), chunk_size)):
        # Slicing past the end is safe, so no explicit clamp is needed.
        chunk = l_text[i: i + chunk_size]
        emb = embed(chunk)
        all_embeddings.append(emb.numpy())

    if not all_embeddings:
        # np.vstack would raise an opaque ValueError on an empty list.
        raise ValueError("batched_embed received no texts to embed")
    return np.vstack(all_embeddings)

2022-11-08 09:21:59.260381: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-08 09:21:59.261854: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-11-08 09:21:59.263388: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-11-08 09:21:59.265034: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory
2022-11-08 09:21:59.267946: W tensorflow/stream_executor/platform/default/dso_loader.cc:64

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [19]:
# Datasets for which Universal Sentence Encoder embeddings are generated.
datasets = [
    "qanda",
    "ausvotes",
    "riot",
    "parler",
    "socialsense",
]

In [20]:
granularity = '_per_user'
for dataset in datasets:
    # Source pickle: <BASEDIR>/data/01_raw_data/<dataset>/<dataset>_per_user.pk
    raw_file = os.path.join(base_dir, 'data', '01_raw_data', dataset, dataset + granularity + '.pk')
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    with open(raw_file, 'rb') as rf:
        data = pk.load(rf)

    # Embed the 'text' column in batches with the sentence encoder.
    texts = list(data['text'])
    use_embeddings = batched_embed(texts)

    # Make sure the per-dataset feature directory exists, then persist.
    out_dir = os.path.join(base_dir, 'data', '03_processed', dataset, 'features')
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    out_file = os.path.join(out_dir, dataset + '_use_' + granularity + '.pk')
    with open(out_file, 'wb') as wf:
        pk.dump(use_embeddings, wf)

100%|████████████████████████████████████████████████████████████| 10308/10308 [02:45<00:00, 62.25it/s]
100%|████████████████████████████████████████████████████████████| 27388/27388 [21:28<00:00, 21.26it/s]
100%|███████████████████████████████████████████████████████████| 57429/57429 [05:09<00:00, 185.43it/s]
100%|███████████████████████████████████████████████████████████| 12005/12005 [01:54<00:00, 104.90it/s]
100%|██████████████████████████████████████████████████████████████| 4945/4945 [01:38<00:00, 50.30it/s]
