In [4]:
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()
import pickle as pk
import numpy as np
from pathlib import Path
# Root of the project data tree, taken from the BASEDIR environment variable
# (None when the variable is not set).
base_dir = os.environ.get('BASEDIR')

from tqdm import tqdm
tqdm.pandas()

In [5]:
import pandas as pd
import os 
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub

### Generate Hashtag Features

In [6]:
# Datasets to process, each mapped to the granularity suffixes used in its
# raw-data file names.
datasets = {
    'qanda': ['_per_post', '_per_episode', '_per_user'],
}

In [8]:
def dummy_fun(doc):
    """Identity function: hand back ``doc`` exactly as it was received."""
    return doc

# Build TF-IDF hashtag features for every (dataset, granularity) pair and
# pickle the resulting dense matrix.
for dataset, granularities in datasets.items():
    # Write each dataset's features under its own directory rather than a
    # hard-coded 'qanda' one, so adding a dataset cannot overwrite qanda files.
    features_dir = os.path.join(base_dir, 'data', '03_processed', dataset, 'features')

    for granularity in granularities:
        data_path = os.path.join(base_dir, 'data', '01_raw_data', dataset, dataset + granularity + '.pk')

        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(data_path, 'rb') as rf:
            data = pk.load(rf)

        data_hashtags = data['hashtags']
        # Each row is treated as an already-tokenised list of hashtags; every
        # tag is cast to str before vectorising.
        hashtag_docs = data_hashtags.apply(lambda tags: [str(tag) for tag in tags])

        # tokenizer/preprocessor are the identity function because the input is
        # already a list of tokens; token_pattern=None because a custom
        # tokenizer is supplied.
        tfidf = TfidfVectorizer(
            analyzer='word',
            tokenizer=dummy_fun,
            preprocessor=dummy_fun,
            token_pattern=None,
            min_df=10)
        ht_embeddings = tfidf.fit_transform(hashtag_docs).todense()

        Path(features_dir).mkdir(parents=True, exist_ok=True)
        out_path = os.path.join(features_dir, dataset + '_ht_' + granularity + '.pk')
        with open(out_path, 'wb') as wf:
            pk.dump(ht_embeddings, wf)

### Generate Retweet Features

In [9]:
# Datasets to process, each mapped to the granularity suffixes used in its
# raw-data file names.
datasets = {
    'qanda': ['_per_post', '_per_episode', '_per_user'],
    'ausvotes': ['_per_post', '_per_user'],
}

In [11]:
# Build binary "bag of retweeted threads" features for every
# (dataset, granularity) pair and pickle the stacked matrix.

# Number of most frequent thread ids ('rid' values) kept as feature columns.
n_threads = 1000

for dataset, granularities in datasets.items():
    features_dir = os.path.join(base_dir, 'data', '03_processed', dataset, 'features')

    # The thread ranking depends only on the post-level file, so compute it
    # once per dataset instead of once per granularity.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    post_data_path = os.path.join(base_dir, 'data', '01_raw_data', dataset, dataset + '_per_post' + '.pk')
    with open(post_data_path, 'rb') as prf:
        post_level_data = pk.load(prf)
    most_popular_threads = post_level_data['rid'].value_counts().iloc[0:n_threads].index.values
    # thread id -> feature column, replacing a scan over all threads per row.
    thread_index = {thread: col for col, thread in enumerate(most_popular_threads)}

    for granularity in granularities:
        data_path = os.path.join(base_dir, 'data', '01_raw_data', dataset, dataset + granularity + '.pk')
        with open(data_path, 'rb') as rf:
            data = pk.load(rf)

        # Granularity suffixes carry a leading underscore ('_per_post'); the
        # previous comparison against 'per_post' was always true, so per-post
        # rows never reached the equality-based branch below.
        if granularity != '_per_post':
            def to_bow(rids):
                """Mark every popular thread that occurs in ``rids``.

                Assumes ``rids`` is an iterable of hashable thread ids for
                aggregated rows -- TODO confirm against the raw data.
                """
                res = np.zeros(n_threads)
                for rid in rids:
                    col = thread_index.get(rid)
                    if col is not None:
                        res[col] = 1
                return res
        else:
            def to_bow(rid):
                """One-hot encode a single thread id; all zeros if it is not popular."""
                res = np.zeros(n_threads)
                col = thread_index.get(rid)
                if col is not None:
                    res[col] = 1
                return res

        retweet_embeddings = data['rid'].progress_apply(to_bow)
        rt_embeddings = np.vstack(retweet_embeddings.values)

        Path(features_dir).mkdir(parents=True, exist_ok=True)
        out_path = os.path.join(features_dir, dataset + '_rt_' + granularity + '.pk')
        with open(out_path, 'wb') as wf:
            pk.dump(rt_embeddings, wf)

100%|██████████| 756640/756640 [00:56<00:00, 13497.60it/s]
100%|██████████| 275032/275032 [00:26<00:00, 10441.32it/s]
100%|██████████| 100114/100114 [00:16<00:00, 6167.31it/s]
100%|██████████| 4922572/4922572 [06:10<00:00, 13292.74it/s]
100%|██████████| 265350/265350 [01:24<00:00, 3131.49it/s]


### Generate Lexical USE Embeddings

In [12]:
# TF Hub handle of the sentence encoder used for the text embeddings.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

# Load the module once; `embed` below calls this object directly.
model = hub.load(module_url)
print("module %s loaded" % module_url)
def embed(input):
    """Apply the loaded ``model`` to ``input`` and return whatever it produces."""
    encoded = model(input)
    return encoded

def batched_embed(l_text, chunk_size=10):
    """Embed ``l_text`` in consecutive chunks and stack the results row-wise.

    Parameters
    ----------
    l_text : sequence
        Sliceable sequence of inputs; each slice is passed to ``embed``.
    chunk_size : int, optional
        Number of items sent to ``embed`` per call. Defaults to 10, the value
        that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        The per-chunk ``embed(...).numpy()`` arrays stacked with ``np.vstack``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1, or (raised by ``np.vstack``) if
        ``l_text`` is empty and there is nothing to stack.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    all_embeddings = []
    for start in tqdm(range(0, len(l_text), chunk_size)):
        # Slicing clamps at the end of the sequence, so the final chunk may be
        # shorter without any explicit bound check.
        chunk = l_text[start:start + chunk_size]
        all_embeddings.append(embed(chunk).numpy())
    return np.vstack(all_embeddings)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [13]:
# Datasets to process, each mapped to the granularity suffixes used in its
# raw-data file names.
datasets = {
    'qanda': ['_per_post', '_per_episode', '_per_user'],
    'ausvotes': ['_per_post', '_per_user'],
    'socialsense': ['_per_post', '_per_user'],
}

In [15]:
# Embed the 'text' column of every (dataset, granularity) file with
# `batched_embed` and pickle the resulting matrix next to the other features.
for dataset, granularities in datasets.items():
    features_dir = os.path.join(base_dir, 'data', '03_processed', dataset, 'features')

    for granularity in granularities:
        raw_file = dataset + granularity + '.pk'
        data_path = os.path.join(base_dir, 'data', '01_raw_data', dataset, raw_file)

        with open(data_path, 'rb') as rf:
            data = pk.load(rf)
            texts = list(data['text'])
            use_embeddings = batched_embed(texts)
            # Create the output directory only once the embeddings exist.
            Path(features_dir).mkdir(parents=True, exist_ok=True)

        out_path = os.path.join(features_dir, dataset + '_use_' + granularity + '.pk')
        with open(out_path, 'wb') as wf:
            pk.dump(use_embeddings, wf)

100%|██████████| 75664/75664 [06:34<00:00, 192.01it/s]
100%|██████████| 27504/27504 [02:55<00:00, 156.68it/s]
100%|██████████| 10012/10012 [02:06<00:00, 79.22it/s]
 79%|███████▉  | 389587/492258 [34:03<09:33, 178.89it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 26535/26535 [11:33<00:00, 38.26it/s]
100%|██████████| 65499/65499 [07:15<00:00, 150.44it/s]
100%|██████████| 4945/4945 [02:03<00:00, 40.11it/s]
