In [9]:
from dotenv import load_dotenv
import os
import pandas as pd
# Called before BASEDIR is read below; presumably this populates the
# environment from a local .env file (python-dotenv) -- keep it ahead of
# os.getenv and of the project import that follows.
load_dotenv()
from gdtm.helpers.common import load_flat_dataset
# Root directory of the project data tree; every path in this notebook is
# built relative to it.
base_dir = os.getenv('BASEDIR')
if base_dir is None:
    # Without this check an unset variable only surfaces later as an opaque
    # TypeError inside os.path.join(None, ...).
    raise RuntimeError(
        "Environment variable BASEDIR is not set; define it in the "
        "environment or in the .env file before running this notebook."
    )
import re
import pickle as pk

In [10]:
from tqdm import tqdm
# Enable tqdm's pandas integration; the groupby(...).progress_apply(...) calls
# in the aggregation cells below depend on this having been run.
tqdm.pandas()

### Qanda

In [11]:
# Load the raw Qanda post export; every column is read as a string.
qanda_csv = os.path.join(base_dir, 'data', '01_raw_data', 'qanda', 'qanda_episodes.csv')
data = pd.read_csv(qanda_csv, dtype=str)

# Multi-valued columns are stored as ';;;'-delimited strings; turn each cell
# into a list. A missing value becomes [''] (the split of an empty string).
for list_col in ('hashtags', 'mentions', 'urls'):
    data[list_col] = data[list_col].fillna('').apply(lambda joined: joined.split(';;;'))

url_re = re.compile(r'http\S+')
hashtag_re = re.compile("#[A-Za-z0-9_]+")
mention_re = re.compile(r'@\S+')

# Placeholder variant of the text: URLs then hashtags are replaced by tokens.
# It is derived from the untouched text, so it must be built before 'text'
# is overwritten below.
data['text_ht_censored'] = data['text'].apply(
    lambda post: hashtag_re.sub('<HASHTAG>', url_re.sub('<URL>', post))
)

# Cleaned text: URLs, hashtags and @-mentions are removed, in that order.
data['text'] = data['text'].apply(
    lambda post: mention_re.sub('', hashtag_re.sub('', url_re.sub('', post)))
)

# Where 'rid' is missing, fall back to the row's own 'tid'.
data['rid'] = data['rid'].combine_first(data['tid'])

In [12]:
# Persist the cleaned per-post Qanda frame as a pickle.
qanda_per_post_path = os.path.join(base_dir, 'data', '01_raw_data', 'qanda', 'qanda_per_post.pk')
with open(qanda_per_post_path, 'wb') as out_file:
    pk.dump(data, out_file)

In [13]:
def _merge_episode_posts(posts):
    """Collapse one (uid, episode) group of posts into a single-row DataFrame.

    The texts are joined with single spaces, the per-post hashtag and url
    lists are flattened with empty strings dropped, and the 'rid' values are
    collected into one list.
    """
    flat_hashtags = [tag for tags in posts['hashtags'] for tag in tags if tag != '']
    flat_urls = [url for urls in posts['urls'] for url in urls if url != '']
    return pd.DataFrame({
        'text': ' '.join(posts['text']),
        'hashtags': [flat_hashtags],
        'rid': [list(posts['rid'])],
        'urls': [flat_urls],
    })


# One aggregated row per user and episode (progress bar via tqdm.pandas()).
data_per_episode = data.groupby(['uid', 'episode']).progress_apply(_merge_episode_posts)

100%|██████████| 275032/275032 [02:25<00:00, 1887.18it/s]


In [14]:
# Persist the per-(user, episode) aggregation as a pickle.
qanda_per_episode_path = os.path.join(base_dir, 'data', '01_raw_data', 'qanda', 'qanda_per_episode.pk')
with open(qanda_per_episode_path, 'wb') as out_file:
    pk.dump(data_per_episode, out_file)

In [15]:
def _merge_qanda_user_posts(posts):
    """Collapse all Qanda posts of one user into a single-row DataFrame.

    The texts are joined with single spaces, the per-post hashtag and url
    lists are flattened with empty strings dropped, and the 'rid' values are
    collected into one list.
    """
    flat_hashtags = [tag for tags in posts['hashtags'] for tag in tags if tag != '']
    flat_urls = [url for urls in posts['urls'] for url in urls if url != '']
    return pd.DataFrame({
        'text': ' '.join(posts['text']),
        'hashtags': [flat_hashtags],
        'rid': [list(posts['rid'])],
        'urls': [flat_urls],
    })


# One aggregated row per user (progress bar via tqdm.pandas()).
data_per_user = data.groupby('uid').progress_apply(_merge_qanda_user_posts)

100%|██████████| 100114/100114 [00:53<00:00, 1883.24it/s]


In [16]:
# Persist the per-user Qanda aggregation as a pickle.
qanda_per_user_path = os.path.join(base_dir, 'data', '01_raw_data', 'qanda', 'qanda_per_user.pk')
with open(qanda_per_user_path, 'wb') as out_file:
    pk.dump(data_per_user, out_file)

### Ausvotes

In [17]:
# Load the raw Ausvotes export. The file has no header row, so column names
# are supplied here; every column is read as a string.
ausvotes_csv = os.path.join(base_dir, 'data', '01_raw_data', 'ausvotes', 'ausvotes.csv')
data = pd.read_csv(ausvotes_csv, dtype=str, header=None, names=['tid','cid','uid','created_at','text','urls', 'urls2'])

# Drop posts without text first (the regex clean-up below cannot handle NaN),
# and take an explicit copy so the column assignments that follow operate on
# an independent frame rather than on a slice of the raw one.
data = data[~data.text.isna()].copy()

# Both URL columns are ';;;'-delimited strings; split each into a list and
# concatenate the two lists row by row. Empty-string entries are filtered out
# later, at aggregation time.
data['urls'] = data['urls'].fillna('').apply(lambda s: s.split(';;;')) + data['urls2'].fillna('').apply(lambda s: s.split(';;;'))

# Cleaned text: URLs, hashtags and @-mentions are removed, in that order.
data['text'] = data['text'].apply(lambda t: re.sub(r'http\S+', '', t)).apply(lambda t: re.sub("#[A-Za-z0-9_]+","", t)).apply(lambda t: re.sub(r'@\S+', '', t))

# Where 'cid' is missing, fall back to the row's own 'tid'.
data['cid'] = data['cid'].combine_first(data['tid'])

In [18]:
# Persist the cleaned per-post Ausvotes frame as a pickle.
ausvotes_per_post_path = os.path.join(base_dir, 'data', '01_raw_data', 'ausvotes', 'ausvotes_per_post.pk')
with open(ausvotes_per_post_path, 'wb') as out_file:
    pk.dump(data, out_file)

In [19]:
def _merge_ausvotes_user_posts(posts):
    """Collapse all Ausvotes posts of one user into a single-row DataFrame.

    The texts are joined with single spaces, the 'cid' values are collected
    into one list under the 'rid' column, and the per-post url lists are
    flattened with empty strings dropped.
    """
    flat_urls = [url for urls in posts['urls'] for url in urls if url != '']
    return pd.DataFrame({
        'text': ' '.join(posts['text']),
        'rid': [list(posts['cid'])],
        'urls': [flat_urls],
    })


# One aggregated row per user (progress bar via tqdm.pandas()).
data_per_user = data.groupby('uid').progress_apply(_merge_ausvotes_user_posts)

100%|██████████| 265350/265350 [02:16<00:00, 1943.76it/s]


In [20]:
# Persist the per-user Ausvotes aggregation as a pickle.
ausvotes_per_user_path = os.path.join(base_dir, 'data', '01_raw_data', 'ausvotes', 'ausvotes_per_user.pk')
with open(ausvotes_per_user_path, 'wb') as out_file:
    pk.dump(data_per_user, out_file)

### Social Sense

In [27]:
# Load the Facebook export and keep only the columns used below.
fb_csv = os.path.join(base_dir, 'data', '01_raw_data', 'socialsense', 'fb_for_stance.csv')
fb_raw = pd.read_csv(fb_csv, dtype=str)[['index', 'text', 'text_urls', 'timestamp', 'user']]

# 'text_urls' is a comma-separated string; a missing value becomes [''].
fb_urls = fb_raw['text_urls'].fillna('').apply(lambda joined: joined.strip().split(','))

# Cleaned text: URLs, hashtags and @-mentions are removed, in that order.
fb_text = fb_raw['text'].apply(
    lambda post: re.sub(r'@\S+', '', re.sub("#[A-Za-z0-9_]+", "", re.sub(r'http\S+', '', post)))
)

# Assemble the frame in the column layout shared with the Twitter data.
fb_data = pd.DataFrame({
    'id': fb_raw['index'],
    'text': fb_text,
    'urls': fb_urls,
    'uid': fb_raw['user'],
    'created_at': fb_raw['timestamp'],
    'is_twitter': False,
})

In [28]:
# Lookup table from post id to user id. The CSV has no header row, so column
# names are supplied here; only 'id' and 'uid' are kept.
id_uid_csv = os.path.join(base_dir, 'data', '01_raw_data', 'socialsense', 'socialsense_id_uid_index.csv')
id_uid_index = pd.read_csv(id_uid_csv, dtype=str, header=None, names=['id', 'cid','uid'])
id_uid_index = id_uid_index[['id', 'uid']]

In [29]:
# Load the Twitter export, attach the user id for each post id via a left
# join on 'id', then derive the cleaned columns.
tw_csv = os.path.join(base_dir, 'data', '01_raw_data', 'socialsense', 'tw_for_stance_unrolled.csv')
tw_data = (
    pd.read_csv(tw_csv, dtype=str)[['id', 'text', 'text_urls_unrolled', 'date']]
    .merge(id_uid_index, how='left', on='id')
    .assign(
        # 'text_urls_unrolled' is a comma-separated string; missing -> [''].
        urls=lambda frame: frame['text_urls_unrolled'].fillna('').apply(
            lambda joined: joined.strip().split(',')
        ),
        # URLs, hashtags and @-mentions are removed, in that order.
        text=lambda frame: frame['text'].apply(
            lambda post: re.sub(r'@\S+', '', re.sub("#[A-Za-z0-9_]+", "", re.sub(r'http\S+', '', post)))
        ),
        created_at=lambda frame: frame['date'],
        is_twitter=True,
    )
)

# Keep the column layout shared with the Facebook data.
tw_data = tw_data[['id', 'text','urls', 'uid', 'created_at', 'is_twitter']]

In [30]:
# Stack the Twitter and Facebook posts into one frame (Twitter rows first).
# NOTE(review): ignore_index is not set, so each source keeps its own index
# labels and labels may repeat across the two sources -- confirm that nothing
# downstream selects rows by index label.
data = pd.concat([tw_data, fb_data])

In [32]:
# Persist the combined per-post Social Sense frame as a pickle.
socialsense_per_post_path = os.path.join(base_dir, 'data', '01_raw_data', 'socialsense', 'socialsense_per_post.pk')
with open(socialsense_per_post_path, 'wb') as out_file:
    pk.dump(data, out_file)

In [38]:
# Keep only posts that have a user id; rows with a missing 'uid' cannot be
# attributed to a user in the per-user aggregation.
has_uid = data['uid'].notna()
data_filtered = data[has_uid]

In [39]:
def _merge_socialsense_user_posts(posts):
    """Collapse all Social Sense posts of one user into a single-row DataFrame.

    The texts are joined with single spaces and the per-post url lists are
    flattened with empty strings dropped.
    """
    flat_urls = [url for urls in posts['urls'] for url in urls if url != '']
    return pd.DataFrame({
        'text': ' '.join(posts['text']),
        'urls': [flat_urls],
    })


# One aggregated row per user (progress bar via tqdm.pandas()).
data_per_user = data_filtered.groupby('uid').progress_apply(_merge_socialsense_user_posts)

100%|██████████| 49442/49442 [00:19<00:00, 2486.97it/s]


In [40]:
# Persist the per-user Social Sense aggregation as a pickle.
socialsense_per_user_path = os.path.join(base_dir, 'data', '01_raw_data', 'socialsense', 'socialsense_per_user.pk')
with open(socialsense_per_user_path, 'wb') as out_file:
    pk.dump(data_per_user, out_file)