In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

## Download and Clean Datasets

In [8]:
# One-time download of the raw files with gdown (kept commented out; uncomment on a
# fresh checkout). Presumably these fetch archive.zip and
# reddit_corpus_unbalanced_filtered.gzip into the working directory -- TODO confirm
# which file ID maps to which file.
#!gdown 1qrrznvcHkyUPoxq4GbauPxcT3mMGILVw
#!gdown 1m_0tZXsqQSxaogvby83B5CRzgJcspFvU

# Create the destination folders first: `mv` into a directory that does not exist
# fails, and the later cells read from exactly these locations.
!mkdir -p raw_data/2020_tweets raw_data/factoid_reddit
!mv archive.zip raw_data/2020_tweets/
!mv reddit_corpus_unbalanced_filtered.gzip raw_data/factoid_reddit/

### labeled_tweets_georgetown cleanup

In [27]:
# Folder the raw stance CSVs are read from, and folder the train/dev/test CSVs are
# written to. Both keep their trailing slash because file names are appended with `+`.
raw_data_path = 'raw_data/labeled_tweets_georgetown/'
output_path = 'data/labeled_tweets_georgetown/'

def determine_stance(row):
    """Collapse a per-candidate stance label into a single signed stance value.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'label' entry and,
            for FAVOR/AGAINST labels, a 'candidate' entry.

    Returns:
        0 when the label is "NONE";
        1 for FAVOR/Trump or AGAINST/Biden;
        -1 for FAVOR/Biden or AGAINST/Trump;
        None for any other label/candidate combination.
    """
    label = row['label']
    if label == "NONE":
        return 0

    is_favor = label == "FAVOR"
    is_against = label == "AGAINST"
    if not (is_favor or is_against):
        # Unknown label: the candidate is never consulted, nothing matches.
        return None

    candidate = row['candidate']
    about_trump = candidate == "Trump"
    about_biden = candidate == "Biden"

    if (is_favor and about_trump) or (is_against and about_biden):
        return 1
    if (is_favor and about_biden) or (is_against and about_trump):
        return -1
    return None

# Load the published train/test stance files for each candidate.
df_train_biden = pd.read_csv(raw_data_path + 'biden_stance_train_public.csv')
df_test_biden = pd.read_csv(raw_data_path + 'biden_stance_test_public.csv')
df_train_trump = pd.read_csv(raw_data_path + 'trump_stance_train_public.csv')
df_test_trump = pd.read_csv(raw_data_path + 'trump_stance_test_public.csv')

# Pool the original train/test files per candidate; the data is re-split below.
# ignore_index=True gives each pooled frame unique row labels instead of the
# repeated 0..n labels that a plain concat would carry over.
df_biden = pd.concat([df_train_biden, df_test_biden], ignore_index=True)
df_trump = pd.concat([df_train_trump, df_test_trump], ignore_index=True)

# determine_stance needs to know which candidate each label refers to.
df_biden['candidate'] = 'Biden'
df_trump['candidate'] = 'Trump'

# Map (label, candidate) to a signed stance. Rows whose label is not one of
# NONE / FAVOR / AGAINST get a missing stance, because determine_stance returns
# None for them.
df_biden['stance'] = df_biden.apply(determine_stance, axis=1)
df_trump['stance'] = df_trump.apply(determine_stance, axis=1)

# Merge both candidates and drop the columns that are no longer needed once the
# stance has been derived.
df = pd.concat([df_biden, df_trump], ignore_index=True)
df = df.drop(columns=['tweet_id', 'label', 'candidate'])

# 80% train, then the remaining 20% is halved into dev and test (10% each).
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# to_csv does not create missing folders, so make sure the output folder exists.
os.makedirs(output_path, exist_ok=True)

train_df.to_csv(output_path + 'train.csv', index=False)
dev_df.to_csv(output_path + 'dev.csv', index=False)
test_df.to_csv(output_path + 'test.csv', index=False)

### factoid_reddit cleanup

In [2]:
raw_data_path = 'raw_data/factoid_reddit/'
output_path = 'data/factoid_reddit/'

# NOTE(review): the saved output of this cell is
# "TypeError: issubclass() arg 1 must be a class" with no traceback kept, so the
# failing statement cannot be identified from here -- re-run on a fresh kernel.

# SECURITY: read_pickle unpickles the file, and unpickling can execute arbitrary
# code. Only run this on a corpus file obtained from a trusted source.
df_raw = pd.read_pickle(raw_data_path + 'reddit_corpus_unbalanced_filtered.gzip', compression='gzip')

columns_to_keep = [
    'documents','pb_factor'
]

df_raw = df_raw[columns_to_keep]


def _stance_from_pb_factor(pb_factor, threshold=0.5):
    """Bucket a pb_factor score into a stance of -1, 0 or 1.

    Args:
        pb_factor: Numeric score taken from the 'pb_factor' column.
        threshold: Scores strictly below -threshold map to -1 and scores
            strictly above +threshold map to 1.

    Returns:
        -1, 1, or 0 for everything else (including NaN, which fails both
        comparisons).
    """
    if pb_factor < -threshold:
        return -1
    if pb_factor > threshold:
        return 1
    return 0


df = pd.DataFrame({
    # Keep element 1 of every tuple in 'documents' -- presumably the post text;
    # TODO confirm the tuple layout against the corpus documentation.
    "text": df_raw["documents"].apply(lambda docs: [doc[1] for doc in docs]),
    "stance": df_raw["pb_factor"].apply(_stance_from_pb_factor)
})

# One output row per text; every text inherits the stance of the row it came from.
df = df.explode("text").reset_index(drop=True)

# NOTE(review): the split happens after explode, so texts that came from the same
# source row can end up in different splits -- confirm this is intended.
# 80% train, then the remaining 20% is halved into dev and test (10% each).
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# to_csv does not create missing folders, so make sure the output folder exists.
os.makedirs(output_path, exist_ok=True)

train_df.to_csv(output_path + 'train.csv', index=False)
dev_df.to_csv(output_path + 'dev.csv', index=False)
test_df.to_csv(output_path + 'test.csv', index=False)

TypeError: issubclass() arg 1 must be a class

### 2020_tweets cleanup

In [6]:
# -o overwrites already-extracted files without asking, so re-running this cell
# does not stop on unzip's interactive "replace?" prompt.
!unzip -o raw_data/2020_tweets/archive.zip -d raw_data/2020_tweets/
raw_data_path = 'raw_data/2020_tweets/'
output_path = 'data/2020_tweets/'


def _load_hashtag_tweets(csv_path, contains_label):
    """Read one hashtag CSV, keep one row per tweet_id, and tag the rows.

    Args:
        csv_path: Path of the hashtag CSV to read.
        contains_label: Value stored in the new 'contains' column.

    Returns:
        DataFrame sorted by 'created_at' (newest first) with duplicate
        'tweet_id' rows removed, keeping the newest, plus a 'contains' column.
    """
    # NOTE(review): parse_dates=True presumably only targets the index, which is
    # why created_at is converted explicitly on the next line -- confirm.
    tweets = pd.read_csv(csv_path, lineterminator='\n', parse_dates=True)
    tweets['created_at'] = pd.to_datetime(tweets['created_at'])
    tweets = tweets.sort_values(by='created_at', ascending=False)
    tweets = tweets.drop_duplicates(subset='tweet_id', keep='first')
    tweets['contains'] = contains_label
    return tweets


df_biden = _load_hashtag_tweets(raw_data_path + 'hashtag_joebiden.csv', "Biden")
df_trump = _load_hashtag_tweets(raw_data_path + 'hashtag_donaldtrump.csv', "Trump")

# ignore_index=True gives the combined frame unique row labels, so the column
# assignment below never has to align on duplicated labels.
df = pd.concat([df_biden, df_trump], ignore_index=True)

# A tweet_id found in both files shows up with two distinct 'contains' values;
# relabel every such row as 'Both'. Counting distinct labels per tweet_id is done
# in one vectorised pass instead of calling a Python lambda for each group.
labels_per_tweet = df.groupby('tweet_id')['contains'].transform('nunique')
df.loc[labels_per_tweet > 1, 'contains'] = 'Both'

# After relabelling, collapse each tweet_id to a single row (newest first).
df = df.sort_values(by='created_at', ascending=False)
df = df.drop_duplicates(subset='tweet_id', keep='first')

df = df.reset_index(drop=True)

# 80% train, then the remaining 20% is halved into dev and test (10% each).
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# to_csv does not create missing folders, so make sure the output folder exists.
os.makedirs(output_path, exist_ok=True)

train_df.to_csv(output_path + 'train.csv', index=False)
dev_df.to_csv(output_path + 'dev.csv', index=False)
test_df.to_csv(output_path + 'test.csv', index=False)

Archive:  raw_data/2020_tweets/archive.zip
  inflating: raw_data/2020_tweets/hashtag_donaldtrump.csv  
  inflating: raw_data/2020_tweets/hashtag_joebiden.csv  
