In [16]:
import ast
import json
import re
from os import mkdir, rmdir
from os.path import join, isdir
from pathlib import Path

import pandas

In [17]:
# Input CSVs, one per candidate; the paths are relative to the notebook's
# working directory.
trump_df_name = "../DT_data.csv"
hillary_df_name = "../HC_data.csv"

# Load both frames in one pass, Trump first and Clinton second.
trump_df, hillary_df = (
  pandas.read_csv(csv_path) for csv_path in (trump_df_name, hillary_df_name)
)

In [18]:
# Token budget passed as `seq_length` to get_nearest_token_span when
# write_non_tweets emits "txt" articles: each written line holds at most this
# many whitespace-separated tokens of context around a sentence.
# NOTE(review): presumably tied to the downstream model's maximum input
# length; confirm where 412 comes from.
SEQ_LENGTH = 412

def _parse_tweet_list(raw):
  """Decode one CSV cell holding a serialized list of tweet dicts.

  The historical behaviour (swap every single quote for a double quote, then
  parse as JSON) is tried first so cells that already parsed keep producing
  exactly the same objects. That approach breaks as soon as a tweet contains an
  apostrophe or an embedded double quote, so on failure the untouched cell is
  parsed as a Python literal instead.

  Raises:
    ValueError: if neither strategy can decode ``raw`` (the original JSON
      error is re-raised so callers see the same exception type as before).
  """
  try:
    return json.loads(re.sub('\'', '\"', raw))
  except ValueError as json_error:  # json.JSONDecodeError is a ValueError
    try:
      return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
      raise json_error from None


def write_tweets(candidate, df, column, folder, file_type):
  """Write every tweet stored in ``df[column]`` to its own file in ``folder``.

  Files are named ``<candidate>_<column lowercased>_<row>_<n>.<file_type>``,
  where ``<row>`` is the zero-based row position in ``df`` and ``<n>`` counts
  the tweets of that row starting at 1.

  Args:
    candidate: Prefix used in the generated file names (e.g. "HC").
    df: DataFrame with a ``column`` of serialized tweet lists and a "DATE"
      column.
    column: Name of the column to export; its lowercase form is the category
      part of the file name.
    folder: Output directory; created (including parents) when missing.
    file_type: "txt" writes the tweet's "text" followed by a blank line (the
      aschern processing format); "json" writes the tweet dict with the row's
      date added under the "date" key.

  Raises:
    ValueError: if ``file_type`` is not "txt" or "json", or a cell cannot be
      parsed.
  """
  if file_type not in ("txt", "json"):
    # Fail before touching the disk instead of leaving a folder of empty files.
    raise ValueError("Unsupported file_type: " + repr(file_type))

  Path(folder).mkdir(parents=True, exist_ok=True)
  category = column.lower()

  # Iterate positionally so a frame without a default 0..n-1 index still works.
  for i, (raw, day) in enumerate(zip(df[column], df["DATE"])):
    if not isinstance(raw, str):
      # Missing cell (NaN after read_csv): nothing to export for this row.
      continue

    for index, tweet in enumerate(_parse_tweet_list(raw), start=1):
      article_name = candidate + "_" + category + "_" + str(i) + "_" + str(index) + "." + file_type
      # Explicit UTF-8: tweet text is not guaranteed to fit the locale encoding.
      with open(join(folder, article_name), "w", encoding="utf-8") as f:
        if file_type == "txt":
          # aschern processing format
          f.write(tweet["text"])
          f.write("\n\n")
        else:
          # JSON document carrying the row's date
          tweet["date"] = day
          json.dump(tweet, f)

def write_non_tweets(candidate, df, non_tweet_folder, file_type, aschern=False):
  """Write each non-tweet document of ``df`` to its own file.

  A row is exported when its "NON_TWEETS" cell is a non-blank string and its
  "CONTENT_CATEGORY" cell is a non-empty string other than "tweet". Rows with
  missing text (NaN after read_csv, a blank string, or the serialized empty
  list "[]") are skipped.

  Files are named ``<candidate>_<category>_<row>.<file_type>`` where ``<row>``
  is the zero-based row position.

  Args:
    candidate: Prefix used in the generated file names (e.g. "HC").
    df: DataFrame with "NON_TWEETS", "CONTENT_CATEGORY" and "DATE" columns.
    non_tweet_folder: Base output directory; created with parents if missing.
    file_type: "json" writes ``{"text": ..., "date": ...}``. "txt" writes the
      aschern processing format: one line per sentence holding the window of
      at most SEQ_LENGTH tokens around that sentence, plus a companion
      ``<candidate>_<category>_<row>.csv`` with ``sentence, start, end`` lines
      (token offsets into the whole document).
    aschern: When True, articles go directly into ``non_tweet_folder``;
      otherwise into a per-category subfolder.

  Raises:
    ValueError: if ``file_type`` is not "txt" or "json".

  NOTE(review): the companion files are written to ``join(folder, "..",
  "meta")``. With ``aschern=True`` that is a sibling of ``non_tweet_folder``,
  not a subfolder of it; confirm this is the intended location.
  """
  if file_type not in ("txt", "json"):
    # Fail before touching the disk instead of leaving a folder of empty files.
    raise ValueError("Unsupported file_type: " + repr(file_type))

  # Iterate positionally so a frame without a default 0..n-1 index still works.
  rows = zip(df["NON_TWEETS"], df["CONTENT_CATEGORY"], df["DATE"])
  for i, (text, category, day) in enumerate(rows):
    # Cells loaded from CSV are strings or NaN, so comparing against [] can
    # never detect an empty document; test for usable text explicitly.
    if not isinstance(text, str) or text.strip() in ("", "[]"):
      continue
    # isinstance also rejects NaN categories regardless of their float type.
    if not isinstance(category, str) or not category or category == "tweet":
      continue

    folder = non_tweet_folder if aschern else join(non_tweet_folder, category)
    Path(folder).mkdir(parents=True, exist_ok=True)
    stem = candidate + "_" + category + "_" + str(i)

    # Explicit UTF-8: document text is not guaranteed to fit the locale encoding.
    with open(join(folder, stem + "." + file_type), "w", encoding="utf-8") as f:
      if file_type == "json":
        json.dump({"text": text, "date": day}, f)
      else:
        # aschern processing format
        meta_article_path = join(folder, "..", "meta")
        Path(meta_article_path).mkdir(parents=True, exist_ok=True)

        with open(join(meta_article_path, stem + ".csv"), "w", encoding="utf-8") as mf:
          # TODO (from the original author): extend sentence splitting beyond
          # "period followed by whitespace".
          # Raw string: '\.' in a normal literal is an invalid escape sequence.
          sentences = re.split(r'\.\s', text)

          # Token offsets of every sentence within the flattened token list.
          tokens = []
          spans = []
          for sentence in sentences:
            words = sentence.split()
            spans.append((len(tokens), len(tokens) + len(words)))
            tokens.extend(words)

          for sentence, (start, end) in zip(sentences, spans):
            if not sentence:
              continue
            token_start, token_end = get_nearest_token_span(SEQ_LENGTH, start, end, tokens)
            f.write(" ".join(tokens[token_start:token_end]))
            f.write("\n")
            mf.write(", ".join((sentence, str(start), str(end))) + "\n")
          mf.write("\n\n")

In [19]:
def get_nearest_token_span(seq_length, start, end, tokens):
  """Choose a window of at most ``seq_length`` tokens around ``[start, end)``.

  The span is padded equally on both sides. If that padding would reach before
  the first token, the window is anchored at index 0; otherwise, if it would
  run past the last token, the window is anchored at ``len(tokens)``.

  Args:
    seq_length: Target window size in tokens.
    start: Index of the first token of the span.
    end: Index one past the last token of the span.
    tokens: Token sequence; only its length is used.

  Returns:
    ``(seg_start, seg_end)`` bounds suitable for slicing ``tokens``.
  """
  total = len(tokens)
  span_length = end - start
  # Padding available on each side of the span.
  padding = (seq_length - span_length) // 2

  # Left edge: start at 0 and give the unused left padding to the right side.
  if start - padding < 0:
    right_dist = seq_length - start - span_length
    return 0, min(total, end + right_dist)

  # Right edge: end at the last token and give the unused padding to the left.
  if end + padding > total:
    left_dist = seq_length - ((total - end) + span_length)
    return max(0, start - left_dist), total

  # Enough room on both sides: centre the span.
  return start - padding, end + padding

In [20]:
def remove_dir(directory):
  """Recursively delete ``directory`` and everything beneath it.

  Does nothing when ``directory`` does not exist or is not a directory, so it
  is safe to call before the output folders have ever been created.

  Args:
    directory: Path (str or Path) of the directory tree to remove.
  """
  directory = Path(directory)
  if not directory.is_dir():
    return
  for item in directory.iterdir():
    # Recurse into real subdirectories: a plain rmdir raises OSError as soon as
    # a subdirectory still contains files. Symlinks are unlinked, never
    # followed, so nothing outside the tree is deleted.
    if item.is_dir() and not item.is_symlink():
      remove_dir(item)
    else:
      item.unlink()
  directory.rmdir()

In [22]:
from pathlib import Path

base_folder = "../2016_election_data"

dt_directory = "donald_trump_campaign"
hc_directory = "hillary_clinton_campaign"

# Clear the output of previous runs before regenerating it.
# NOTE(review): write_non_tweets(..., aschern=True) builds its meta directory
# as join(folder, "..", "meta") with folder == "../aschern_data", i.e. next to
# ../aschern_data rather than inside it. Confirm that "../aschern_data/meta"
# is really the directory that has to be cleared here.
for stale_dir in (
  join(base_folder, dt_directory),
  join(base_folder, hc_directory),
  "../aschern_data/meta",
  "../aschern_data",
):
  remove_dir(stale_dir)

# Recreate the <campaign>/<candidate> skeleton that the JSON export fills in.
for campaign_dir, person_dir in (
  (dt_directory, "donald_trump"),
  (hc_directory, "hillary_clinton"),
):
  Path(join(base_folder, campaign_dir, person_dir)).mkdir(parents=True, exist_ok=True)

# Create folder structure with JSON documents

In [6]:
# One entry per JSON tweet export:
# (file-name prefix, source frame, column, output folder below base_folder).
json_tweet_exports = (
  ("HC", hillary_df, "TWEETS", "hillary_clinton_campaign/hillary_clinton/tweets"),
  ("HC", hillary_df, "RETWEETS", "hillary_clinton_campaign/hillary_clinton/retweets"),
  ("DT", trump_df, "TWEETS", "donald_trump_campaign/donald_trump/tweets"),
  ("DT", trump_df, "RETWEETS", "donald_trump_campaign/donald_trump/retweets"),
)
for prefix, frame, column_name, subfolder in json_tweet_exports:
  write_tweets(prefix, frame, column_name, join(base_folder, subfolder), "json")

In [28]:
# JSON export of the non-tweet documents, one candidate folder each.
for prefix, frame, subfolder in (
  ("HC", hillary_df, "hillary_clinton_campaign/hillary_clinton"),
  ("DT", trump_df, "donald_trump_campaign/donald_trump"),
):
  write_non_tweets(prefix, frame, join(base_folder, subfolder), "json")

# Create folder structure for aschern model

In [23]:
# Plain-text export of tweets and retweets; every file lands in the same flat
# folder, Clinton first, then Trump.
for prefix, frame in (("HC", hillary_df), ("DT", trump_df)):
  for column_name in ("TWEETS", "RETWEETS"):
    write_tweets(prefix, frame, column_name, "../aschern_data", "txt")

In [24]:
# Plain-text export of the non-tweet documents into the same flat folder.
for prefix, frame in (("HC", hillary_df), ("DT", trump_df)):
  write_non_tweets(prefix, frame, "../aschern_data", "txt", aschern=True)