In [None]:
import pandas
import json
import re
from os.path import join, isdir, exists
from os import mkdir, rmdir

In [None]:
trump_df_name = "../DT_data.csv"
hillary_df_name = "../HC_data.csv"

trump_df = pandas.read_csv(trump_df_name)
hillary_df = pandas.read_csv(hillary_df_name)

In [None]:
SEQ_LENGTH = 75

In [None]:
def write_tweets(candidate, df, column, folder, file_type):

  if not isdir(folder):
    mkdir(folder)

  for i in range(df[column].shape[0]):
    tweets = json.loads(re.sub('\'', '\"',  df[column][i]))
    day = df["DATE"][i]
    category = column.lower()
    index = 1
    for tweet in tweets:
      article_name = candidate + "_" + category + "_" + str(i) + "_" + str(index) + "." + file_type
      index += 1
      with open(join(folder, article_name), "w") as f:

#### Write in aschern processing format
        if file_type == "txt":
          f.write(tweet["text"])
          f.write("\n\n")
#### Write JSON
        elif file_type == "json":
          tweet["date"] = day
          json.dump(tweet, f)

In [None]:
def convert_token_to_character_number(tokens, token, is_start):
  if token == 0:
    return 0
  if is_start:
    return sum(map(lambda x: len(x), tokens[0:token])) + token #Account for spaces
  else:
    return sum(map(lambda x: len(x), tokens[0:token])) + token-1 #Account for spaces

In [None]:
def write_non_tweets(candidate, df, non_tweet_folder, file_type, aschern=False):

  for i in range(df["NON_TWEETS"].shape[0]):
    if df["NON_TWEETS"][i] == []:
      continue
    category = df["CONTENT_CATEGORY"][i]
    if type(category) == float or not category or category == "tweet":
      continue
    folder = non_tweet_folder
    if not aschern:
      folder = join(non_tweet_folder, category)
    if not isdir(folder):
      mkdir(folder)
    article_name = candidate + "_" + category + "_" + str(i) + "." + file_type
    with open(folder + "/" + article_name, "w") as f:

#### Write in aschern processing format
      if file_type == "txt":
        meta_article_name = candidate + "_" + category + "_" + str(i) + ".csv"
        meta_article_path = join(folder, "..", "meta")
        Path(meta_article_path).mkdir(parents=True, exist_ok=True)

        with open(join(meta_article_path, meta_article_name), "w") as mf:

          meta_df = pandas.DataFrame(columns=["sentence", "start", "end"])

          #Get the entire original article
          text = df["NON_TWEETS"][i]
          #Split by sentences
          sentences = re.split('\.\s', text)
          #Find token spans and store for each sentence, store tokens in token list
          index, token_spans, tokens = 0, [], []
          for sentence in sentences:
            s = sentence.split()
            token_spans.append((index, index+len(s)))
            tokens.extend(s)
            index += len(s)
          
          #Sentences = actual sentence, start and end token level span
          sentences = list(map(lambda x,y: (x,y[0],y[1]), sentences, token_spans))

          #Store the character position with respect to padded file to use for token level span calc
          position = 0
          for sentence in sentences:
            if len(sentence[0]) <= 1:
              continue

            #Get token level start and end in padded sentence
            token_start, token_end = get_nearest_token_span(SEQ_LENGTH, sentence[1], sentence[2], tokens)

            #Get character level end points for padded sentences, and the offsets of these sentences
            start_pad = convert_token_to_character_number(tokens, token_start, is_start=True)
            end_pad = convert_token_to_character_number(tokens, token_end, is_start=False)
            start = convert_token_to_character_number(tokens, sentence[1], is_start=True)
            end = convert_token_to_character_number(tokens, sentence[2], is_start=False)

            meta_data_content = (str(sentence[0]), str(position+(start-start_pad)), str(position+(end-start_pad)))          
            # mf.write(", ".join(meta_data_content) + "\n")
            meta_df = meta_df.append({"sentence": meta_data_content[0], "start": meta_data_content[1], "end": meta_data_content[2]}, ignore_index=True)
            f.write(" ".join(tokens[token_start:token_end]) + "\n")

            position += (end_pad-start_pad + 1)
          meta_df.to_csv(join(meta_article_path, meta_article_name))
          f.write("\n\n")

#### Write JSON
      elif file_type == "json":
        data = {"text": df["NON_TWEETS"][i], "date": df["DATE"][i]}
        json.dump(data, f)
      
    # print("Processed file: " + str(i))

### Check that meta data matches feedforward data

In [None]:
def span_checksum(feedforward_file_path, meta_file_path, total, to_long):
  with open(feedforward_file_path, "r") as ff:
    feed_forward_file_contents = "".join([line for line in ff])
    df = pandas.read_csv(meta_file_path)
    for i in range(df.shape[0]):
      total += 1
      
      line = df["sentence"][i]
      start = df["start"][i]
      end = df["end"][i]
      
      start, end = int(start), int(end)
      line = line.split()
      if len(line) > 75:
        to_long += 1
        continue
      line = ' '.join(line)
      try:
        assert(feed_forward_file_contents[start:end].strip() == line)
      except:
        print(feedforward_file_path,end="")
        print("\nMeta data line: ")
        print(line)
        print("\n\nFeed forward file associated line: ")
        print(feed_forward_file_contents[start:end])
        raise Exception()
  return total, to_long

In [None]:
def get_articles_and_meta_path(folder, df, candidate):

  ff_files, meta_files = [], []
  for i in range(df["NON_TWEETS"].shape[0]):
    
    category = df["CONTENT_CATEGORY"][i]
    
    if category == "tweet" or category != category:
      continue

    article_name = candidate + "_" + category + "_" + str(i) + ".txt"
    ff_file_path = folder + "/" + article_name
    meta_file_name = candidate + "_" + category + "_" + str(i) + ".csv"
    meta_file_folder = join(folder, "..", "meta")
    meta_file_path = join(meta_file_folder, meta_file_name)
    
    ff_files.append(ff_file_path)
    meta_files.append(meta_file_path)

  return ff_files, meta_files

In [None]:
def check_files(candidate, df, folder):
  total, to_long = 0, 0
  ff_files, meta_files = get_articles_and_meta_path(folder, df, candidate)
  for i in range(len(ff_files)):
    
    ff_file_path = ff_files[i]
    meta_file_path = meta_files[i]

    assert(exists(ff_file_path))
    assert(exists(meta_file_path))

    # try:
    total, to_long = span_checksum(ff_file_path, meta_file_path, total, to_long)
    # except:
    #   return total, to_long
  print("passed...")
  return total, to_long

In [None]:
check_files("HC", hillary_df, "../aschern_data")
check_files("DT", trump_df, "../aschern_data")

In [None]:
def get_nearest_token_span(seq_length, start, end, tokens):

  seg_start, seg_end = 0, 0

  span_length = end-start
  left_or_right = (seq_length - span_length) // 2
  at_left_edge = start-left_or_right < 0
  at_right_edge = end+left_or_right > len(tokens)

  if at_left_edge:
    right_dist = seq_length - start - span_length
    seg_end = min(len(tokens), end+right_dist)
    seg_start = 0
  elif at_right_edge:
    left_dist = seq_length - ((len(tokens) - end) + span_length)
    seg_start = max(0, start - left_dist)
    seg_end = len(tokens)
  else:
    seg_start = start-left_or_right
    seg_end = end + left_or_right
  
  return seg_start, seg_end

In [None]:
def remove_dir(directory):
  directory = Path(directory)
  if not directory.is_dir():
    return
  for item in directory.iterdir():
      if item.is_dir():
        rmdir(item)
      else:
        item.unlink()
  directory.rmdir()

In [None]:
from pathlib import Path

base_folder = "../2016_election_data"

dt_directory = "donald_trump_campaign"
hc_directory = "hillary_clinton_campaign"

remove_dir(join("../2016_election_data", dt_directory))
remove_dir(join("../2016_election_data", hc_directory))
remove_dir("../aschern_data/meta")
remove_dir("../aschern_data")

Path(join(base_folder, dt_directory, "donald_trump")).mkdir(parents=True, exist_ok=True)
Path(join(base_folder, hc_directory, "hillary_clinton")).mkdir(parents=True, exist_ok=True)

# Create folder structure with JSON documents

In [None]:
write_tweets("HC", hillary_df, "TWEETS", join(base_folder, "hillary_clinton_campaign/hillary_clinton/tweets"), "json")
write_tweets("HC", hillary_df, "RETWEETS", join(base_folder, "hillary_clinton_campaign/hillary_clinton/retweets"), "json")
write_tweets("DT", trump_df, "TWEETS", join(base_folder, "donald_trump_campaign/donald_trump/tweets"), "json")
write_tweets("DT", trump_df, "RETWEETS", join(base_folder, "donald_trump_campaign/donald_trump/retweets"), "json")

In [None]:
write_non_tweets("HC", hillary_df, join(base_folder, "hillary_clinton_campaign/hillary_clinton"), "json")
write_non_tweets("DT", trump_df, join(base_folder, "donald_trump_campaign/donald_trump"), "json")

# Create folder structure for aschern model

In [None]:
write_tweets("HC", hillary_df, "TWEETS", "../aschern_data", "txt")
write_tweets("HC", hillary_df, "RETWEETS", "../aschern_data", "txt")
write_tweets("DT", trump_df, "TWEETS", "../aschern_data", "txt")
write_tweets("DT", trump_df, "RETWEETS", "../aschern_data", "txt")

In [None]:
write_non_tweets("HC", hillary_df, "../aschern_data", "txt", aschern=True)
write_non_tweets("DT", trump_df, "../aschern_data", "txt", aschern=True)

In [None]:
check_files("HC", hillary_df, "../aschern_data")
check_files("DT", trump_df, "../aschern_data")

# Create propaganda vector dataset

In [349]:
PROP_TECHS = [
  "Loaded_Language",
  "Name_Calling,Labeling",
  "Repetition",
  "Exaggeration,Minimisation",
  "Doubt",
  "Appeal_to_fear-prejudice",
  "Flag-Waving",
  "Causal_Oversimplification",
  "Slogans",
  "Appeal_to_Authority",
  "Black-and-White_Fallacy",
  "Thought-terminating Cliches",
  "Whataboutism,Straw_Men,Red_Herring",
  "Obfuscation,Intentional_Vagueness,Confusion"]

In [356]:
NUM_FINAL_RESULTS=32

root_folder = "../result_techniques"

vector_df = pandas.DataFrame(columns=["File Name", "Candidate", "Sentence", "Propaganda Segment"] + PROP_TECHS)

for i in range(NUM_FINAL_RESULTS):
  result_file = join(root_folder, str(i+1)+"_final_tc_results.txt")
  with open(result_file, "r") as rf:
    rf.readline()
    #Extract propaganda techniques and documents
    for line in rf:
      line = line.split()
      file_name = line[0]
      prop_start = line[1]
      prop_end = line[2]
      prop_line = " ".join(line[3:len(line)-1])
      prop_technique = line[len(line)-1]

      meta_file_path = join("..","meta",file_name.split(".")[0]+".csv")
      meta_df = pandas.read_csv(meta_file_path)
      vector = [0]*14
      
      for j in range(meta_df.shape[0]):
        sentence_info = meta_df.iloc[j]
        if int(sentence_info["start"]) <= int(prop_start) <= int(sentence_info["end"]):
          #Check if we've already found propaganda in this sentence
          if (vector_df["Sentence"] == sentence_info["sentence"]).any():
            vector_df.loc[vector_df["Sentence"] == sentence_info["sentence"],"Propaganda Segments"] += ", " + prop_line + ": " + prop_technique
            vector_df.loc[vector_df["Sentence"] == sentence_info["sentence"],"Num Prop"] += 1
            vector_df.loc[vector_df["Sentence"] == sentence_info["sentence"],prop_technique] += 1
          else:
            vector_df = vector_df.append(
              {"Candidate": file_name[0:2],
              "File Name": file_name, 
              "Sentence": sentence_info["sentence"],
              "Propaganda Segments": prop_line + ": " + prop_technique,
              "Num Prop": 1,
              prop_technique: 1}, ignore_index=True)
          break
        elif int(sentence_info["start"]) > int(prop_start):
          break 
  print("Processed " + str(i+1), end="...")
vector_df.to_csv("classifier_training_data.csv")

Processed 1...

FileNotFoundError: [Errno 2] File ../meta/DT_retweets_105_2.csv does not exist: '../meta/DT_retweets_105_2.csv'