In [9]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import csv

In [2]:
# Process weird date formats
def normalize_dates(date):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` timestamp string into a ``datetime``.

    Some values carry a 7-character suffix after the seconds field (for
    example a fractional part such as ``.123456``). When the plain format
    does not match, the last 7 characters are dropped and the parse is
    retried, so the suffix is discarded rather than preserved.

    Parameters
    ----------
    date : str
        Timestamp text to parse.

    Returns
    -------
    datetime
        Naive datetime with second resolution.

    Raises
    ------
    ValueError
        If the text matches neither the plain nor the truncated form.
    TypeError
        If ``date`` is not a string (e.g. a missing value).
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    try:
        return datetime.strptime(date, timestamp_format)
    except ValueError:
        # Only a format mismatch should trigger the fallback; anything else
        # (KeyboardInterrupt, a non-string input, ...) must surface as-is.
        return datetime.strptime(date[:-7], timestamp_format)

In [3]:
# Alternative input, kept for reference: contains ALL our reporting experiment
# data, but includes some invalid data in 2019 due to faulty thresholds.
# filepath = "/data/databases/erroneous_reporting_experiment_data.csv"

# Main data set; this is the file the rest of the notebook works on.
filepath = "/data/databases/reporting_experiment.csv"

# Load the whole CSV into memory. Later cells read `df` and add columns to it.
df = pd.read_csv(filepath)

In [4]:
# (total rows, rows with no missing value in any column)
len(df), len(df.dropna())

(250423, 17767)

In [5]:
# Parse the raw UTC timestamp strings into datetime columns, one
# (new column, source column) pair at a time.
for target_col, source_col in (('created', 'created_utc'), ('ingested', 'ingested_utc')):
    df[target_col] = df[source_col].apply(normalize_dates)

In [7]:
# Headline numbers for the experiment. Each count is taken once, right before
# it is first printed, and reused for the matching percentage line.
n_ingested = len(df)
print("Total number of comments ingested by Crossmod during the experiment = ", n_ingested)

# Comments that Crossmod flagged with a "report" action
n_reported = len(df[df.crossmod_action == "report"])
print("Total number of comments reported by Crossmod during the experiment = ", n_reported)
print("% of comments reported by Crossmod during the experiment = ", 100.0 * n_reported/n_ingested, "%")

# Comments with a non-null `banned_by` value
n_removed = len(df[df.banned_by.notna()])
print("Total number of comments removed by moderators during the experiment = ", n_removed)
print("% of comments removed by moderators during the experiment = ", 100.0 * n_removed/n_ingested, "%")

Total number of comments ingested by Crossmod during the experiment =  250423
Total number of comments reported by Crossmod during the experiment =  2686
% of comments reported by Crossmod during the experiment =  1.0725851858655155 %
Total number of comments removed by moderators during the experiment =  17767
% of comments removed by moderators during the experiment =  7.0947956058349275 %


In [44]:
# Threads, combined text CSV
#
# Writes one row per discussion thread to `output_filepath` and keeps the same
# rows in `thread_values`. A "thread" here is the set of comments in `df` that
# share a `parent_id`.
COMMENT_DELIMITER = "||"
MIN_COMMENTS_PER_THREAD = 5   # threads with fewer comments are skipped
PROGRESS_EVERY = 100          # print a progress line every N threads written
output_filepath = "/data/databases/threads.csv"
header = \
[
    "total_comments_in_thread",
    "total_moderator_removed",
    "automoderator_removed",
    "human_moderator_removed",
    "crossmod_removed",
    "combined_text",
    "parent_id",
    "link_id"
]

thread_values = []
threads_collected = 0

# The context manager guarantees the file is flushed and closed even if the
# loop raises. `newline=''` is what the csv module expects for files it writes.
with open(output_filepath, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(header)

    # Group by parent_id to gather the comments replying to the same parent.
    # NOTE(review): the earlier comment said "link_id", but the grouping key has
    # always been `parent_id`; confirm that this is the intended thread unit.
    for name, group in df.groupby('parent_id'):

        total_thread = len(group)
        if total_thread < MIN_COMMENTS_PER_THREAD:
            continue

        # Discard the thread unless at least one comment has a positive
        # agreement score (the original note says filtered comments carry -1.0).
        if not (group['agreement_score'] > 0).any():
            continue

        moderator_removed = int(group.banned_by.notna().sum())
        automoderator_removed = int((group.banned_by == "AutoModerator").sum())
        # Every removal not attributed to AutoModerator is counted as human.
        human_moderator_removed = moderator_removed - automoderator_removed
        crossmod_removed = int((group.crossmod_action == "report").sum())

        # A missing body would make str.join raise; write it as empty text so
        # the number of delimited pieces still matches total_thread.
        bodies = group["body"].fillna("").astype(str)
        combined_text = COMMENT_DELIMITER.join(bodies).replace('\n', ' ')

        parent_id = group['parent_id'].array[0]
        # Only the first comment's link_id is recorded for the thread.
        link_id = group['link_id'].array[0]

        thread = \
        [
            total_thread,
            moderator_removed,
            automoderator_removed,
            human_moderator_removed,
            crossmod_removed,
            combined_text,
            parent_id,
            link_id
        ]
        csv_writer.writerow(thread)
        thread_values.append(thread)
        threads_collected += 1
        if (threads_collected % PROGRESS_EVERY == 0):
            print("Currently processed", threads_collected, " discussion threads")



Currently processed 100  discussion threads
Currently processed 200  discussion threads
Currently processed 300  discussion threads
Currently processed 400  discussion threads
Currently processed 500  discussion threads
Currently processed 600  discussion threads
Currently processed 700  discussion threads
Currently processed 800  discussion threads
Currently processed 900  discussion threads
Currently processed 1000  discussion threads
Currently processed 1100  discussion threads
Currently processed 1200  discussion threads
Currently processed 1300  discussion threads
Currently processed 1400  discussion threads
Currently processed 1500  discussion threads
Currently processed 1600  discussion threads
Currently processed 1700  discussion threads
Currently processed 1800  discussion threads
Currently processed 1900  discussion threads
Currently processed 2000  discussion threads
Currently processed 2100  discussion threads
Currently processed 2200  discussion threads
Currently processed