In [1]:
import pandas as pd
import os
import re
import copy
import datetime

# Collect data trails from audit bots

In [2]:
# Experimental grid: every audit bot is identified by one (topic, strategy, note) triple.
strategies = [
    'delete',
    'dislike',
    'dislike-recommendation',
    'no-channel',
    'none',
    'not-interested',
    'watch',
]
topics = [
    'alt-right',
    'antitheist',
    'politics-left',
    'politics-right',
    'random',
]
notes = list(range(5))

In [3]:
# find a file in a directory: https://stackoverflow.com/questions/1724693/find-a-file-in-python
import os, fnmatch
def find(pattern, path):
    """Recursively collect the files under ``path`` whose name matches ``pattern``.

    ``pattern`` is a shell-style wildcard (as understood by ``fnmatch``) that is
    tested against the bare file name, not the full path. The returned list
    holds each match joined with the directory it was found in, in the order
    ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(path)
        for filename in filenames
        if fnmatch.fnmatch(filename, pattern)
    ]

In [4]:
# From file name ('./time/filename') to the number of seconds
# re functions: https://docs.python.org/3/library/re.html#functions
# extract substring: https://note.nkmk.me/en/python-str-extract/
# String to datetime instance: https://docs.python.org/3/library/datetime.html
# seconds since start of day: https://stackoverflow.com/questions/15971308/get-seconds-since-midnight-in-python

def f_to_secs(f):
    """Return the seconds since midnight encoded in a file's parent directory name.

    The path segment between the last two '/' separators is parsed as an
    ``HH:MM:SS`` timestamp and converted to a float number of seconds.
    """
    folder_name = re.findall('.*/(.*)/.*', f)[0]
    stamp = datetime.datetime.strptime(folder_name, '%H:%M:%S')
    elapsed = datetime.timedelta(
        hours=stamp.hour,
        minutes=stamp.minute,
        seconds=stamp.second,
    )
    return elapsed.total_seconds()

In [5]:
def process(t, s, n):
    """Load every rerun CSV of one audit bot and return them as a list of dataframes.

    A bot is identified by its topic, scrubbing strategy and note. Each "rerun"
    is a separate CSV produced when the audit failed midway and had to be
    restarted from the point where it failed.

    Parameters
    ----------
    t : topic; also names the directory (``./<t>``) searched for CSV files.
    s : scrubbing strategy.
    n : note.

    Returns
    -------
    list of pandas.DataFrame
        One dataframe per readable CSV, ordered by the time of day encoded in
        each file's parent directory name. Every dataframe gets ``topic``,
        ``strategy``, ``note`` and ``rerun`` columns; empty CSV files are
        skipped and do not consume a rerun number.
    """
    role = '{0}_{1}_{2}'.format(t, s, n)
    csv_paths = sorted(find('*{0}*.csv'.format(role), './{0}'.format(t)), key=f_to_secs)

    rerun_dfs = []
    rerun = 0
    for csv_path in csv_paths:
        try:
            df = pd.read_csv(csv_path)
        except pd.errors.EmptyDataError:
            # An empty file is what the "chrome cannot connect" failure leaves
            # behind; there is nothing to load, so skip it.
            # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.errors.EmptyDataError.html
            continue

        # Tag every row with the attributes of the bot that produced it.
        df['topic'] = t
        df['strategy'] = s
        df['note'] = n
        df['rerun'] = rerun

        # Drop the scrub rows and the phase-2 videopage-experiment rows from
        # rerun #0, because rerun #1 captures them again (for non-fatal errors
        # the bot sometimes carried on). All errors occurred after the scrub
        # started, which is why this works.
        # The condition uses the function's own arguments, so the result no
        # longer depends on notebook-level variables named `strategy`/`note`.
        # NOTE(review): as written, pruning is skipped for every politics-left
        # bot, every no-channel bot and every note-1 bot. If only the single
        # politics-left / no-channel / 1 bot was meant to be exempt, this
        # should be `not (t == ... and s == ... and n == 1)` -- confirm.
        if rerun == 1 and t != 'politics-left' and s != 'no-channel' and n != 1:
            first_df = rerun_dfs[0]
            superseded = (first_df.rerun == 0) & (
                (first_df.phase == 'scrub')
                | ((first_df.phase == 'videopage_experiment') & (first_df.phase_level == 2))
            )
            rerun_dfs[0] = first_df[~superseded]

        # More than two reruns is not handled by the pruning above; flag it.
        if rerun > 1:
            print('ahh')

        rerun += 1
        rerun_dfs.append(df)

    return rerun_dfs

In [6]:
# Gather the rerun dataframes of every (topic, strategy, note) bot into one flat list.
# Kept as explicit nested loops so the loop variables stay bound at notebook scope.
dfs = []
for topic in topics:
    for strategy in strategies:
        for note in notes:
            dfs.extend(process(topic, strategy, note))

In [7]:
# Stack every per-bot dataframe row-wise into a single table and show its (rows, columns).
all_df = pd.concat(dfs, axis=0)
all_df.shape

(495852, 15)

In [8]:
# Let the summary tables below render up to 500 rows before pandas truncates them.
pd.options.display.max_rows = 500


In [9]:
# Homepage rows only: range and row count of homepage_level for each bot, phase and rerun.
(
    all_df.loc[all_df['component'] == 'homepage']
    .groupby(['topic', 'strategy', 'note', 'phase', 'rerun'])['homepage_level']
    .agg(['min', 'max', 'count'])
)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,min,max,count
topic,strategy,note,phase,rerun,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
alt-right,delete,0,scrub,0,40,79,856
alt-right,delete,0,stain,0,0,39,886
alt-right,delete,1,scrub,0,40,79,881
alt-right,delete,1,stain,0,0,39,834
alt-right,delete,2,scrub,1,40,79,861
alt-right,delete,2,stain,0,0,39,861
alt-right,delete,3,scrub,0,40,79,827
alt-right,delete,3,stain,0,0,39,831
alt-right,delete,4,scrub,0,40,79,774
alt-right,delete,4,stain,0,0,39,767


In [10]:
# Same summary over every component: range and row count of homepage_level per bot, phase and rerun.
(
    all_df
    .groupby(['topic', 'strategy', 'note', 'phase', 'rerun'])['homepage_level']
    .agg(['min', 'max', 'count'])
)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,min,max,count
topic,strategy,note,phase,rerun,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
alt-right,delete,0,scrub,0,40,79,856
alt-right,delete,0,stain,0,0,40,1682
alt-right,delete,0,videopage_experiment,0,0,80,60
alt-right,delete,1,scrub,0,40,79,881
alt-right,delete,1,stain,0,0,40,1632
...,...,...,...,...,...,...,...
random,watch,3,stain,0,0,40,1542
random,watch,3,videopage_experiment,0,0,80,60
random,watch,4,scrub,0,40,80,1686
random,watch,4,stain,0,0,40,1507


In [12]:
# Persist the combined table; the row index is not written to the file.
all_df.to_csv(path_or_buf='./all_cleaned.csv', index=False)