In [1]:
import os
import re
import pandas as pd
import numpy as np


In [2]:
# Job whose log files are analysed, and the directory that holds them.
job_number = '45075917'
log_dir = 'log/'

# Matches any file name that contains the job number.
job_re = re.compile(r'.*{}.*'.format(job_number))
# On a line containing "expanded call", captures tfMRI_CARIT_AP or tfMRI_CARIT_PA.
scan_re = re.compile(r'.*expanded call.*(tfMRI_(?:CARIT)_(?:AP|PA)).*')
# Captures a token starting with "HCD" that is delimited by "_" or "/" on both sides.
PID_re = re.compile(r'.*[_/](HCD.*?)[_/].*')

# Paths (log_dir joined with the entry name) of every entry matching job_re.
logfiles = [
    os.path.join(log_dir, name)
    for name in filter(job_re.match, os.listdir(log_dir))
]

In [3]:
# Short label -> substring that marks a log line as that kind of error.
# The labels become DataFrame column names further down, so order matters.
error_strings = {
    "F-test": "F-test 2 isn't valid",
    "dtseries": "ERROR: failed to open file",
    "csv": "ls: cannot access",
}
def error_info(error_strings, f):
    """Scan the lines of one log for error messages, the scan name and the PID.

    Parameters
    ----------
    error_strings : dict or list
        Substrings that mark a line as an error. When a dict is given, its
        values are used as the substrings.
    f : iterable of str
        Lines to scan, e.g. an open text file.

    Returns
    -------
    tuple
        ``(errors, scan, PID)`` where ``errors`` is the list of lines (in
        order, unmodified) containing at least one error substring, ``scan``
        is the group captured by the module-level ``scan_re`` on the last line
        it matches ('' if none), and ``PID`` is the group captured by the
        module-level ``PID_re`` on the first line it matches ('' if none).

    Raises
    ------
    TypeError
        If ``error_strings`` is neither a dict nor a list.
    """
    if not isinstance(error_strings, (dict, list)):
        raise TypeError("error_strings must be a dict or a list")
    if isinstance(error_strings, dict):
        error_strings = list(error_strings.values())
    errors = []
    scan = ''
    PID = ''
    for line in f:
        # Only the first PID found is kept; stop searching once we have one.
        if not PID:
            PID_match = PID_re.match(line)
            if PID_match:
                PID = PID_match.group(1)
        # The truthiness guard skips empty patterns, which are substrings of
        # every line and would otherwise flag the whole log.
        if any(e and e in line for e in error_strings):
            errors.append(line)
        # No early exit here: a later matching line overrides an earlier one.
        m = scan_re.match(line)
        if m:
            scan = m.group(1)
    return errors, scan, PID
def error_types(error_strings, errors):
    """Count, for each error pattern, how many error lines contain it.

    Parameters
    ----------
    error_strings : dict or list
        Substrings to look for. When a dict is given, its values are used,
        in the dict's iteration order.
    errors : iterable of str
        Error lines to classify. A line containing several patterns is
        counted once for each of them.

    Returns
    -------
    list of int
        One count per pattern, in the same order as the patterns.

    Raises
    ------
    TypeError
        If ``error_strings`` is neither a dict nor a list.
    """
    if not isinstance(error_strings, (dict, list)):
        raise TypeError("error_strings must be a dict or a list")
    if isinstance(error_strings, dict):
        error_strings = list(error_strings.values())
    # Size the counters from the patterns rather than assuming exactly three.
    counts = [0] * len(error_strings)
    for error in errors:
        for i, e in enumerate(error_strings):
            if e in error:
                counts[i] += 1
    return counts

In [4]:
# One record per log file: path, number of error lines, the error lines,
# one count per error pattern, then the scan name and PID found in the log.
errors = []
for logfile in logfiles:
    with open(logfile) as f:
        these_errors, scan_with_error, PID_with_error = error_info(error_strings, f)
    # Classification only needs the collected lines, not the open file.
    these_error_types = error_types(error_strings, these_errors)
    row = [logfile, len(these_errors), these_errors]
    row.extend(these_error_types)
    row.extend([scan_with_error, PID_with_error])
    errors.append(row)

In [5]:
# One row per log file; the per-pattern count columns are named after the
# keys of error_strings.
df = pd.DataFrame(errors, columns=['logfile', 'n_errors', 'errors', *error_strings.keys(), 'scan', 'PID'])

# Distinct error lines per log (repeated identical lines collapse to one).
unique_errors = [np.unique(lines) for lines in df['errors']]
df['n_errors'] = [len(u) for u in unique_errors]

# Collapse the 'errors' column: NaN when the log has no error, the message
# itself when there is exactly one distinct error, and the full list of lines
# otherwise. Every row yields a value, so the column length always matches the
# frame even when a log contains several distinct errors.
df['errors'] = pd.Series(
    [
        np.nan if len(u) == 0 else u[0] if len(u) == 1 else lines
        for u, lines in zip(unique_errors, df['errors'])
    ],
    index=df.index,
    dtype=object,
)

# Show only the logs in which at least one error was found.
df[df.n_errors > 0]

Unnamed: 0,logfile,n_errors,errors,F-test,dtseries,csv,scan,PID
2,log/hcp1st-45075917_4.out,1,F-test 2 isn't valid - each included contrast ...,1,0,0,tfMRI_CARIT_PA,HCD0383143
17,log/hcp1st-45075917_37.out,1,F-test 2 isn't valid - each included contrast ...,1,0,0,tfMRI_CARIT_AP,HCD0842250
23,log/hcp1st-45075917_21.out,1,F-test 2 isn't valid - each included contrast ...,1,0,0,tfMRI_CARIT_PA,HCD2400632
27,log/hcp1st-45075917_26.out,1,F-test 2 isn't valid - each included contrast ...,1,0,0,tfMRI_CARIT_AP,HCD0947365
44,log/hcp1st-45075917_23.out,1,F-test 2 isn't valid - each included contrast ...,1,0,0,tfMRI_CARIT_PA,HCD0921347
...,...,...,...,...,...,...,...,...
742,log/hcp1st-45075917_743.out,1,ERROR: failed to open file '/net/holynfs01/srv...,0,3,0,tfMRI_CARIT_PA,HCD2662763
757,log/hcp1st-45075917_758.out,1,F-test 2 isn't valid - each included contrast ...,1,0,0,tfMRI_CARIT_AP,HCD2743157
760,log/hcp1st-45075917_761.out,1,F-test 2 isn't valid - each included contrast ...,1,0,0,tfMRI_CARIT_AP,HCD2751964
798,log/hcp1st-45075917_799.out,1,ls: cannot access /ncf/hcp/data/CCF_HCD_STG_Ps...,0,0,1,,HCD2982276


In [6]:
# Write the logs that contain at least one error to 'l1_errors.csv'.
df.loc[df['n_errors'] > 0].to_csv('l1_errors.csv')