# Load dataset

In [None]:
# Upload 'parsed_questions_benepar.csv' to run
import pandas as pd

# One row per question; the 'Parse' column holds a bracketed constituency parse string
PARSED_QUESTIONS_CSV = 'parsed_questions_benepar.csv'
parsed_questions = pd.read_csv(PARSED_QUESTIONS_CSV)
parsed_questions

Unnamed: 0,Contestant,Question,Parse
0,Aaron Gold,"According to Good Housekeeping, most homeowner...",(S (PP (VBG According) (PP (IN to) (NP (NNP Go...
1,Aaron Gold,"""There is a Sucker Born Ev'ry Minute"" and ""Joi...","(S (NP (`` "") (S (NP (EX There)) (VP (VBZ is) ..."
2,Aaron Gold,"Until it adopted the euro in 2002, fittingly, ...",(SBARQ (SBAR (IN Until) (S (NP (PRP it)) (VP (...
3,Aaron Gold,Which of these is the correct spelling of what...,(SBARQ (WHNP (WHNP (WDT Which)) (PP (IN of) (N...
4,Aaron Hauck,"In 2006, the WHO reported that thousands of pe...","(S (PP (IN In) (NP (CD 2006))) (, ,) (NP (DT t..."
...,...,...,...
24790,Zeke Spector,"As Crayola's website explains, ""while artists ...",(S (SBAR (IN As) (S (NP (NP (NNP Crayola) (POS...
24791,Zeke Spector,"Born Amethyst Amelia Kelly, what recording art...",(SBARQ (S (VP (VBN Born) (NP (NNP Amethyst) (N...
24792,Zeke Spector,Which one of these things is NOT true of the p...,(SBARQ (WHNP (NP (WDT Which) (CD one)) (PP (IN...
24793,Zeke Spector,Since in 1992 it was ruled that anyone in the ...,(S (SBAR (IN Since) (S (PP (IN in) (NP (CD 199...


In [None]:
# Take the parse strings from the csv file and turn them into nltk tree objects
# (the star import is what brings `Tree` into the notebook namespace)
from nltk.tree import *

trees = [Tree.fromstring(parse_string) for parse_string in parsed_questions['Parse']]

In [None]:
# Take a look at a particular tree: pretty-print the parse at index 415 as an ASCII diagram
trees[415].pretty_print()

                             SBARQ                                    
  _____________________________|____________________________________   
 |                             SQ                                   | 
 |     ________________________|________                            |  
 |    |    |                            VP                          | 
 |    |    |        ____________________|_______                    |  
 |    |    |       |           |                S                   | 
 |    |    |       |           |                |                   |  
 |    |    |       |           |                VP                  | 
 |    |    |       |           |             ___|____               |  
 |    |    |       |           PP           |        VP             | 
 |    |    |       |      _____|____        |    ____|______        |  
WHNP  |    NP      |     |          NP      |   |           NP      | 
 |    |    |       |     |      ____|___    |   |           |       |  

# Basic relations

In [None]:
def dominates(pos1, pos2):
    '''
    Return True when the node at pos1 properly dominates the node at pos2.

    Positions are tree-position tuples, so pos1 dominates pos2 exactly when
    pos1 is a strict prefix of pos2. A node never dominates itself.
    '''
    depth = len(pos1)
    return len(pos2) > depth and pos2[:depth] == pos1

def ccommands(pos1, pos2):
    '''
    Return True when the node at pos1 c-commands the node at pos2.

    As implemented here this holds when pos2 is strictly deeper than pos1,
    is not dominated by pos1, and sits below pos1's mother (the two positions
    agree on everything above pos1's own index). Sisters at the same depth
    are deliberately excluded, as is the node itself.
    '''
    if pos1 == pos2:
        return False
    if dominates(pos1, pos2):
        return False
    # Strictness: only nodes strictly lower in the tree count
    if len(pos2) <= len(pos1):
        return False
    mother_depth = len(pos1) - 1
    return pos2[:mother_depth] == pos1[:mother_depth]

def is_higher_than(pos1, pos2):
    '''
    Return True when pos1 is structurally higher than pos2, i.e. it either
    dominates pos2 or c-commands it.
    '''
    if dominates(pos1, pos2):
        return True
    return ccommands(pos1, pos2)

def get_ancestors(tree, pos):
    '''
    Return the positions of all proper ancestors of pos, ordered from the
    root, (), down to the mother of pos. The root itself has no ancestors.

    `tree` is unused; it is accepted so the signature matches the other
    relation helpers.
    '''
    # Every strict prefix of a tree position is the position of an ancestor.
    # (The earlier `pos[:i-1]` form produced the same set only because i=0
    # wrapped around to `pos[:-1]`, and it returned the mother first.)
    return [pos[:depth] for depth in range(len(pos))]

def get_descendants(tree, pos):
    '''
    Return the positions of every Tree node strictly below pos, in the order
    tree.treepositions() yields them. Leaf strings are skipped.
    '''
    depth = len(pos)
    descendants = []
    for candidate in tree.treepositions():
        if not isinstance(tree[candidate], Tree):
            continue
        if len(candidate) > depth and candidate[:depth] == pos:
            descendants.append(candidate)
    return descendants

def get_mother(tree, pos):
    '''
    Return the position of the mother of pos (pos without its last index).
    For the root, (), this returns () again. `tree` is unused.
    '''
    return pos[:-1]

def get_siblings(tree, pos):
    '''
    Return the positions of the Tree nodes that share a mother with pos,
    excluding pos itself, in tree.treepositions() order.
    '''
    mother = get_mother(tree, pos)
    depth = len(pos)
    siblings = []
    for candidate in tree.treepositions():
        if not isinstance(tree[candidate], Tree):
            continue
        if len(candidate) != depth or candidate == pos:
            continue
        if get_mother(tree, candidate) == mother:
            siblings.append(candidate)
    return siblings

def get_daughters(tree, pos):
    '''
    Return the positions of the Tree nodes immediately below pos, in
    tree.treepositions() order. Leaf strings are skipped, so a preterminal
    has no daughters.
    '''
    daughter_depth = len(pos) + 1
    daughters = []
    for candidate in tree.treepositions():
        if not isinstance(tree[candidate], Tree):
            continue
        if len(candidate) == daughter_depth and candidate[:len(pos)] == pos:
            daughters.append(candidate)
    return daughters

In [None]:
# We will illustrate these relations with a test tree
# (trees[76]: "Which of these animals is a simian?")
tree = trees[76]
tree.pretty_print()

                          SBARQ                        
            ________________|________________________   
          WHNP                        SQ             | 
   ________|_____                     |              |  
  |              PP                   VP             | 
  |     _________|____             ___|___           |  
 WHNP  |              NP          |       NP         | 
  |    |          ____|_____      |    ___|____      |  
 WDT   IN        DT        NNS   VBZ  DT       JJ    . 
  |    |         |          |     |   |        |     |  
Which  of      these     animals  is  a      simian  ? 



In [None]:
# Illustrating the 'dominates' relation: print every subtree properly dominated by the PP
PP = (0,1)
dominated = [pos for pos in tree.treepositions() if dominates(PP, pos) and isinstance(tree[pos], Tree)]
for pos in dominated:
    tree[pos].pretty_print()

 IN
 |  
 of

       NP        
   ____|_____     
  DT        NNS  
  |          |    
these     animals

  DT 
  |   
these

  NNS  
   |    
animals



In [None]:
# Illustrating the 'ccommand' relation: print every subtree c-commanded by the fronted WHNP
WHNP = (0,)
ccommanded = [pos for pos in tree.treepositions() if ccommands(WHNP, pos) and isinstance(tree[pos], Tree)]
for pos in ccommanded:
    tree[pos].pretty_print()

     VP           
  ___|___          
 |       NP       
 |    ___|____     
VBZ  DT       JJ  
 |   |        |    
 is  a      simian

VBZ
 |  
 is

     NP       
  ___|____     
 DT       JJ  
 |        |    
 a      simian

 DT
 |  
 a 

  JJ  
  |    
simian



In [None]:
# Illustrating the 'is_higher_than' relation: print every subtree the PP dominates or c-commands
PP = (0,1)
lower_nodes = [pos for pos in tree.treepositions() if is_higher_than(PP, pos) and isinstance(tree[pos], Tree)]
for pos in lower_nodes:
    tree[pos].pretty_print()

 WDT 
  |   
Which

 IN
 |  
 of

       NP        
   ____|_____     
  DT        NNS  
  |          |    
these     animals

  DT 
  |   
these

  NNS  
   |    
animals



# Relabel fronted Ss

In [None]:
def preposed_S(tree, pos):
    '''
    Return True when the node at pos looks like a preposed (fronted) S.

    A preposed S node has the following properties:
    1. Its label starts with 'S' and it is not the root
    2. It is immediately followed by a comma
    3. Its first daughter is a VP, OR its first daughter is a preposition (IN)

    Nodes without any Tree daughter (e.g. a preterminal whose tag happens to
    start with 'S', such as SYM) are never preposed Ss.
    '''
    if not tree[pos].label().startswith('S'):
        return False
    if len(pos) == 0:
        return False

    siblings = get_siblings(tree, pos)
    my_idx = pos[-1]
    # get_siblings excludes pos itself, so this check requires a node at index
    # my_idx+2 as well: something must follow the comma.
    # NOTE(review): confirm this is intended rather than an off-by-one.
    if len(siblings) <= my_idx+1:
        return False

    my_right_sib = pos[:-1] + (my_idx+1,)
    if tree[my_right_sib].label() != ',':
        return False

    daughters = get_daughters(tree, pos)
    if not daughters:
        # Previously this case raised IndexError on daughters[0]
        return False
    first_label = tree[daughters[0]].label()
    return first_label.startswith('VP') or first_label == 'IN'

In [None]:
# Preposed Ss cause problems with the definitions below (particularly identifying main S node).
# In order to get around this, we will change relevant S categories to SPP.
# The relabelling mutates the trees in place.
from tqdm import tqdm

for tree in tqdm(trees):
    for position in tree.treepositions():
        if isinstance(tree[position], Tree) and preposed_S(tree, position):
            tree[position].set_label('SPP')

100%|██████████| 24795/24795 [00:21<00:00, 1144.44it/s]


In [None]:
# Looking at a representative example: the fronted clause now carries the SPP label
trees[21991].pretty_print()

                                                                               SBARQ                                                                  
                   ______________________________________________________________|__________________________________________________________________   
                  |                              |               |                    VP                                                            | 
                  |                              |               |                ____|_________                                                    |  
                  |                              |               |               |    |         NP                                                  | 
                  |                              |               |               |    |     ____|___________                                        |  
                 SPP                             |               |               |    |    

# Identify main clause constituents

In [None]:
def bounding_node(tree, pos):
    '''
    Take a tree,node pair and determine whether the node is a bounding node.

    A non-root node is a bounding node when either
    1. its label starts with 'S' and its mother's label does not, or
    2. its label ends with 'NP' or 'PP' and its mother's label ends with neither.
    The root is never a bounding node.
    '''
    if len(pos) == 0:
        return False

    def is_S(position):
        return tree[position].label().startswith('S')

    def is_NP_or_PP(position):
        return tree[position].label().endswith(('PP', 'NP'))

    if is_S(pos) and not is_S(get_mother(tree, pos)):
        return True
    return is_NP_or_PP(pos) and not is_NP_or_PP(get_mother(tree, pos))

def get_main_clause(tree):
    '''
    The main clause consists of all nodes dominated by zero bounding nodes.

    Returns the positions of those Tree nodes in tree.treepositions() order.
    A bounding node itself still belongs to the main clause as long as none
    of its ancestors is bounding.
    '''
    def is_unbounded(position):
        return not any(bounding_node(tree, ancestor) for ancestor in get_ancestors(tree, position))

    return [
        position
        for position in tree.treepositions()
        if isinstance(tree[position], Tree) and is_unbounded(position)
    ]

In [None]:
# Illustrating 'get_main_clause': print the subtree rooted at each main-clause position.
# The whole tree is printed first because the root has no ancestors that could bound it.
tree = trees[76]
main_clause = get_main_clause(tree)
for pos in main_clause:
    tree[pos].pretty_print()

                          SBARQ                        
            ________________|________________________   
          WHNP                        SQ             | 
   ________|_____                     |              |  
  |              PP                   VP             | 
  |     _________|____             ___|___           |  
 WHNP  |              NP          |       NP         | 
  |    |          ____|_____      |    ___|____      |  
 WDT   IN        DT        NNS   VBZ  DT       JJ    . 
  |    |         |          |     |   |        |     |  
Which  of      these     animals  is  a      simian  ? 

          WHNP                  
   ________|_____                
  |              PP             
  |     _________|____           
 WHNP  |              NP        
  |    |          ____|_____     
 WDT   IN        DT        NNS  
  |    |         |          |    
Which  of      these     animals

     SQ           
     |             
     VP           
  ___|___         

In [None]:
def get_maximal(tree, cats, exact='prefix'):
    '''
    A node of a given category is "maximal" if it is neither dominated by nor
    c-commanded by another node of the given category within the main clause.

    tree:  the tree to search.
    cats:  a category label, or a collection of labels.
    exact: how labels are compared with cats -- 'exact' (label equals a cat),
           'prefix' (label starts with a cat) or 'suffix' (label ends with a cat).

    Returns the list of maximal positions.
    Raises ValueError for any other value of `exact` (this previously surfaced
    as an UnboundLocalError on `candidates`).
    '''
    matchers = {
        'exact': lambda label, cat: label == cat,
        'prefix': lambda label, cat: label.startswith(cat),
        'suffix': lambda label, cat: label.endswith(cat),
    }
    # Validate the mode before doing any tree work
    if exact not in matchers:
        raise ValueError("exact must be one of 'exact', 'prefix' or 'suffix', got %r" % (exact,))
    matches = matchers[exact]

    if isinstance(cats, str):
        cats = [cats]

    main_clause = get_main_clause(tree)
    candidates = [
        position for position in main_clause
        if any(matches(tree[position].label(), cat) for cat in cats)
    ]
    # Keep only candidates that no other candidate dominates or c-commands
    return [
        candidate for candidate in candidates
        if not any(is_higher_than(competitor, candidate) for competitor in candidates)
    ]

In [None]:
# Illustrating 'get_maximal': with suffix matching on 'NP', the only maximal node
# in this tree is the fronted WHNP
tree = trees[76]
max_NPs = get_maximal(tree, 'NP', exact='suffix')
for max_NP in max_NPs:
    tree[max_NP].pretty_print()

          WHNP                  
   ________|_____                
  |              PP             
  |     _________|____           
 WHNP  |              NP        
  |    |          ____|_____     
 WDT   IN        DT        NNS  
  |    |         |          |    
Which  of      these     animals



# Main S node

In [None]:
def get_main_S(tree):
    '''
    Return the position of the main S node, or None if there is none.

    The main S node has the following properties:
    1. It is in the main clause, its label starts with 'S', and every one of
       its ancestors also has a label starting with 'S',
    2. It dominates all maximal NPs and VPs,
    3. It is the lowest such node.
    '''
    def is_S(position):
        return tree[position].label().startswith('S')

    # 1. S nodes dominated by nothing but other S nodes
    S_segments = [
        position for position in get_main_clause(tree)
        if is_S(position) and all(is_S(ancestor) for ancestor in get_ancestors(tree, position))
    ]
    if not S_segments:
        return None # Bail

    # 2. Keep the S segments that dominate every maximal NP and VP
    maximal_nodes = get_maximal(tree, 'NP', exact='suffix') + get_maximal(tree, 'VP', exact='prefix')
    dominant_Ss = [
        S_segment for S_segment in S_segments
        if all(dominates(S_segment, node) for node in maximal_nodes)
    ]
    if not dominant_Ss:
        return None # Bail

    # 3. The longest position is the lowest node
    return max(dominant_Ss, key=len)

In [None]:
# Illustrating 'get_main_S'
# Note that the 'main S' node may not be the root...
# (here the root is SBARQ, while the main S is the SQ beneath it)
tree = trees[21572]
tree.pretty_print()
main_S = get_main_S(tree)
tree[main_S].pretty_print()

                     SBARQ                       
   ____________________|_______________________   
  |                    SQ                      | 
  |      ______________|__________________     |  
  |     |              NP                 |    | 
  |     |    __________|_____________     |    |  
WHADVP  |   |    |         NML       |    VP   | 
  |     |   |    |      ____|___     |    |    |  
 WRB   VBD  DT   JJ   NNP      NNP  NNP   VB   . 
  |     |   |    |     |        |    |    |    |  
Where  did the first  Hard     Rock Cafe open  ? 

               SQ                    
  _____________|__________________    
 |             NP                 |  
 |    _________|_____________     |   
 |   |    |        NML       |    VP 
 |   |    |     ____|___     |    |   
VBD  DT   JJ  NNP      NNP  NNP   VB 
 |   |    |    |        |    |    |   
did the first Hard     Rock Cafe open



In [None]:
# Checking bail rate: how many trees have no identifiable main S
fails = [i for i,tree in tqdm(list(enumerate(trees))) if get_main_S(tree) is None]
# Membership tests against a set keep the complement linear in the number of trees
failed_indices = set(fails)
passes = [j for j in range(len(trees)) if j not in failed_indices]
print('Passes:', len(passes))
print('Fails:', len(fails))
print('Total:', len(trees))

100%|██████████| 24795/24795 [01:04<00:00, 386.35it/s]


Passes: 24519
Fails: 276
Total: 24795


In [None]:
# Check handling of edge cases
# Specifically, cases where the main S is not the root node
# (the root's position is the empty tuple, so compare positions rather than whole trees)
edge_cases = [i for i in tqdm(passes) if get_main_S(trees[i]) != ()]
print('\n', len(edge_cases))

100%|██████████| 24519/24519 [01:03<00:00, 383.83it/s]


 749





In [None]:
# If we want, we can look at the sentences (tree index followed by the question text)
for i in edge_cases:
    print(i, parsed_questions['Question'][i])

17 If philosophers hosted daytime TV, on what show might you hear, "It is not a lack of love, but a lack of friendship that makes unhappy marriages?
70 On which classic sitcom were the actors who played the main family's "mom" and "dad" born on the exact same day: June 21, 1947?
143 Where will the Summer Olympic Games be held in the year 2000?
169 Where would a person go to put something into hock?
206 At which of these ages might a person typically have a midlife crisis?
216 In what children's game does a player call out, "Ready or not, here I come"?
260 Where would you find the “crypts of Lieberkühn”?
291 On what piece of household furniture would one be most likely to find a duvet?
294 In which country does most of the action in the movie "Casablanca" take place?
308 For what NBA team did former presidential candidate Bill Bradley play?
439 In what activity do people use siteswap notation to keep track of the many complex patterns?
497 Based on the lunar calendar, in which country i

# Main VP

In [None]:
# Word forms treated as auxiliaries; matching is case-sensitive
auxes = {'is', 'are', 'am', 'was', 'were', 'do', 'does', 'did', 'has', 'have', 'had', 'will', 'would', 'can', 'could', 'shall', 'should', 'may', 'might', 'must'}

def get_main_VP(tree):
    '''
    Return the position of the main VP node, or None if it cannot be identified.

    The procedure is:
    1. Take the maximal VP* nodes of the main clause, dropping any whose
       leaves are all auxiliaries (an inverted aux parsed as a VP).
       If none remain and there is exactly one maximal SQ with a VB* daughter,
       use that SQ instead.
    2. Keep the shallowest candidate; bail (None) if several tie.
    3. If the candidate has no *NP sibling to its left, move up to its mother.
    4. While the mother has only one daughter, keep moving up, stopping at the root.
    '''
    # 1. Get the highest VPs
    max_VPs = get_maximal(tree, 'VP')
    # Exception: Sometimes we get an inverted aux classed as a VP
    max_VPs = [max_VP for max_VP in max_VPs if not set(tree[max_VP].leaves()).issubset(auxes)]

    if not max_VPs:
        # Exception: Sometimes we get an SQ directly dominating the main V
        max_SQ = get_maximal(tree, 'SQ', exact='exact')
        if len(max_SQ) == 1:
            daughters = get_daughters(tree, max_SQ[0])
            if any(tree[daughter].label().startswith('VB') for daughter in daughters):
                max_VPs = max_SQ
    # Escape early if failed to find any VPs
    if not max_VPs:
        return None

    # 2. Keep the shallowest candidate; ambiguity means we cannot decide
    min_length = min(len(max_VP) for max_VP in max_VPs)
    main_VPs = [max_VP for max_VP in max_VPs if len(max_VP)==min_length]
    if len(main_VPs) > 1:
        return None
    main_VP = main_VPs[0]

    # 3. Check if mother contains an NP to the left. If it *doesn't*, then take that node instead.
    left_siblings = [sibling for sibling in get_siblings(tree,main_VP) if sibling[-1]<main_VP[-1]]
    if not any(tree[sibling].label().endswith('NP') for sibling in left_siblings):
        main_VP = get_mother(tree, main_VP)

    # 4. Check if mother is unary-branching. If so, then take that node instead.
    # The climb must stop at the root: get_mother(tree, ()) is () again, so without
    # the length check a root with a single daughter would be revisited forever.
    while len(main_VP) > 0:
        mother = get_mother(tree, main_VP)
        if len(get_daughters(tree, mother)) != 1:
            break
        main_VP = mother

    return main_VP

In [None]:
# Illustrating 'get_main_VP': print the whole tree, then the subtree selected as the main VP
tree = trees[36]
tree.pretty_print()
main_VP = get_main_VP(tree)
tree[main_VP].pretty_print()

                                                            SBARQ                                              
       _______________________________________________________|______________________________________________   
      |                           S                                                                          | 
      |                 __________|_____________                                                             |  
      |                |                        VP                                                           | 
      |                |           _____________|___________________                                         |  
      |                |          |                                 NP                                       | 
      |                |          |              ___________________|_____________                           |  
      |                |          |             |                                 VP                

In [None]:
# Disabled (kept as a string so it does not execute): this check did not work when run over all trees.
# Likely cause: get_main_VP's unary-branch climb has to stop at the root, because
# get_mother(tree, ()) is () again; if it does not, any tree whose root has a single
# daughter loops forever. Re-enable once that climb is guarded.
'''
# Check bail rate
fails = [i for i,tree in enumerate(trees) if get_main_VP(tree) is None]
passes = [j for j in range(len(trees)) if j not in fails]
print('Passes:', len(passes))
print('Fails:', len(fails))
print('Total:', len(trees))
'''

"\n# Check bail rate\nfails = [i for i,tree in enumerate(trees) if get_main_VP(tree) is None]\npasses = [j for j in range(len(trees)) if j not in fails]\nprint('Passes:', len(passes))\nprint('Fails:', len(fails))\nprint('Total:', len(trees))\n"

In [None]:
# Disabled (kept as a string so it does not execute): it reads `fails`, presumably the
# get_main_VP failures from the disabled bail-rate check in the previous cell.
'''
# If we want, we can look at the failures
for i in fails:
    print(i, parsed_questions['Question'][i])

trees[961].pretty_print()
'''

"\n# If we want, we can look at the failures\nfor i in fails:\n    print(i, parsed_questions['Question'][i])\n\ntrees[961].pretty_print()\n"

# Collecting labels

In [None]:
from tqdm import tqdm

def label_question(tree):
    '''
    Classify one parsed question.

    Returns one of:
    - 'manual':     no main S was found, or a WHNP is present but no main VP was found
    - 'ex-situ':    the main S has a WH* sibling (fronted wh-adverbial), or it has a
                    WHNP daughter and the main VP is not one of its daughters
    - 'wh-subject': the main S has a WHNP daughter and the main VP is also its daughter
    - 'in-situ':    the main S has neither a WH* sibling nor a WHNP daughter
    '''
    main_S = get_main_S(tree)
    if main_S is None: # Bail
        return 'manual'

    # get_siblings / get_daughters always return lists, so no None checks are needed
    siblings = get_siblings(tree, main_S)
    if any(tree[sibling].label().startswith('WH') for sibling in siblings):
        # This will happen if we have a fronted wh-adverbial
        return 'ex-situ'

    daughters = get_daughters(tree, main_S)
    if not any(tree[daughter].label() == 'WHNP' for daughter in daughters):
        return 'in-situ'

    main_VP = get_main_VP(tree)
    if main_VP is None: # Bail
        return 'manual'
    return 'wh-subject' if main_VP in daughters else 'ex-situ'

labels = [label_question(tree) for tree in tqdm(trees)]

100%|██████████| 24795/24795 [01:27<00:00, 281.98it/s]


In [None]:
# Attach the labels, export the columns needed downstream, then display the labelled frame
parsed_questions['Label'] = labels
export_columns = ['Contestant', 'Question', 'Label']
labeled_questions = parsed_questions[export_columns]
labeled_questions.to_csv('labeled_questions.csv', index=False)
parsed_questions

Unnamed: 0,Contestant,Question,Parse,Label
0,Aaron Gold,"According to Good Housekeeping, most homeowner...",(S (PP (VBG According) (PP (IN to) (NP (NNP Go...,in-situ
1,Aaron Gold,"""There is a Sucker Born Ev'ry Minute"" and ""Joi...","(S (NP (`` "") (S (NP (EX There)) (VP (VBZ is) ...",in-situ
2,Aaron Gold,"Until it adopted the euro in 2002, fittingly, ...",(SBARQ (SBAR (IN Until) (S (NP (PRP it)) (VP (...,wh-subject
3,Aaron Gold,Which of these is the correct spelling of what...,(SBARQ (WHNP (WHNP (WDT Which)) (PP (IN of) (N...,wh-subject
4,Aaron Hauck,"In 2006, the WHO reported that thousands of pe...","(S (PP (IN In) (NP (CD 2006))) (, ,) (NP (DT t...",in-situ
...,...,...,...,...
24790,Zeke Spector,"As Crayola's website explains, ""while artists ...",(S (SBAR (IN As) (S (NP (NP (NNP Crayola) (POS...,in-situ
24791,Zeke Spector,"Born Amethyst Amelia Kelly, what recording art...",(SBARQ (S (VP (VBN Born) (NP (NNP Amethyst) (N...,wh-subject
24792,Zeke Spector,Which one of these things is NOT true of the p...,(SBARQ (WHNP (NP (WDT Which) (CD one)) (PP (IN...,manual
24793,Zeke Spector,Since in 1992 it was ruled that anyone in the ...,(S (SBAR (IN Since) (S (PP (IN in) (NP (CD 199...,in-situ




In [None]:
# Counts
label_column = parsed_questions['Label']

def count_label(label):
    '''Number of questions carrying exactly this label.'''
    return len(label_column[label_column == label])

print('# in-situ questions:', count_label('in-situ'))
print('# ex-situ questions:', count_label('ex-situ'))
print('# wh-subject questions:', count_label('wh-subject'))
print('Total non-manual questions', len(label_column[label_column != 'manual']))
print('Total questions', len(label_column))

# in-situ questions: 12873
# ex-situ questions: 1880
# wh-subject questions: 9629
Total non-manual questions 24382
Total questions 24795


# Main Subj (Abandoned)

In [None]:
# I thought we might need to identify the main subject this way, but I went down a wrong path with this.

def get_main_subj(tree):
    '''
    (Abandoned approach.) Return the position of the main subject NP, or None.

    The main subject should have these properties:
    1. It is a daughter of the main S node
    2. It is the closest such node to the main verb either on the left (first preference) or on the right

    If the main S has exactly one daughter whose label ends in 'NP', that
    daughter is returned without looking at the verb at all.

    Returns None when get_main_S or get_main_VP returns None, when the main S
    has no *NP daughter, when the main VP has no VB*/MD* daughter, or when no
    NP lies on the side of the verb that is being searched.
    '''

    # 1. Get NP daughters of main S
    main_S = get_main_S(tree)
    # If we fail to find a maximal S node, then we've failed. Return None.
    if main_S is None:
        return None
    daughters = get_daughters(tree, main_S)
    main_NPs = [daughter for daughter in daughters if tree[daughter].label().endswith('NP')]
    if len(main_NPs) == 0:
        return None
    if len(main_NPs) == 1:
        return main_NPs[0]

    # 2. Several candidate NPs: narrow down to the one closest to the main verb
    main_VP = get_main_VP(tree)
    if main_VP is None:
        return None
    # Get the main verb: verb (VB*) or modal (MD*) daughters of the main VP
    main_VBs = [VB for VB in get_daughters(tree, main_VP) if tree[VB].label().startswith('VB') or tree[VB].label().startswith('MD')]
    if not main_VBs:
        return None
    # If there are multiple, we want the leftmost within the VP
    main_VB = min(main_VBs, key=lambda x:x[-1]) # Smallest last index = leftmost daughter
    # Compare depths (position length minus one) of the verb and of the NPs
    VB_height = len(main_VB)-1
    NP_height = len(main_NPs[0])-1
    if VB_height > NP_height: # If the VB is lower than the NPs
        # Index, at the NPs' depth, of the branch that contains the verb
        VB_idx = main_VB[NP_height]
        # Only NPs to the left of that branch qualify
        candidates = [NP for NP in main_NPs if NP[-1]<VB_idx]
        # Nothing to the left of the verb: give up
        if not candidates:
            return None
        # All differences are negative here; the largest (least negative) one belongs to the nearest NP on the left
        main_NP = max(candidates, key=lambda x: x[len(x)-1] - VB_idx)
    else: # If the VB is not lower than the NPs (they should be same height in this case)
        VB_idx = main_VB[-1]
        # Only NPs to the right of the verb qualify
        candidates = [NP for NP in main_NPs if NP[VB_height]>VB_idx]
        if not candidates:
            return None
        # All differences are negative here; the largest (least negative) one belongs to the nearest NP on the right
        main_NP = max(candidates, key=lambda x: VB_idx - x[len(x)-1])

    return main_NP

In [None]:
# Testing: print the whole tree, then the main S, main VP and main subject found for it.
# NOTE(review): each getter can return None; indexing the tree with None is presumably
# an error, so guard these lookups if this cell is reused on other trees.
tree = trees[63]
tree.pretty_print()
print('main_S')
main_S = get_main_S(tree)
tree[main_S].pretty_print()

print('main_VP')
main_VP = get_main_VP(tree)
tree[main_VP].pretty_print()

# The header printed below says 'main_NP' although the value is the main subject
print('main_NP')
main_subj = get_main_subj(tree)
tree[main_subj].pretty_print()

                                                    SBARQ                                                           
           ___________________________________________|___________________________________________________________   
          |                                 |   |           SQ                                                    | 
          |                                 |   |      _____|_______________                                      |  
          |                                 |   |     |     |               VP                                    | 
          |                                 |   |     |     |         ______|___________                          |  
         SPP                                |   |     |     |        |                  S                         | 
   _______|____________                     |   |     |     |        |       ___________|___                      |  
  |                    S                    |   |     |     

In [None]:
# Check exceptions: how many trees have no identifiable main subject
fails = [i for i,tree in enumerate(trees) if get_main_subj(tree) is None]
# Membership tests against a set keep the complement linear in the number of trees
failed_indices = set(fails)
passes = [j for j in range(len(trees)) if j not in failed_indices]
print('Passes:', len(passes))
print('Fails:', len(fails))
print('Total:', len(trees))

Passes: 24029
Fails: 766
Total: 24795
