In [None]:
import os
from os.path import isfile, join
import logging
import re

In [None]:
# Delimiters that open and close a marked-up span in the dataset files.
markupStart = '[['
markupEnd = ']]'
# Matches one marked-up span: '[[', any run of non-bracket characters, ']]'.
# Written as a raw string so the backslashes reach the regex engine untouched
# ('\[' in a normal string literal is an invalid escape sequence).
rePattern = re.compile(r'\[\[[^\[\]]*\]\]')

In [None]:
def getRegexMatches(fileText):
    """Collect every marked-up span found in ``fileText``.

    Returns a list with one ``[start, length, text]`` entry per match of
    ``rePattern``, in order of appearance.  ``text`` is the matched span with
    the surrounding markup characters stripped, ``length`` is ``len(text)``
    and ``start`` is the match position shifted left by 4 characters for
    every earlier match.
    """
    def _describe(ordinal, hit):
        # Strip the opening markup characters first, then the closing ones.
        inner = hit.group().strip(markupStart)
        inner = inner.strip(markupEnd)
        # Each earlier match contributes 4 markup characters ('[[' + ']]')
        # that are subtracted from the raw match position.
        return [hit.start() - 4 * ordinal, len(inner), inner]

    return [_describe(ordinal, hit)
            for ordinal, hit in enumerate(rePattern.finditer(fileText))]

def getUniqueNames(matches):
    """Return the set of distinct words appearing in the matched spans.

    ``matches`` is an iterable of 3-item entries whose last item is the
    matched text.  Each text is split on single spaces; a trailing
    possessive ``'s`` is removed from a word before it is added to the set.
    """
    return {
        word[:-2] if word.endswith('\'s') else word
        for _, _, text in matches
        for word in text.split(' ')
    }

In [None]:
def validateSquareBrackets(fileText,fileName):
    """Check that opening and closing markup occur equally often.

    Counts the occurrences of ``markupStart`` and ``markupEnd`` in
    ``fileText``.  When the counts differ, an error naming the file (last
    '/'-separated component of ``fileName``) and both counts is logged.

    Returns True when the counts are equal, False otherwise.
    """
    openCount = fileText.count(markupStart)
    closeCount = fileText.count(markupEnd)
    balanced = openCount == closeCount
    if not balanced:
        shortName = fileName.split('/')[-1]
        message = " ValidateSquareBrackets failed: {} {}: {} {}: {}".format(
            shortName, markupStart, openCount, markupEnd, closeCount)
        logging.error(message)
    return balanced

def validateOccurences(fileText,fileName):
    """Log occurrences of marked-up names that are unmarked or mismatched.

    Every word that appears inside a marked-up span somewhere in
    ``fileText`` is searched for (as a whole word) in the text with the
    markup removed.  An occurrence preceded by a space, or located at the
    very start of the text, is reported through ``logging.error`` when

    * no marked-up word starts at that position ("not marked"), or
    * a different marked-up word starts at that position ("invalid match").

    Occurrences preceded by any other character are skipped.  A word that
    cannot be compiled into a regular expression is reported as possible
    wrong markup and skipped.

    Parameters:
        fileText: full text of the file, including markup.
        fileName: path of the file; only its last '/'-separated component
            is used, in log messages.

    Returns:
        None.
    """
    shortName = fileName.split('/')[-1]
    matches = getRegexMatches(fileText)

    # Map the start position of each word inside a marked-up span to that
    # word, with a trailing possessive "'s" removed.  Positions come from
    # getRegexMatches, which already discounts the markup characters.
    matchesDict = dict()
    for spanStart, _, name in matches:
        currStartOffset = 0
        for partName in name.split(' '):
            currStartIndex = spanStart + currStartOffset
            # +1 accounts for the single space that separated the words.
            currStartOffset += len(partName) + 1
            if partName.endswith('\'s'):
                partName = partName[:-2]
            matchesDict[currStartIndex] = partName

    uniqueNames = getUniqueNames(matches)
    fileTextOrig = fileText.replace(markupStart,'').replace(markupEnd,'')
    for un in sorted(uniqueNames):
        try:
            # The name is used as a regex on purpose: a name that is not a
            # valid pattern is treated as a markup mistake.
            unPattern = re.compile(r'\b'+ un+ r'\b')
        except (re.error, OverflowError, RecursionError):
            # Only pattern-compilation failures are handled here, so that
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            logging.error(" validateOccurences {} Possible wrong markup: {} ".format(shortName,un))
            continue
        for found in unPattern.finditer(fileTextOrig):
            occ = found.start()
            # Only occurrences at the start of the text or right after a
            # space are checked.
            if occ > 0 and fileTextOrig[occ-1]!=' ':
                continue
            if occ not in matchesDict:
                logging.error(" validateOccurences {} : {} not marked around location {}".format(shortName,un,occ))
                continue
            if matchesDict[occ]!=un:
                logging.error(" validateOccurences {} : {} invalid match around location {}".format(shortName,un,occ))
            

def validateFile(fileName):
    """Run the markup validations on a single file.

    Reads the whole file at ``fileName``, checks that the markup brackets
    are balanced and, only if they are, checks the individual marked-up
    occurrences.  Problems are reported by the validators through logging.

    Returns:
        None.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(fileName) as handle:
        fileText = handle.read()
    if not validateSquareBrackets(fileText,fileName):
        return
    validateOccurences(fileText,fileName)

        
    

In [None]:
# Sub-folders of the dataset whose '.txt' files are validated.
folderNames = ['Abhinav','Bidyut']
folderPath = '../dataset_markup/'

# Visit the folders, and the entries inside each folder, in sorted order.
for folderName in sorted(folderNames):
    folderName = join(folderPath,folderName)
    # Lazily build the full path of every directory entry.
    candidates = (join(folderName,entry) for entry in sorted(os.listdir(folderName)))
    for fileName in candidates:
        if not fileName.endswith('.txt'):
            continue
        # Skip anything named '*.txt' that is not a regular file.
        if isfile(fileName):
            validateFile(fileName)