In [28]:
import os
from os.path import isfile, join
import logging
import re
import commonUtils

In [29]:
# Delimiters that wrap a marked-up name in the dataset files, e.g. "[[John Smith]]".
markupStart = '[['
markupEnd = ']]'
# Matches one marked-up span: "[[", then any run of non-bracket characters, then "]]".
# Written as a raw string so that "\[" and "\]" reach the regex engine unchanged
# instead of relying on Python keeping unrecognised string escapes.
rePattern = re.compile(r'\[\[[^\[\]]*\]\]')

In [30]:
def getRegexMatches(fileText):
    """Find every marked-up span in ``fileText``.

    Args:
        fileText: Text that may contain spans wrapped in ``markupStart`` /
            ``markupEnd``.

    Returns:
        A list with one ``[start, length, text]`` entry per span matched by
        ``rePattern``, in order of appearance. ``text`` is the span without its
        delimiters, ``length`` is ``len(text)`` and ``start`` is the span's
        position shifted left by the delimiters of all preceding matches, i.e.
        its position once those delimiters are removed from the text.
    """
    # Characters contributed by one start/end delimiter pair; derived from the
    # constants so the offsets stay correct if the delimiters ever change.
    markerOverhead = len(markupStart) + len(markupEnd)
    matches = list()
    for i,m in enumerate(rePattern.finditer(fileText)):
        rawText = m.group()
        # Slice the delimiters off explicitly: str.strip() treats its argument
        # as a set of characters, not as a prefix/suffix.
        matchedText = rawText[len(markupStart):len(rawText)-len(markupEnd)]
        matches.append([m.start()-i*markerOverhead,len(matchedText),matchedText])
    return matches

def getUniqueNames(matches):
    """Collect the distinct words that appear inside the given matches.

    Args:
        matches: Iterable of three-item entries whose last item is the
            marked-up text.

    Returns:
        A set holding every space-separated word of every marked-up text,
        with a trailing ``'s`` dropped and any remaining trailing
        apostrophes removed.
    """
    def _normalise(word):
        # Drop the possessive suffix first, then any leftover apostrophes.
        if word.endswith('\'s'):
            word = word[:-2]
        return word.rstrip('\'')

    return {
        _normalise(word)
        for _start, _length, markedText in matches
        for word in markedText.split(' ')
    }

In [31]:
def validateSquareBrackets(fileText,fileName):
    """Check that start and end markers occur equally often in ``fileText``.

    Args:
        fileText: Contents of the file being validated.
        fileName: Path of the file; only the part after the last ``/`` is
            used in the log message.

    Returns:
        True when the number of ``markupStart`` occurrences equals the number
        of ``markupEnd`` occurrences, otherwise False (after logging an error
        with both counts).
    """
    openCount = fileText.count(markupStart)
    closeCount = fileText.count(markupEnd)
    isBalanced = openCount == closeCount
    if not isBalanced:
        shortName = fileName.split('/')[-1]
        message = " ValidateSquareBrackets failed: {} {}: {} {}: {}".format(
            shortName, markupStart, openCount, markupEnd, closeCount)
        logging.error(message)
    return isBalanced

def validateOccurences(fileText,fileName):
    """Report places in a file where a marked-up name occurs without markup.

    Every word that is marked up somewhere in ``fileText`` is searched for in
    the text with the markers removed. An occurrence that is preceded by a
    space (or sits at position 0) is logged as an error when no marked-up word
    starts at that position, or when the marked-up word starting there differs.

    Side effects: adds the number of matches to the module-level
    ``totalMarkups`` and merges the file's unique words into the module-level
    ``totalUniqueMarkups``; both must exist before this is called.

    Args:
        fileText: Contents of the file, including markup.
        fileName: Path of the file; only the part after the last ``/`` is
            used in log messages.
    """
    global totalMarkups,totalUniqueMarkups
    matches = getRegexMatches(fileText)
    totalMarkups+=len(matches)

    # Start position (in the text without markers) -> normalised marked-up word.
    matchesDict = dict()
    for start,_,name in matches:
        currStartOffset = 0
        for partName in name.split(' '):
            currStartIndex = start + currStartOffset
            # Advance by the raw word length plus the separating space before
            # the word is normalised below.
            currStartOffset+= len(partName)+1
            if partName.endswith('\'s'):
                partName = partName[:-2]
            partName = partName.rstrip('\'')
            matchesDict[currStartIndex] = partName

    uniqueNames = getUniqueNames(matches)
    totalUniqueMarkups |=uniqueNames
    fileTextOrig = fileText.replace(markupStart,'').replace(markupEnd,'')
    shortName = fileName.split('/')[-1]
    for un in sorted(uniqueNames):
        if len(un)<=1:
            continue
        # NOTE(review): the word is not passed through re.escape, so regex
        # metacharacters in it are interpreted; a word that fails to compile
        # is reported as possible wrong markup.
        try:
            unPattern = re.compile(r'\b'+ un+ r'\b')
        except re.error:
            # Only pattern errors are handled here; anything else (including
            # KeyboardInterrupt) is allowed to propagate.
            logging.error(" validateOccurences {} : Possible wrong markup: {} ".format(shortName,un))
            continue
        for found in unPattern.finditer(fileTextOrig):
            occ = found.start()
            # Only occurrences at the very start or right after a space are checked.
            if occ > 0 and fileTextOrig[occ-1]!=' ':
                continue
            if occ not in matchesDict:
                logging.error(" validateOccurences {} : {} not marked around location {}".format(shortName,un,occ))
                continue
            if matchesDict[occ]!=un :
                logging.error(" validateOccurences {} : {} invalid match around location {}".format(shortName,un,occ))
            

def validateFile(fileName):
    """Run the markup validations on a single file.

    Reads the file, checks that start and end markers are balanced and, only
    if they are, checks the individual marked-up occurrences.

    Args:
        fileName: Path of the file to validate.
    """
    # Context manager so the handle is closed deterministically.
    with open(fileName) as fileHandle:
        fileText = fileHandle.read()
    if not validateSquareBrackets(fileText,fileName):
        return
    validateOccurences(fileText,fileName)


def checkAgainstAllMarkups(fileText,fileName):
    """Report unmarked occurrences of any word marked up anywhere in the corpus.

    Each word in the module-level ``totalUniqueMarkups`` is searched for in
    ``fileText`` with the markers removed. An occurrence that is preceded by a
    space (or sits at position 0) is logged as an error when no marked-up word
    of this file starts at that position.

    Args:
        fileText: Contents of the file, including markup.
        fileName: Path of the file; only the part after the last ``/`` is
            used in log messages.
    """
    global totalUniqueMarkups
    matches = getRegexMatches(fileText)

    # Start positions (in the text without markers) of every marked-up word.
    # Only the positions are consulted below, so the words are not stored.
    markedStarts = set()
    for start,_,name in matches:
        currStartOffset = 0
        for partName in name.split(' '):
            markedStarts.add(start + currStartOffset)
            currStartOffset+= len(partName)+1   # +1 for the separating space

    fileTextOrig = fileText.replace(markupStart,'').replace(markupEnd,'')
    shortName = fileName.split('/')[-1]
    for un in sorted(totalUniqueMarkups):
        if len(un)<=1:
            continue
        try:
            unPattern = re.compile(r'\b'+ un+ r'\b')
        except re.error:
            # Words that are not valid patterns are skipped silently here.
            # Only pattern errors are handled; anything else (including
            # KeyboardInterrupt) is allowed to propagate.
            continue
        for found in unPattern.finditer(fileTextOrig):
            occ = found.start()
            # Only occurrences at the very start or right after a space are checked.
            if occ > 0 and fileTextOrig[occ-1]!=' ':
                continue
            if occ not in markedStarts:
                logging.error(" checkAgainstAllMarkups {} : {} not marked around location {}".format(shortName,un,occ))
    

In [34]:
# Validate every file returned by commonUtils.getAllFiles for the annotator
# folders and print the corpus-wide totals.
folderPath = '../dataset_markup/'
folderNames = ['Abhinav','Bidyut','Chirayu']

# Module-level accumulators: validateOccurences updates both through `global`,
# so they have to be (re)initialised here before the loop runs.
totalMarkups, totalUniqueMarkups = 0, set()

for fileName in commonUtils.getAllFiles(folderNames,folderPath):
    validateFile(fileName)

print("totalMarkups:",totalMarkups)
print("totalUniqueMarkups:",len(totalUniqueMarkups))

ERROR:root: validateOccurences 094.txt : Inzamam invalid match around location 301
ERROR:root: validateOccurences 224.txt : Ireland not marked around location 1821
ERROR:root: validateOccurences 340.txt : Bedford not marked around location 592


totalMarkups: 3951
totalUniqueMarkups: 1599


In [33]:
# Second pass: check every .txt file against the words collected in
# totalUniqueMarkups. This relies on folderNames, folderPath and
# totalUniqueMarkups having been set by the validation cell beforehand.
for folderName in sorted(folderNames):
    folderDir = join(folderPath,folderName)
    for entryName in sorted(os.listdir(folderDir)):
        filePath = join(folderDir,entryName)
        if filePath.endswith('.txt') and isfile(filePath):
            # Context manager so each handle is closed before the next file is opened.
            with open(filePath) as fileHandle:
                fileText = fileHandle.read()
            checkAgainstAllMarkups(fileText,filePath)

ERROR:root: checkAgainstAllMarkups 006.txt : Friday not marked around location 1132
ERROR:root: checkAgainstAllMarkups 015.txt : Friday not marked around location 414
ERROR:root: checkAgainstAllMarkups 019.txt : James not marked around location 190
ERROR:root: checkAgainstAllMarkups 020.txt : de not marked around location 580
ERROR:root: checkAgainstAllMarkups 024.txt : Jose not marked around location 29
ERROR:root: checkAgainstAllMarkups 024.txt : Jose not marked around location 377
ERROR:root: checkAgainstAllMarkups 034.txt : Holland not marked around location 967
ERROR:root: checkAgainstAllMarkups 034.txt : de not marked around location 1130
ERROR:root: checkAgainstAllMarkups 040.txt : James not marked around location 1224
ERROR:root: checkAgainstAllMarkups 043.txt : Milan not marked around location 1062
ERROR:root: checkAgainstAllMarkups 049.txt : Holland not marked around location 434
ERROR:root: checkAgainstAllMarkups 058.txt : Charlton not marked around location 756
ERROR:root: 

KeyboardInterrupt: 