In [36]:
import os
from os.path import isfile, join
import logging
import re


In [37]:
# Delimiters that annotators place around a marked-up span, e.g. "[[Some Name]]".
markupStart = '[['
markupEnd = ']]'
# One complete markup: "[[", any run of characters that are not square brackets, "]]".
# Written as a raw string so the backslashes reach the regex engine unchanged
# instead of being parsed as (invalid) string escape sequences.
rePattern = re.compile(r'\[\[[^\[\]]*\]\]')

In [38]:
def getRegexMatches(fileText):
    """Locate every marked-up span in ``fileText``.

    Args:
        fileText: text that may contain spans wrapped in ``markupStart`` /
            ``markupEnd``.

    Returns:
        A list with one ``[start, length, text]`` entry per match of
        ``rePattern``, in order of appearance, where

        * ``text`` is the matched span without its delimiters,
        * ``length`` is ``len(text)``,
        * ``start`` is the match offset minus the combined delimiter width of
          all earlier matches, i.e. the offset of ``text`` once the delimiters
          of the matched spans have been removed from ``fileText``.
    """
    # Width removed from the text per markup; derived from the markers rather
    # than hard-coded so it cannot drift if the markers are changed.
    markerWidth = len(markupStart) + len(markupEnd)
    matches = list()
    for i, m in enumerate(rePattern.finditer(fileText)):
        wholeMatch = m.group()
        # Slice the delimiters off explicitly: str.strip() would treat the
        # marker as a *set of characters*, not as a prefix/suffix.
        matchedText = wholeMatch[len(markupStart):len(wholeMatch) - len(markupEnd)]
        matches.append([m.start() - i * markerWidth, len(matchedText), matchedText])
    return matches

def getUniqueNames(matches):
    """Collect the distinct space-separated tokens of all matched names.

    Args:
        matches: iterable of three-item entries whose last item is the
            marked-up text (the shape produced by ``getRegexMatches``).

    Returns:
        A set containing every token obtained by splitting each name on a
        single space; a token ending in ``'s`` has that suffix removed first.
    """
    return {
        token[:-2] if token.endswith('\'s') else token
        for _start, _length, name in matches
        for token in name.split(' ')
    }

In [71]:
def validateSquareBrackets(fileText,fileName):
    """Check that opening and closing markers occur equally often.

    Args:
        fileText: full text of the file being validated.
        fileName: path of that file; only its part after the last ``/`` is
            used, in the error message.

    Returns:
        True when ``markupStart`` and ``markupEnd`` have the same number of
        occurrences in ``fileText``; otherwise logs an error with both counts
        and returns False.
    """
    openCount = fileText.count(markupStart)
    closeCount = fileText.count(markupEnd)
    if openCount == closeCount:
        return True

    shortName = fileName.split('/')[-1]
    message = " ValidateSquareBrackets failed: {} {}: {} {}: {}".format(
        shortName, markupStart, openCount, markupEnd, closeCount)
    logging.error(message)
    return False

def validateOccurences(fileText,fileName):
    """Check that each name marked up in a file is marked at every occurrence.

    For every token of every marked-up span in ``fileText`` the function
    searches the marker-free text for whole-word occurrences of that token and
    logs an error when an occurrence is not covered by a markup, or is covered
    by a markup token that differs from it.

    Side effects: adds the number of markups found to the global
    ``totalMarkups`` and merges the file's unique tokens into the global
    ``totalUniqueMarkups``; both globals must already exist.

    Args:
        fileText: full text of the file, including markers.
        fileName: path of the file; only the part after the last ``/`` is
            used, in log messages.

    Returns:
        None. Findings are reported through ``logging.error``.
    """
    global totalMarkups,totalUniqueMarkups
    matches = getRegexMatches(fileText)
    totalMarkups += len(matches)

    # Map: offset of a marked token in the marker-free text -> that token
    # (with a trailing 's removed). Offsets advance by token length plus the
    # single separating space.
    matchesDict = dict()
    for start, _, name in matches:
        currStartOffset = 0
        for partName in name.split(' '):
            currStartIndex = start + currStartOffset
            currStartOffset += len(partName) + 1
            if partName.endswith('\'s'):
                partName = partName[:-2]
            matchesDict[currStartIndex] = partName

    uniqueNames = getUniqueNames(matches)
    totalUniqueMarkups |= uniqueNames
    fileTextOrig = fileText.replace(markupStart,'').replace(markupEnd,'')
    shortName = fileName.split('/')[-1]

    for un in sorted(uniqueNames):
        # Empty and single-character tokens are not checked.
        if len(un) <= 1:
            continue
        try:
            # The token is used as a regex on purpose-unescaped, so a token that
            # is not a valid pattern is reported as a suspicious markup.
            unPattern = re.compile(r'\b' + un + r'\b')
        except (re.error, OverflowError):
            # Only pattern-compilation failures are handled here; anything
            # else (e.g. KeyboardInterrupt) is allowed to propagate.
            logging.error(" validateOccurences {} : Possible wrong markup: {} ".format(shortName,un))
            continue
        for found in unPattern.finditer(fileTextOrig):
            occ = found.start()
            # Only occurrences at offset 0 or directly preceded by a space
            # are checked.
            if occ > 0 and fileTextOrig[occ-1] != ' ':
                continue
            if occ not in matchesDict:
                logging.error(" validateOccurences {} : {} not marked around location {}".format(shortName,un,occ))
                continue
            if matchesDict[occ] != un:
                logging.error(" validateOccurences {} : {} invalid match around location {}".format(shortName,un,occ))
            

def validateFile(fileName):
    """Run the markup validations on one file.

    Reads the file, checks that the markers are balanced and, only if they
    are, checks the marked-up occurrences.

    Args:
        fileName: path of the text file to validate.

    Returns:
        None. Problems are reported by the validators through ``logging``.
    """
    # Context manager so the handle is closed as soon as the text is read,
    # instead of being left open until garbage collection.
    with open(fileName) as handle:
        fileText = handle.read()
    if not validateSquareBrackets(fileText,fileName):
        return
    validateOccurences(fileText,fileName)


def checkAgainstAllMarkups(fileText,fileName):
    """Report occurrences of any known markup token that are unmarked in a file.

    Every token in the global ``totalUniqueMarkups`` set is searched for as a
    whole word in the marker-free text of ``fileText``; an occurrence whose
    offset does not coincide with a marked token of this file is logged as an
    error.

    Args:
        fileText: full text of the file, including markers.
        fileName: path of the file; only the part after the last ``/`` is
            used, in log messages.

    Returns:
        None. Findings are reported through ``logging.error``.
    """
    matches = getRegexMatches(fileText)

    # Map: offset of a marked token in the marker-free text -> that token
    # (with a trailing 's removed). Offsets advance by token length plus the
    # single separating space.
    matchesDict = dict()
    for start, _, name in matches:
        currStartOffset = 0
        for partName in name.split(' '):
            currStartIndex = start + currStartOffset
            currStartOffset += len(partName) + 1
            if partName.endswith('\'s'):
                partName = partName[:-2]
            matchesDict[currStartIndex] = partName

    fileTextOrig = fileText.replace(markupStart,'').replace(markupEnd,'')
    shortName = fileName.split('/')[-1]

    for un in sorted(totalUniqueMarkups):
        # Empty and single-character tokens are not checked.
        if len(un) <= 1:
            continue
        try:
            unPattern = re.compile(r'\b' + un + r'\b')
        except (re.error, OverflowError):
            # Tokens that do not form a valid pattern are skipped silently
            # here; other exceptions are allowed to propagate.
            continue
        for found in unPattern.finditer(fileTextOrig):
            occ = found.start()
            # Only occurrences at offset 0 or directly preceded by a space
            # are checked.
            if occ > 0 and fileTextOrig[occ-1] != ' ':
                continue
            if occ not in matchesDict:
                logging.error(" checkAgainstAllMarkups {} : {} not marked around location {}".format(shortName,un,occ))
    

In [87]:
# Sub-folders of folderPath whose .txt files are validated.
folderNames = ['Abhinav','Bidyut']
folderPath = '../dataset_markup/'

# Accumulators updated by validateOccurences(); reset on every run of this cell.
totalMarkups = 0
totalUniqueMarkups = set()

for folderName in sorted(folderNames):
    folderName = join(folderPath,folderName)
    # Build the full paths up front, in sorted directory order.
    candidatePaths = [join(folderName,entry) for entry in sorted(os.listdir(folderName))]
    for fileName in candidatePaths:
        # Only regular files with a .txt extension are validated.
        if not fileName.endswith('.txt'):
            continue
        if not isfile(fileName):
            continue
        validateFile(fileName)

print("totalMarkups:",totalMarkups)
print("totalUniqueMarkups:",len(totalUniqueMarkups))

ERROR:root: ValidateSquareBrackets failed: 107.txt [[: 13 ]]: 14
ERROR:root: validateOccurences 112.txt : Glamorgan not marked around location 1227
ERROR:root: validateOccurences 116.txt : Williams invalid match around location 1464
ERROR:root: validateOccurences 120.txt : Dementieva not marked around location 558
ERROR:root: validateOccurences 120.txt : Henin-Hardenne not marked around location 485
ERROR:root: validateOccurences 156.txt : Collins invalid match around location 328
ERROR:root: validateOccurences 160.txt : Hal not marked around location 756
ERROR:root: validateOccurences 160.txt : Johnson not marked around location 1090
ERROR:root: validateOccurences 160.txt : Parker not marked around location 505
ERROR:root: validateOccurences 163.txt : Thomas invalid match around location 706
ERROR:root: validateOccurences 166.txt : Hurter not marked around location 738
ERROR:root: validateOccurences 170.txt : Bellamy not marked around location 274
ERROR:root: validateOccurences 179.tx

totalMarkups: 1700
totalUniqueMarkups: 903


In [88]:
# Second pass: check every file against the tokens collected from all files
# (totalUniqueMarkups), using the same folders and .txt filter as the
# validation pass.
for folderName in sorted(folderNames):
    folderName = join(folderPath,folderName)
    for fileName in sorted(os.listdir(folderName)):
        fileName = join(folderName,fileName)
        if fileName.endswith('.txt') and isfile(fileName):
            # Context manager so each handle is closed right after reading
            # rather than leaking one open file per iteration.
            with open(fileName) as handle:
                fileText = handle.read()
            checkAgainstAllMarkups(fileText,fileName)

ERROR:root: checkAgainstAllMarkups 001.txt : flanker not marked around location 66
ERROR:root: checkAgainstAllMarkups 001.txt : flanker not marked around location 545
ERROR:root: checkAgainstAllMarkups 001.txt : flanker not marked around location 890
ERROR:root: checkAgainstAllMarkups 003.txt : Glamorgan not marked around location 15
ERROR:root: checkAgainstAllMarkups 003.txt : Glamorgan not marked around location 129
ERROR:root: checkAgainstAllMarkups 003.txt : Glamorgan not marked around location 242
ERROR:root: checkAgainstAllMarkups 006.txt : Friday not marked around location 1132
ERROR:root: checkAgainstAllMarkups 015.txt : Friday not marked around location 414
ERROR:root: checkAgainstAllMarkups 019.txt : James not marked around location 190
ERROR:root: checkAgainstAllMarkups 020.txt : de not marked around location 580
ERROR:root: checkAgainstAllMarkups 024.txt : Jose not marked around location 29
ERROR:root: checkAgainstAllMarkups 024.txt : Jose not marked around location 377
ERR

ERROR:root: checkAgainstAllMarkups 075.txt : Adam not marked around location 973
ERROR:root: checkAgainstAllMarkups 075.txt : Andre not marked around location 515
ERROR:root: checkAgainstAllMarkups 075.txt : Boje not marked around location 323
ERROR:root: checkAgainstAllMarkups 075.txt : Boje not marked around location 951
ERROR:root: checkAgainstAllMarkups 075.txt : Boje not marked around location 1019
ERROR:root: checkAgainstAllMarkups 075.txt : Boucher not marked around location 1044
ERROR:root: checkAgainstAllMarkups 075.txt : Elizabeth not marked around location 63
ERROR:root: checkAgainstAllMarkups 075.txt : Gibbs not marked around location 1077
ERROR:root: checkAgainstAllMarkups 075.txt : Graeme not marked around location 365
ERROR:root: checkAgainstAllMarkups 075.txt : Jacques not marked around location 466
ERROR:root: checkAgainstAllMarkups 075.txt : Justin not marked around location 957
ERROR:root: checkAgainstAllMarkups 075.txt : Kallis not marked around location 474
ERROR:r

ERROR:root: checkAgainstAllMarkups 083.txt : Andrew not marked around location 1039
ERROR:root: checkAgainstAllMarkups 083.txt : Andy not marked around location 237
ERROR:root: checkAgainstAllMarkups 083.txt : Andy not marked around location 440
ERROR:root: checkAgainstAllMarkups 083.txt : Bell not marked around location 198
ERROR:root: checkAgainstAllMarkups 083.txt : Bell not marked around location 1545
ERROR:root: checkAgainstAllMarkups 083.txt : Corry not marked around location 1495
ERROR:root: checkAgainstAllMarkups 083.txt : Dawson not marked around location 1603
ERROR:root: checkAgainstAllMarkups 083.txt : Duncan not marked around location 191
ERROR:root: checkAgainstAllMarkups 083.txt : Ellis not marked around location 1328
ERROR:root: checkAgainstAllMarkups 083.txt : Friday not marked around location 1159
ERROR:root: checkAgainstAllMarkups 083.txt : Matt not marked around location 63
ERROR:root: checkAgainstAllMarkups 083.txt : Matt not marked around location 601
ERROR:root: c

ERROR:root: checkAgainstAllMarkups 093.txt : Katerina not marked around location 307
ERROR:root: checkAgainstAllMarkups 093.txt : Kenteris not marked around location 9
ERROR:root: checkAgainstAllMarkups 093.txt : Kenteris not marked around location 40
ERROR:root: checkAgainstAllMarkups 093.txt : Kenteris not marked around location 283
ERROR:root: checkAgainstAllMarkups 093.txt : Kenteris not marked around location 498
ERROR:root: checkAgainstAllMarkups 093.txt : Kenteris not marked around location 1027
ERROR:root: checkAgainstAllMarkups 093.txt : Kenteris not marked around location 1460
ERROR:root: checkAgainstAllMarkups 093.txt : Nick not marked around location 1112
ERROR:root: checkAgainstAllMarkups 093.txt : Thanou not marked around location 316
ERROR:root: checkAgainstAllMarkups 093.txt : Thanou not marked around location 511
ERROR:root: checkAgainstAllMarkups 093.txt : Thanou not marked around location 1518
ERROR:root: checkAgainstAllMarkups 094.txt : Abdul not marked around locat

ERROR:root: checkAgainstAllMarkups 121.txt : Davis not marked around location 832
ERROR:root: checkAgainstAllMarkups 122.txt : Beckham not marked around location 775
ERROR:root: checkAgainstAllMarkups 122.txt : David not marked around location 769
ERROR:root: checkAgainstAllMarkups 122.txt : Eriksson not marked around location 806
ERROR:root: checkAgainstAllMarkups 122.txt : Holland not marked around location 912
ERROR:root: checkAgainstAllMarkups 122.txt : Michael not marked around location 939
ERROR:root: checkAgainstAllMarkups 122.txt : Owen not marked around location 947
ERROR:root: checkAgainstAllMarkups 122.txt : Sven-Goran not marked around location 795
ERROR:root: checkAgainstAllMarkups 122.txt : Thomas not marked around location 522
ERROR:root: checkAgainstAllMarkups 122.txt : Zidane not marked around location 622
ERROR:root: checkAgainstAllMarkups 122.txt : Zidane not marked around location 703
ERROR:root: checkAgainstAllMarkups 123.txt : Butragueno not marked around location

ERROR:root: checkAgainstAllMarkups 127.txt : Henman not marked around location 1045
ERROR:root: checkAgainstAllMarkups 127.txt : Murray not marked around location 9
ERROR:root: checkAgainstAllMarkups 127.txt : Murray not marked around location 274
ERROR:root: checkAgainstAllMarkups 127.txt : Peter not marked around location 409
ERROR:root: checkAgainstAllMarkups 127.txt : Roddick not marked around location 735
ERROR:root: checkAgainstAllMarkups 127.txt : Roger not marked around location 715
ERROR:root: checkAgainstAllMarkups 128.txt : Brian not marked around location 578
ERROR:root: checkAgainstAllMarkups 128.txt : Eddie not marked around location 49
ERROR:root: checkAgainstAllMarkups 128.txt : Murphy not marked around location 625
ERROR:root: checkAgainstAllMarkups 128.txt : O'Driscoll not marked around location 584
ERROR:root: checkAgainstAllMarkups 128.txt : O'Sullivan not marked around location 0
ERROR:root: checkAgainstAllMarkups 128.txt : O'Sullivan not marked around location 55


ERROR:root: checkAgainstAllMarkups 133.txt : Jeremy not marked around location 1058
ERROR:root: checkAgainstAllMarkups 133.txt : Kaif not marked around location 1874
ERROR:root: checkAgainstAllMarkups 133.txt : Kapil not marked around location 17
ERROR:root: checkAgainstAllMarkups 133.txt : Kapil not marked around location 134
ERROR:root: checkAgainstAllMarkups 133.txt : Karthik not marked around location 1882
ERROR:root: checkAgainstAllMarkups 133.txt : Khaled not marked around location 1217
ERROR:root: checkAgainstAllMarkups 133.txt : Khaled not marked around location 1694
ERROR:root: checkAgainstAllMarkups 133.txt : Khan not marked around location 1160
ERROR:root: checkAgainstAllMarkups 133.txt : Khan not marked around location 1936
ERROR:root: checkAgainstAllMarkups 133.txt : Kumble not marked around location 0
ERROR:root: checkAgainstAllMarkups 133.txt : Kumble not marked around location 493
ERROR:root: checkAgainstAllMarkups 133.txt : Kumble not marked around location 1429
ERROR:

ERROR:root: checkAgainstAllMarkups 138.txt : Patrick not marked around location 903
ERROR:root: checkAgainstAllMarkups 138.txt : Pelous not marked around location 1365
ERROR:root: checkAgainstAllMarkups 138.txt : Pierre not marked around location 1495
ERROR:root: checkAgainstAllMarkups 138.txt : Stuart not marked around location 764
ERROR:root: checkAgainstAllMarkups 138.txt : Thomas not marked around location 853
ERROR:root: checkAgainstAllMarkups 138.txt : Villiers not marked around location 532
ERROR:root: checkAgainstAllMarkups 138.txt : William not marked around location 1325
ERROR:root: checkAgainstAllMarkups 138.txt : de not marked around location 529
ERROR:root: checkAgainstAllMarkups 138.txt : flanker not marked around location 54
ERROR:root: checkAgainstAllMarkups 139.txt : Giggs not marked around location 22
ERROR:root: checkAgainstAllMarkups 139.txt : Giggs not marked around location 39
ERROR:root: checkAgainstAllMarkups 139.txt : Giggs not marked around location 1080
ERROR

ERROR:root: checkAgainstAllMarkups 146.txt : Cicero not marked around location 1223
ERROR:root: checkAgainstAllMarkups 146.txt : Clive not marked around location 295
ERROR:root: checkAgainstAllMarkups 146.txt : Cusiter not marked around location 1155
ERROR:root: checkAgainstAllMarkups 146.txt : Dallaglio not marked around location 911
ERROR:root: checkAgainstAllMarkups 146.txt : Fabien not marked around location 189
ERROR:root: checkAgainstAllMarkups 146.txt : Gordon not marked around location 204
ERROR:root: checkAgainstAllMarkups 146.txt : Humphreys not marked around location 1109
ERROR:root: checkAgainstAllMarkups 146.txt : Lo not marked around location 1220
ERROR:root: checkAgainstAllMarkups 146.txt : Marco not marked around location 243
ERROR:root: checkAgainstAllMarkups 146.txt : O'Connell not marked around location 1086
ERROR:root: checkAgainstAllMarkups 146.txt : O'Driscoll not marked around location 50
ERROR:root: checkAgainstAllMarkups 146.txt : O'Driscoll not marked around l