In [1]:
import os
from os.path import isfile, join
import logging
import re


In [2]:
# Delimiters that wrap a marked-up span in the annotated text files.
markupStart = '[['
markupEnd = ']]'
# Matches one complete markup: '[[' + any run of non-bracket characters + ']]'.
# Written as a raw string so the backslashes reach the regex engine verbatim
# instead of being parsed as (invalid) string escape sequences.
rePattern = re.compile(r'\[\[[^\[\]]*\]\]')

In [3]:
def getRegexMatches(fileText):
    """Locate every markup matched by ``rePattern`` in ``fileText``.

    Returns a list with one ``[start, length, text]`` entry per match, in
    order of appearance:

    * ``start``  - start of the match shifted left by the width of the
      markers of all earlier matches, i.e. the position of the marked text
      once those markers are removed (assuming every marker in the text
      belongs to a match).
    * ``length`` - number of characters of the marked text.
    * ``text``   - the marked text without its surrounding markers.
    """
    # Characters that disappear per markup once its two markers are removed.
    markerWidth = len(markupStart) + len(markupEnd)
    matches = list()
    for i, m in enumerate(rePattern.finditer(fileText)):
        # Slice the markers off rather than using str.strip(): strip() treats
        # its argument as a set of characters, not as a prefix/suffix.
        matchedText = m.group()[len(markupStart):-len(markupEnd)]
        matches.append([m.start() - i * markerWidth, len(matchedText), matchedText])
    return matches

def getUniqueNames(matches):
    """Collect the distinct space-separated parts of all marked texts.

    ``matches`` is an iterable of 3-item entries whose last item is the
    marked text. Each text is split on single spaces and a trailing ``'s``
    is dropped from a part before it is added to the returned set.
    """
    possessive = '\'s'
    return {
        part[:-2] if part.endswith(possessive) else part
        for _, _, markedText in matches
        for part in markedText.split(' ')
    }

In [4]:
def validateSquareBrackets(fileText,fileName):
    """Check that opening and closing markers occur equally often.

    Returns True when ``fileText`` contains as many ``markupStart`` as
    ``markupEnd`` substrings. Otherwise logs an error naming the file (last
    '/'-separated component of ``fileName``) with both counts and returns
    False.
    """
    openCount = fileText.count(markupStart)
    closeCount = fileText.count(markupEnd)
    if openCount == closeCount:
        return True

    shortName = fileName.split('/')[-1]
    message = " ValidateSquareBrackets failed: {} {}: {} {}: {}".format(
        shortName, markupStart, openCount, markupEnd, closeCount)
    logging.error(message)
    return False

def validateOccurences(fileText,fileName):
    """Check that names marked in a file are marked at every occurrence.

    Updates the module-level ``totalMarkups`` (number of markups found) and
    ``totalUniqueMarkups`` (set of distinct name parts) as a side effect.

    For every unique name part longer than one character, each whole-word
    occurrence in the marker-free text that is at position 0 or directly
    preceded by a space is looked up among the marked positions. An error is
    logged when the position is not marked, or when the part marked there
    differs from the name being checked.

    Parameters:
        fileText: full text of the file, including the markup markers.
        fileName: path of the file; only its last '/'-separated component
            is used, in log messages.
    """
    global totalMarkups,totalUniqueMarkups
    matches = getRegexMatches(fileText)
    totalMarkups += len(matches)
    baseName = fileName.split('/')[-1]

    # Start index (in the marker-free text) of each space-separated part of
    # a markup -> that part with a trailing 's removed.
    matchesDict = dict()
    for start,_,name in matches:
        currStartOffset = 0
        for partName in name.split(' '):
            currStartIndex = start + currStartOffset
            # +1 accounts for the space that separated this part from the next.
            currStartOffset += len(partName)+1
            if partName.endswith('\'s'):
                partName = partName[:-2]
            matchesDict[currStartIndex] = partName

    uniqueNames = getUniqueNames(matches)
    totalUniqueMarkups |= uniqueNames
    fileTextOrig = fileText.replace(markupStart,'').replace(markupEnd,'')
    for un in sorted(uniqueNames):
        if len(un)<=1:
            continue
        # The name is interpolated into the pattern unescaped; a name that
        # does not compile as a regex is reported as a possible wrong markup.
        # NOTE(review): confirm whether re.escape(un) is wanted here instead.
        try:
            unPattern = re.compile(r'\b'+ un+ r'\b')
        except (re.error, OverflowError):
            logging.error(" validateOccurences {} : Possible wrong markup: {} ".format(baseName,un))
            continue
        for m in unPattern.finditer(fileTextOrig):
            occ = m.start()
            # Only occurrences at the very start or right after a space are checked.
            if occ > 0 and fileTextOrig[occ-1]!=' ':
                continue
            if occ not in matchesDict:
                logging.error(" validateOccurences {} : {} not marked around location {}".format(baseName,un,occ))
                continue
            if matchesDict[occ]!=un:
                logging.error(" validateOccurences {} : {} invalid match around location {}".format(baseName,un,occ))
            

def validateFile(fileName):
    """Run the markup validations on one file.

    Reads the file at ``fileName``, checks the marker counts with
    ``validateSquareBrackets`` and, only if that check passes, runs
    ``validateOccurences`` on the same text.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(fileName) as markupFile:
        fileText = markupFile.read()
    if not validateSquareBrackets(fileText,fileName):
        return
    validateOccurences(fileText,fileName)


def checkAgainstAllMarkups(fileText,fileName):
    """Report occurrences of any known name part that are unmarked in a file.

    Every entry of the module-level ``totalUniqueMarkups`` longer than one
    character is searched as a whole word in the marker-free text. Each
    occurrence at position 0 or directly preceded by a space that does not
    coincide with the start of a marked part in this file is logged as an
    error. Entries that do not compile as a regex are skipped silently.

    Parameters:
        fileText: full text of the file, including the markup markers.
        fileName: path of the file; only its last '/'-separated component
            is used, in log messages.
    """
    global totalUniqueMarkups
    matches = getRegexMatches(fileText)
    baseName = fileName.split('/')[-1]

    # Start index (in the marker-free text) of each space-separated part of
    # a markup -> that part with a trailing 's removed.
    matchesDict = dict()
    for start,_,name in matches:
        currStartOffset = 0
        for partName in name.split(' '):
            currStartIndex = start + currStartOffset
            # +1 accounts for the space that separated this part from the next.
            currStartOffset += len(partName)+1
            if partName.endswith('\'s'):
                partName = partName[:-2]
            matchesDict[currStartIndex] = partName

    fileTextOrig = fileText.replace(markupStart,'').replace(markupEnd,'')
    for un in sorted(totalUniqueMarkups):
        if len(un)<=1:
            continue
        # The name is interpolated into the pattern unescaped; names that do
        # not form a valid regex are skipped.
        try:
            unPattern = re.compile(r'\b'+ un+ r'\b')
        except (re.error, OverflowError):
            continue
        for m in unPattern.finditer(fileTextOrig):
            occ = m.start()
            # Only occurrences at the very start or right after a space are checked.
            if occ > 0 and fileTextOrig[occ-1]!=' ':
                continue
            if occ not in matchesDict:
                logging.error(" checkAgainstAllMarkups {} : {} not marked around location {}".format(baseName,un,occ))
    

In [5]:
# Folders (one per annotator directory) under the markup dataset root.
folderNames = ['Abhinav','Bidyut']
folderPath = '../dataset_markup/'

# Running totals; validateOccurences updates both through `global`.
totalMarkups = 0
totalUniqueMarkups = set()

for folderName in sorted(folderNames):
    folderName = join(folderPath, folderName)
    for fileName in sorted(os.listdir(folderName)):
        fileName = join(folderName, fileName)
        # Skip anything that is not a regular .txt file.
        if not (fileName.endswith('.txt') and isfile(fileName)):
            continue
        validateFile(fileName)

print("totalMarkups:", totalMarkups)
print("totalUniqueMarkups:", len(totalUniqueMarkups))

ERROR:root: validateOccurences 116.txt : Williams invalid match around location 1464
ERROR:root: validateOccurences 131.txt : Pires invalid match around location 1526
ERROR:root: validateOccurences 135.txt : Thomas invalid match around location 260
ERROR:root: validateOccurences 139.txt : Giggs invalid match around location 39
ERROR:root: validateOccurences 156.txt : Collins invalid match around location 328
ERROR:root: validateOccurences 160.txt : Parker not marked around location 505
ERROR:root: validateOccurences 163.txt : Thomas invalid match around location 706


totalMarkups: 2216
totalUniqueMarkups: 1111


In [6]:
# Second pass: check every file against the names collected from all files
# (totalUniqueMarkups), so names marked in one file but left unmarked in
# another are reported.
for folderName in sorted(folderNames):
    folderName = join(folderPath,folderName)
    for fileName in sorted(os.listdir(folderName)):
        fileName = join(folderName,fileName)
        if fileName.endswith('.txt') and isfile(fileName):
            # Context manager closes the handle instead of leaking it.
            with open(fileName) as markupFile:
                fileText = markupFile.read()
            checkAgainstAllMarkups(fileText,fileName)

ERROR:root: checkAgainstAllMarkups 006.txt : Friday not marked around location 1132
ERROR:root: checkAgainstAllMarkups 015.txt : Friday not marked around location 414
ERROR:root: checkAgainstAllMarkups 019.txt : James not marked around location 190
ERROR:root: checkAgainstAllMarkups 020.txt : de not marked around location 580
ERROR:root: checkAgainstAllMarkups 024.txt : Jose not marked around location 29
ERROR:root: checkAgainstAllMarkups 024.txt : Jose not marked around location 377
ERROR:root: checkAgainstAllMarkups 030.txt : Davis not marked around location 1204
ERROR:root: checkAgainstAllMarkups 034.txt : Holland not marked around location 967
ERROR:root: checkAgainstAllMarkups 034.txt : de not marked around location 1130
ERROR:root: checkAgainstAllMarkups 040.txt : James not marked around location 1224
ERROR:root: checkAgainstAllMarkups 040.txt : Welshman not marked around location 117
ERROR:root: checkAgainstAllMarkups 043.txt : Milan not marked around location 1062
ERROR:root: c

ERROR:root: checkAgainstAllMarkups 075.txt : Graeme not marked around location 365
ERROR:root: checkAgainstAllMarkups 075.txt : Jacques not marked around location 466
ERROR:root: checkAgainstAllMarkups 075.txt : Justin not marked around location 957
ERROR:root: checkAgainstAllMarkups 075.txt : Kallis not marked around location 474
ERROR:root: checkAgainstAllMarkups 075.txt : Kemp not marked around location 964
ERROR:root: checkAgainstAllMarkups 075.txt : Kemp not marked around location 1086
ERROR:root: checkAgainstAllMarkups 075.txt : Langeveldt not marked around location 1094
ERROR:root: checkAgainstAllMarkups 075.txt : Makhaya not marked around location 497
ERROR:root: checkAgainstAllMarkups 075.txt : Ntini not marked around location 505
ERROR:root: checkAgainstAllMarkups 075.txt : Pollock not marked around location 488
ERROR:root: checkAgainstAllMarkups 075.txt : Port not marked around location 58
ERROR:root: checkAgainstAllMarkups 075.txt : Rogers not marked around location 1172
ER

ERROR:root: checkAgainstAllMarkups 083.txt : Corry not marked around location 1495
ERROR:root: checkAgainstAllMarkups 083.txt : Dawson not marked around location 1603
ERROR:root: checkAgainstAllMarkups 083.txt : Duncan not marked around location 191
ERROR:root: checkAgainstAllMarkups 083.txt : Ellis not marked around location 1328
ERROR:root: checkAgainstAllMarkups 083.txt : Friday not marked around location 1159
ERROR:root: checkAgainstAllMarkups 083.txt : Matt not marked around location 63
ERROR:root: checkAgainstAllMarkups 083.txt : Matt not marked around location 601
ERROR:root: checkAgainstAllMarkups 083.txt : Noon not marked around location 1247
ERROR:root: checkAgainstAllMarkups 083.txt : Phil not marked around location 509
ERROR:root: checkAgainstAllMarkups 083.txt : Robinson not marked around location 242
ERROR:root: checkAgainstAllMarkups 083.txt : Robinson not marked around location 675
ERROR:root: checkAgainstAllMarkups 083.txt : Robinson not marked around location 821
ERRO

ERROR:root: checkAgainstAllMarkups 091.txt : Robson not marked around location 608
ERROR:root: checkAgainstAllMarkups 091.txt : Robson not marked around location 1212
ERROR:root: checkAgainstAllMarkups 093.txt : Davies not marked around location 1117
ERROR:root: checkAgainstAllMarkups 093.txt : Ioannidis not marked around location 142
ERROR:root: checkAgainstAllMarkups 093.txt : Ioannidis not marked around location 718
ERROR:root: checkAgainstAllMarkups 093.txt : Ioannidis not marked around location 804
ERROR:root: checkAgainstAllMarkups 093.txt : Katerina not marked around location 307
ERROR:root: checkAgainstAllMarkups 093.txt : Kenteris not marked around location 9
ERROR:root: checkAgainstAllMarkups 093.txt : Kenteris not marked around location 40
ERROR:root: checkAgainstAllMarkups 093.txt : Kenteris not marked around location 283
ERROR:root: checkAgainstAllMarkups 093.txt : Kenteris not marked around location 498
ERROR:root: checkAgainstAllMarkups 093.txt : Kenteris not marked arou

ERROR:root: checkAgainstAllMarkups 098.txt : Serge not marked around location 1507
ERROR:root: checkAgainstAllMarkups 098.txt : Yachvili not marked around location 894
ERROR:root: checkAgainstAllMarkups 098.txt : Yannick not marked around location 1646
ERROR:root: checkAgainstAllMarkups 098.txt : de not marked around location 714
ERROR:root: checkAgainstAllMarkups 099.txt : Bashar not marked around location 0
ERROR:root: checkAgainstAllMarkups 099.txt : Bashar not marked around location 62
ERROR:root: checkAgainstAllMarkups 099.txt : Habibul not marked around location 54
ERROR:root: checkAgainstAllMarkups 100.txt : Molik not marked around location 889
ERROR:root: checkAgainstAllMarkups 100.txt : Serena not marked around location 228
ERROR:root: checkAgainstAllMarkups 100.txt : Serena not marked around location 491
ERROR:root: checkAgainstAllMarkups 100.txt : Serena not marked around location 679
ERROR:root: checkAgainstAllMarkups 100.txt : Serena not marked around location 996
ERROR:ro