In [None]:
import os
from os.path import isfile, join
import logging
import re
import commonUtils
import constants
import numpy as np

In [10]:
def getCleanedWordListFromFile(fileName):
    """Read a text file and return its cleaned words in file order.

    Each line has every pattern in ``constants.nullReplaceList`` deleted and
    every pattern in ``constants.spaceReplaceList`` replaced by one space.
    The line is then split on single spaces. Each piece has leading/trailing
    apostrophes stripped and tripled brackets collapsed ("[[[" -> "[[",
    "]]]" -> "]]"). Pieces that end up empty are dropped.

    Args:
        fileName: path of the text file to read.

    Returns:
        list of str: the non-empty cleaned words, in the order they occur.
    """
    # Read through a context manager so the handle is closed deterministically,
    # even if read() raises; a bare open(...).read() leaves that to the GC.
    with open(fileName) as inputFile:
        fileText = inputFile.read()

    wordlist = []
    for line in fileText.splitlines():
        for pattern in constants.nullReplaceList:
            line = line.replace(pattern, '')
        for pattern in constants.spaceReplaceList:
            line = line.replace(pattern, ' ')
        # Split on a literal single space (not arbitrary whitespace), so runs
        # of spaces produce empty pieces that are filtered out below.
        for word in line.split(' '):
            word = word.strip('\'')
            word = word.replace("[[[", "[[")
            word = word.replace("]]]", "]]")
            if word:
                wordlist.append(word)
    return wordlist

def getStringCombinationsFromWordList(wordlist):
    """Build every candidate phrase of one to three consecutive words.

    For each start index the phrase is grown one word at a time (up to three
    words). Growth stops as soon as the next word matches an entry of
    ``constants.wordsToIgnoreList`` (case-insensitively) or contains a digit;
    phrases collected before that point are kept.

    Args:
        wordlist: list of str, e.g. the output of getCleanedWordListFromFile.

    Returns:
        list of [phrase, startIndex, endIndex] entries, where the indices are
        inclusive positions in ``wordlist``.
    """
    maxWordsPerCombination = 3
    # Lower-case the ignore list once instead of once per inspected word;
    # a set also makes each membership test constant-time.
    ignoredWords = {ig.lower() for ig in constants.wordsToIgnoreList}

    allPossibleStringCombinations = []
    wordCount = len(wordlist)
    for start in range(wordCount):
        currString = ""
        lastExclusive = min(start + maxWordsPerCombination, wordCount)
        for end in range(start, lastExclusive):
            word = wordlist[end]
            if word.lower() in ignoredWords:
                break
            if any(char.isdigit() for char in word):
                break
            if end > start:
                currString += ' '
            currString += word
            candidate = currString.strip(' ')
            # An empty word can leave the phrase blank; skip such candidates.
            if len(candidate) >= 1:
                allPossibleStringCombinations.append([candidate, start, end])
    return allPossibleStringCombinations


def getPositivesAndNegatives(allPossibeStringCombinations):
    """Split candidate phrases into marked-up (positive) and other (negative).

    A candidate is positive when its text starts with "[[", ends with "]]"
    and has no further "[" or "]" between those delimiters. Everything else
    is negative.

    Args:
        allPossibeStringCombinations: iterable of [text, start, end] entries.

    Returns:
        (positive, negative): two lists of [text, start, end] entries, each
        preserving the input order.
    """
    positive, negative = [], []
    for text, start, end in allPossibeStringCombinations:
        isWrapped = text.startswith('[[') and text.endswith(']]')
        inner = text[2:-2]
        hasInnerBracket = ("[" in inner) or ("]" in inner)
        # Only a single, cleanly wrapped markup span counts as a positive.
        target = positive if (isWrapped and not hasInnerBracket) else negative
        target.append([text, start, end])
    return positive, negative

def getFeature1FirstWordCapital(token):
    """Return 1 when every word of the token starts with an upper-case letter.

    "[[" and "]]" markup is removed from each word before the check.

    Args:
        token: [phrase, start, end]; only the phrase (token[0]) is read.

    Returns:
        int: 1 if the first character of every word is upper-case, else 0.
        A phrase containing no words yields 1 (nothing contradicts it).
        A word that is empty once the markup is removed counts as not
        capitalised.
    """
    feature = 1
    for word in token[0].split():
        word = word.replace("[[", '').replace("]]", '')
        # word[:1] instead of word[0]: a word made only of markup (e.g. "[[")
        # is empty here, and indexing it would raise IndexError.
        feature &= word[:1].isupper()
    return feature

def getFeature2PreSuffixWordCapital(token, wordList):
    """Return 1 when the word just before or just after the token is capitalised.

    "[[" and "]]" markup is removed from the neighbouring word before its
    first character is checked. A neighbour that is missing (token at the
    start/end of ``wordList``) or empty after markup removal contributes
    nothing.

    Caveat noted by the original author: the signal is flaky, e.g. in
    "Tom Cruise does" it is a false positive for 'does', but it may still
    help the model learn something.

    Args:
        token: [phrase, start, end] with inclusive indices into ``wordList``.
        wordList: list of str the token was built from.

    Returns:
        int: 1 if either neighbour starts with an upper-case letter, else 0.
    """
    neighbourIndices = []
    if token[1] > 0:
        neighbourIndices.append(token[1] - 1)
    if token[2] < (len(wordList) - 1):
        neighbourIndices.append(token[2] + 1)

    feature = 0
    for idx in neighbourIndices:
        cmpWord = wordList[idx].replace("[[", '').replace("]]", '')
        # cmpWord[:1] instead of cmpWord[0]: a neighbour made only of markup
        # is empty here, and indexing it would raise IndexError.
        feature |= cmpWord[:1].isupper()
    return feature

def getFeature3TokenLength(token):
    """Return the number of whitespace-separated words in the token's phrase.

    Args:
        token: [phrase, start, end]; only the phrase (token[0]) is read.

    Returns:
        int: word count of the phrase (0 for an empty or blank phrase).
    """
    phrase = token[0]
    return sum(1 for _ in phrase.split())

def getFeature4ProbPreSuff(token, wordList):
    """Return 1 when a word adjacent to the token is a known prefix/suffix word.

    The word immediately before and the word immediately after the token
    (when they exist) have "[[" / "]]" markup removed and are compared,
    case-insensitively, with ``constants.positivePrefixSuffixList``.

    Args:
        token: [phrase, start, end] with inclusive indices into ``wordList``.
        wordList: list of str the token was built from.

    Returns:
        int: 1 if either neighbour is in the list, else 0.
    """
    firstIdx, lastIdx = token[1], token[2]

    neighbourPositions = []
    if firstIdx > 0:
        neighbourPositions.append(firstIdx - 1)
    if lastIdx < (len(wordList) - 1):
        neighbourPositions.append(lastIdx + 1)

    feature = 0
    for position in neighbourPositions:
        neighbour = wordList[position].replace("[[", '').replace("]]", '')
        knownWords = [entry.lower() for entry in constants.positivePrefixSuffixList]
        if neighbour.lower() in knownWords:
            feature |= 1
    return feature
    
# TODO: check whether this hash should be normalized in some way
def getFeature5TokenHash(token):
    """Hash the token's phrase with a 4-bit shift-and-fold string hash.

    For every character the accumulator is shifted left by four bits and the
    character's code point is added. The top nibble of the 32-bit value is
    then folded back in (XORed, shifted right by 24) and cleared.

    Args:
        token: [phrase, start, end]; only the phrase (token[0]) is read.

    Returns:
        int: a non-negative hash below 2**28 (0 for an empty phrase).
    """
    # Scheme taken from:
    # http://cseweb.ucsd.edu/~kube/cls/100/Lectures/lec16/lec16-16.html
    # NOTE(review): presumably the classic 32-bit PJW/ELF hash - confirm.
    hashVal = 0
    for char in token[0]:
        # Python ints are unbounded, so truncate to 32 bits explicitly.
        # Without this, a carry past bit 31 is never cleared by the nibble
        # mask below and the value keeps growing with every character.
        hashVal = ((hashVal << 4) + ord(char)) & 0xFFFFFFFF
        g = hashVal & 0xF0000000
        if g != 0:
            hashVal ^= g >> 24
        hashVal &= ~g
    return hashVal

def getFeature6OneHotVector(token):
    """Count the ASCII letters of the token's phrase into a 1x52 vector.

    Columns 0-25 hold the counts of 'A'..'Z' and columns 26-51 the counts of
    'a'..'z' (upper and lower case are kept separate). Any other character
    is ignored. Despite the function name, the entries are occurrence counts,
    not 0/1 flags.

    Args:
        token: [phrase, start, end]; only the phrase (token[0]) is read.

    Returns:
        numpy.ndarray of shape (1, 52) as created by ``np.zeros``.
    """
    OHvector = np.zeros((1, 52))
    for char in token[0]:
        # Derive the column straight from the character code, so the layout
        # never depends on dict iteration order.
        if 'A' <= char <= 'Z':
            OHvector[0, ord(char) - ord('A')] += 1
        elif 'a' <= char <= 'z':
            OHvector[0, 26 + ord(char) - ord('a')] += 1
    return OHvector

In [8]:
# Count positive and negative candidate phrases over every file yielded by
# commonUtils.getAllFiles for the folders below, then print both totals.
# NOTE(review): getAllFiles is defined outside this notebook; presumably it
# returns file paths under folderPath/<folderName> - confirm.
folderNames = ['Abhinav','Bidyut','Chirayu']
folderPath = '../dataset_markup/'
# NOTE(review): totalMarkups and totalUniqueMarkups are initialised here but
# never read or updated in this cell.
totalMarkups = 0
totalUniqueMarkups = set()
p_total =0
n_total = 0
for fileName in commonUtils.getAllFiles(folderNames,folderPath):
    # file -> cleaned words -> 1..3 word candidates -> positive/negative split
    wordList = getCleanedWordListFromFile(fileName)
    l = getStringCombinationsFromWordList(wordList)
    p,n = getPositivesAndNegatives(l)
    p_total+=len(p)
    n_total+=len(n)
    
# Prints: <number of positives> <number of negatives>
print(p_total,n_total)


3950 81967
