In [1]:
import os
from os.path import isfile, join
import logging
import re
import commonUtils
import constants
import numpy as np
import random

In [2]:
constants.positivePrefixSuffixList

{'boss',
 'captain',
 'champion',
 'executive',
 'manager',
 'refree',
 'said',
 'skipper'}

In [3]:
def getCleanedWordListFromFile(fileName):
    """Read a marked-up text file and return its cleaned words in order.

    For every line, each pattern in ``constants.nullReplaceList`` is removed
    and each pattern in ``constants.spaceReplaceList`` is replaced by a single
    space. The line is then split on single spaces; every piece has leading
    and trailing apostrophes stripped and triple brackets collapsed to double
    brackets. Pieces that end up empty are dropped.

    Args:
        fileName: path of the text file to read (text mode, default encoding).

    Returns:
        list of str: the cleaned, non-empty words in file order.
    """
    # The context manager closes the handle even when reading raises; a bare
    # open(...).read() leaves that to the garbage collector.
    with open(fileName) as fileHandle:
        fileText = fileHandle.read()

    wordlist = []
    for line in fileText.splitlines():
        for pattern in constants.nullReplaceList:
            line = line.replace(pattern, '')
        for pattern in constants.spaceReplaceList:
            line = line.replace(pattern, ' ')
        for word in line.split(' '):
            word = word.strip('\'')
            # Collapse "[[[" / "]]]" so markup delimiters are always doubled.
            word = word.replace("[[[", "[[")
            word = word.replace("]]]", "]]")
            if len(word) >= 1:
                wordlist.append(word)
    return wordlist

def getStringCombinationsFromWordList(wordlist):
    """Enumerate every candidate phrase of one to three consecutive words.

    Starting from each position, words are appended one at a time (joined by
    a single space). Growth from a given start stops as soon as a word is in
    ``constants.wordsToIgnoreList`` (case-insensitive) or contains a digit.

    Args:
        wordlist: list of str, the words of one document in order.

    Returns:
        list of ``[phrase, startIndex, endIndex]`` entries, where the indexes
        are inclusive positions into ``wordlist``.
    """
    # Lower-case the ignore list once instead of once per candidate word;
    # set membership gives the same answer as the former list scan.
    ignoredWords = {ig.lower() for ig in constants.wordsToIgnoreList}
    maxWordsPerPhrase = 3

    allPossibeStringCombinations = []
    for start in range(len(wordlist)):
        currString = ""
        # Slicing clips at the end of the list, so no explicit bounds check.
        window = wordlist[start:start + maxWordsPerPhrase]
        for offset, word in enumerate(window):
            if word.lower() in ignoredWords:
                break
            if any(char.isdigit() for char in word):
                break
            if offset > 0:
                currString += ' '
            currString += word
            candidate = currString.strip(' ')
            if len(candidate) >= 1:
                allPossibeStringCombinations.append([candidate, start, start + offset])
    return allPossibeStringCombinations


def getPositivesAndNegatives(allPossibeStringCombinations):
    """Split candidate phrases into marked-up positives and the rest.

    A candidate is positive when it starts with ``[[``, ends with ``]]`` and
    has no further ``[`` or ``]`` between those delimiters. Everything else
    is negative.

    Args:
        allPossibeStringCombinations: iterable of ``(phrase, start, end)``.

    Returns:
        tuple ``(positive, negative)``; each is a list of
        ``[phrase, start, end]`` in input order.
    """
    def _isSingleMarkup(text):
        # Wrapped exactly once: delimiters at both ends, none inside.
        if not (text.startswith('[[') and text.endswith(']]')):
            return False
        inner = text[2:-2]
        return "[" not in inner and "]" not in inner

    positive, negative = [], []
    for text, start, end in allPossibeStringCombinations:
        bucket = positive if _isSingleMarkup(text) else negative
        bucket.append([text, start, end])
    return positive, negative

def getFeature1FirstWordCapital(token):
    """Return 1 when every word of the token starts with an upper-case letter.

    Args:
        token: ``[phrase, start, end]``; only ``phrase`` is read. ``[[`` and
            ``]]`` markup is removed from each word before the check.

    Returns:
        int: 1 if all words are capitalised, otherwise 0. A phrase with no
        words yields 1, and a word that is empty once the markup is removed
        counts as not capitalised.
    """
    feature = 1
    for word in token[0].split():
        word = word.replace("[[", '').replace("]]", '')
        # word[:1] is '' for a bracket-only word, so isupper() is simply
        # False there instead of word[0] raising IndexError.
        feature = feature & word[:1].isupper()
    return feature

def getFeature2PrefixWordCapital(token, wordList):
    """Return 1 when the word just before OR just after the token is capitalised.

    Flaky by design: for "Tom Cruise does" the candidate 'does' gets a
    positive signal from its neighbour; kept because it may still help the
    model learn something.

    Args:
        token: ``[phrase, start, end]`` with inclusive indexes into wordList.
        wordList: list of str, the words of the document.

    Returns:
        int: 1 if either adjacent word starts with an upper-case letter after
        ``[[``/``]]`` markup is removed, otherwise 0. Missing neighbours and
        neighbours that are empty after markup removal contribute nothing.
    """
    feature = 0
    if token[1] > 0:
        cmpWord = wordList[token[1] - 1].replace("[[", '').replace("]]", '')
        # [:1] keeps a bracket-only neighbour from raising IndexError.
        feature |= cmpWord[:1].isupper()

    if token[2] < (len(wordList) - 1):
        cmpWord = wordList[token[2] + 1].replace("[[", '').replace("]]", '')
        feature |= cmpWord[:1].isupper()
    return feature

def getFeature7SuffixWordCapital(token, wordList):
    """Return 1 when the word immediately after the token is capitalised.

    Args:
        token: ``[phrase, start, end]`` with inclusive indexes into wordList.
        wordList: list of str, the words of the document.

    Returns:
        int: 1 if the following word starts with an upper-case letter after
        ``[[``/``]]`` markup is removed; 0 if it does not, if the token is the
        last word, or if that word is empty once the markup is removed.
    """
    feature = 0

    if token[2] < (len(wordList) - 1):
        cmpWord = wordList[token[2] + 1].replace("[[", '').replace("]]", '')
        # [:1] keeps a bracket-only neighbour from raising IndexError.
        feature |= cmpWord[:1].isupper()
    return feature

def getFeature8Prefix2WordCapital(token, wordList):
    """Return 1 when the word two positions before the token is capitalised.

    Args:
        token: ``[phrase, start, end]`` with inclusive indexes into wordList.
        wordList: list of str, the words of the document.

    Returns:
        int: 1 if ``wordList[start - 2]`` starts with an upper-case letter
        after ``[[``/``]]`` markup is removed; 0 if it does not, if fewer than
        two words precede the token, or if that word is empty once the markup
        is removed.
    """
    feature = 0
    if token[1] > 1:
        cmpWord = wordList[token[1] - 2].replace("[[", '').replace("]]", '')
        # [:1] keeps a bracket-only word from raising IndexError.
        feature |= cmpWord[:1].isupper()

    return feature

def getFeature9Suffix2WordCapital(token, wordList):
    """Return 1 when the word two positions after the token is capitalised.

    Args:
        token: ``[phrase, start, end]`` with inclusive indexes into wordList.
        wordList: list of str, the words of the document.

    Returns:
        int: 1 if ``wordList[end + 2]`` starts with an upper-case letter after
        ``[[``/``]]`` markup is removed; 0 if it does not, if fewer than two
        words follow the token, or if that word is empty once the markup is
        removed.
    """
    feature = 0

    if token[2] < (len(wordList) - 2):
        cmpWord = wordList[token[2] + 2].replace("[[", '').replace("]]", '')
        # [:1] keeps a bracket-only word from raising IndexError.
        feature |= cmpWord[:1].isupper()
    return feature

def getFeature3TokenLength(token):
    """Return how many whitespace-separated words the token's phrase has.

    Args:
        token: ``[phrase, start, end]``; only ``phrase`` is read.

    Returns:
        int: the word count of ``phrase``.
    """
    phrase = token[0]
    pieces = phrase.split()
    return len(pieces)

def getFeature4ProbPreSuffix(token, wordList):
    """Return 1 when a word adjacent to the token is a known cue word.

    The words immediately before and after the token have ``[[``/``]]``
    markup removed and are compared, case-insensitively, against
    ``constants.positivePrefixSuffixList``.

    Args:
        token: ``[phrase, start, end]`` with inclusive indexes into wordList.
        wordList: list of str, the words of the document.

    Returns:
        int: 1 if either neighbour is in the cue list, otherwise 0.
    """
    # Collect whichever neighbours exist, previous one first.
    neighbourIndexes = []
    if token[1] > 0:
        neighbourIndexes.append(token[1] - 1)
    if token[2] < (len(wordList) - 1):
        neighbourIndexes.append(token[2] + 1)

    feature = 0
    for index in neighbourIndexes:
        neighbour = wordList[index].replace("[[", '').replace("]]", '')
        cueWords = [cue.lower() for cue in constants.positivePrefixSuffixList]
        if neighbour.lower() in cueWords:
            feature |= 1
    return feature

def getFeature10PrefixWordLength(token, wordList):
    """Return the length of the word immediately before the token.

    Args:
        token: ``[phrase, start, end]`` with inclusive indexes into wordList.
        wordList: list of str, the words of the document.

    Returns:
        int: character count of the preceding word after ``[[``/``]]`` markup
        is removed, or 0 when the token starts the document.
    """
    start = token[1]
    if not start > 0:
        return 0

    previous = wordList[start - 1]
    for marker in ("[[", "]]"):
        previous = previous.replace(marker, '')
    return len(previous)

def getFeature11SuffixWordLength(token, wordList):
    """Return the length of the word immediately after the token.

    Args:
        token: ``[phrase, start, end]`` with inclusive indexes into wordList.
        wordList: list of str, the words of the document.

    Returns:
        int: character count of the following word after ``[[``/``]]`` markup
        is removed, or 0 when the token ends the document.
    """
    end = token[2]
    lastIndex = len(wordList) - 1
    if not end < lastIndex:
        return 0

    following = wordList[end + 1]
    for marker in ("[[", "]]"):
        following = following.replace(marker, '')
    return len(following)

#check if want to normalize it in some way
def getFeature5TokenHash(token):
    """Hash the token's phrase with the PJW/ELF string hash.

    Algorithm reference:
    http://cseweb.ucsd.edu/~kube/cls/100/Lectures/lec16/lec16-16.html

    Args:
        token: ``[phrase, start, end]``; only ``phrase`` is read, with
            ``[[``/``]]`` markup removed before hashing.

    Returns:
        int: the hash value; always below 2**28 because the top nibble of the
        32-bit state is folded back in and cleared after every character.
    """
    hashVal = 0
    word = token[0].replace("[[", '')
    word = word.replace("]]", '')
    for char in word:
        # Python ints never wrap, so emulate the 32-bit unsigned arithmetic
        # the algorithm assumes; without the mask a carry out of bit 31
        # escapes the 0xF0000000 fold below and the value grows unbounded.
        hashVal = ((hashVal << 4) + ord(char)) & 0xFFFFFFFF
        g = hashVal & 0xF0000000
        if g != 0:
            hashVal = hashVal ^ (g >> 24)
        hashVal = hashVal & ~g
    return hashVal

def getFeature6OneHotVector(token):
    """Count the ASCII letters of the token's phrase into a 52-slot vector.

    Despite the name this is a per-letter count vector, not a one-hot
    encoding. Columns 0-25 hold the counts of 'A'..'Z' and columns 26-51 the
    counts of 'a'..'z'; every other character (markup brackets, spaces,
    digits, non-ASCII letters) is ignored.

    Args:
        token: ``[phrase, start, end]``; only ``phrase`` is read.

    Returns:
        numpy.ndarray of shape (1, 52) with float counts.
    """
    OHvector = np.zeros((1, 52))
    for char in token[0]:
        # Derive the column from the code point rather than from dict
        # iteration order, so the layout never depends on how the
        # interpreter orders dict keys.
        if 'A' <= char <= 'Z':
            OHvector[0, ord(char) - ord('A')] += 1
        elif 'a' <= char <= 'z':
            OHvector[0, 26 + ord(char) - ord('a')] += 1
    return OHvector

In [8]:
# pre-split data check
# Counts positive and negative candidate phrases over every file returned by
# commonUtils.getAllFiles for the annotator folders below, before any
# train/test split is made.
folderNames = ['Abhinav','Bidyut','Chirayu']
folderPath = '../dataset_markup/'
# NOTE(review): totalMarkups and totalUniqueMarkups are initialised but not
# used anywhere in this cell.
totalMarkups = 0
totalUniqueMarkups = set()
p_total =0
n_total = 0
for fileName in commonUtils.getAllFiles(folderNames,folderPath):
    # Same pipeline as the dataset build: clean words -> candidate phrases ->
    # positive/negative split.
    wordList = getCleanedWordListFromFile(fileName)
    l = getStringCombinationsFromWordList(wordList)
    p,n = getPositivesAndNegatives(l)
    p_total+=len(p)
    n_total+=len(n)
    
# Prints "<positives> <negatives>" summed over all files.
print(p_total,n_total)

3950 81967


In [20]:
# #randomly separate 200 train and 100 test files
# folderNames = ['documentPool']
# folderPath = '../stage1/'
# fileList = commonUtils.getAllFiles(folderNames,folderPath)

# import random
# import shutil
# #twice random only to introduce more randomness
# corpusIdx = [i for i in range(0,len(fileList))]
# testPool = random.sample(corpusIdx, 100)
# testPool = random.sample(corpusIdx, 100)
# trainPool = set(corpusIdx).symmetric_difference(set(testPool))

# for idx in trainPool:
#     shutil.copy(fileList[idx], '../stage1/train')
    
# for idx in testPool:
#     shutil.copy(fileList[idx], '../stage1/test')

In [5]:
#create train, test db
# Width of one sample row (63). getFeatureVector fills columns 0-9 with scalar
# features and columns 10-61 with the 52 per-letter counts; the build loop
# writes the class label (1 positive / 0 negative) into the last column.
# NOTE(review): how the individual terms 58 + 1 + 2 + 2 map onto those groups
# is not recorded here - confirm before changing the feature set.
arrColumns = 58 + 1 + 2 + 2

def getFeatureVector(token, wordList):
    """Assemble the feature row for one candidate token.

    Columns 0-9 receive the scalar features in the order listed below,
    columns 10-61 receive the 52 letter counts, and the remaining column is
    left at zero for the caller to fill in.

    Args:
        token: ``[phrase, start, end]`` with inclusive indexes into wordList.
        wordList: list of str, the words of the document.

    Returns:
        numpy.ndarray of shape (1, arrColumns).
    """
    row = np.zeros((1, arrColumns))

    # Position in this tuple == column index in the row.
    scalarFeatures = (
        getFeature1FirstWordCapital(token),
        getFeature2PrefixWordCapital(token, wordList),
        getFeature7SuffixWordCapital(token, wordList),
        getFeature8Prefix2WordCapital(token, wordList),
        getFeature9Suffix2WordCapital(token, wordList),
        getFeature3TokenLength(token),
        getFeature10PrefixWordLength(token, wordList),
        getFeature11SuffixWordLength(token, wordList),
        getFeature4ProbPreSuffix(token, wordList),
        getFeature5TokenHash(token),
    )
    for column, value in enumerate(scalarFeatures):
        row[0, column] = value

    row[0, 10:62] = getFeature6OneHotVector(token)
    return row

def getMetaData(token, wordList, fileName, l):
    """Build the human-readable metadata record for one sample.

    Args:
        token: ``[phrase, start, end]`` with inclusive indexes into wordList.
        wordList: list of str, the words of the document.
        fileName: path of the file the token came from.
        l: value stored as the last field (callers pass the class label).

    Returns:
        list ``[token, previousWord, nextWord, fileName, l]``, where a
        missing neighbour is represented by the empty string.
    """
    lastIndex = len(wordList) - 1
    previousWord = wordList[token[1] - 1] if token[1] > 0 else ''
    nextWord = wordList[token[2] + 1] if token[2] < lastIndex else ''
    return [token, previousWord, nextWord, fileName, l]
            
# For each split folder: turn every candidate phrase of every file into a
# labelled feature row, shuffle rows and metadata together, then save the
# matrix as <folder>.npy and the metadata as <folder>MetaData.txt.
folderNames = ['train', 'test']
folderPath = '../stage1/'
for folderName in folderNames:
    fileList = commonUtils.getAllFiles([folderName], folderPath)
    p_total = 0
    n_total = 0

    # token, prev, after, fileName, label
    metaData = []
    # Grown in chunks of 1000 rows and trimmed to the real size afterwards.
    trainArray = np.zeros((1000, arrColumns))
    arrayIndex = 0

    for fileName in fileList:
        wordList = getCleanedWordListFromFile(fileName)
        l = getStringCombinationsFromWordList(wordList)
        p, n = getPositivesAndNegatives(l)

        # Positives (label 1) are written before negatives (label 0), as
        # before. Every negative is kept; random down-sampling of negatives
        # is currently disabled.
        for label, samples in ((1, p), (0, n)):
            for data in samples:
                featureVector = getFeatureVector(data, wordList)
                featureVector[0, -1] = label
                if arrayIndex > (trainArray.shape[0] - 1):
                    trainArray.resize((trainArray.shape[0] + 1000, arrColumns))

                trainArray[arrayIndex, :] = featureVector
                arrayIndex += 1
                metaData.append(getMetaData(data, wordList, fileName, label))

        p_total += len(p)
        n_total += len(n)

    # Drop the unused tail of the last chunk, then apply one shared
    # permutation so row i of the matrix still matches metadata entry i.
    trainArray = trainArray[0:p_total+n_total, :]
    sfl = np.arange(trainArray.shape[0])
    np.random.shuffle(sfl)
    trainArray = trainArray[sfl]
    metaSfl = [metaData[idx] for idx in sfl]

    print(p_total,n_total)
    print (trainArray.shape)

    npArrFile = folderName + '.npy'
    np.save(os.path.join(folderPath, npArrFile), trainArray)

    #write meta data to text file
    metaFile = folderName + 'MetaData.txt'
    # The context manager closes the file even if a write raises.
    with open(os.path.join(folderPath, metaFile), 'w') as fp:
        for line in metaSfl:
            fp.write(str(line))
            fp.write('\n')

2719 28297
(31016, 63)
1217 13574
(14791, 63)
