In [19]:
from docx import Document
import os
import re
import csv
import sys

In [20]:
def read_stop_words(filename):
    '''
    Author: Eden

    Read stop words from a text file, one entry per line.

    -filename = path of the stop-word file

    return the set of distinct lines in the file (line terminators removed)
    '''
    with open(filename, 'r') as f:
        # splitlines() drops the line endings; set() removes duplicates.
        return set(f.read().splitlines())

In [21]:
def remove_stopwords(keywords, stop_set):
    '''
    Author: Zoe

    Drop stop words (articles, prepositions and so on) from a string.

    -keywords = text to filter; it is split on every single whitespace character
    -stop_set = collection of stop words; the membership test is case-sensitive

    return the remaining tokens joined by single spaces
    '''
    kept = []
    for token in re.split(r'\s', keywords):
        if token in stop_set:
            continue
        kept.append(token)
    return " ".join(kept)

### indent(para) Example
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Change of control means that (a) any Person or group (within the meaning of Rule13d-5 of the SEC under the Exchange Act) shall become the Beneficial Owner of 20% or more of the Voting Equity Interests of the Borrower, or (b) a majority of the members of the Board of Directors of the Borrower shall cease to be Continuing Members.

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Code shall mean the Internal Revenue Code of 1986, as the same may be amended or supplemented from time to time, and any successor statute of similar import, and the rules and regulations thereunder, as from time to time in effect.

In [22]:
def indent(para):
    '''
    Author: Yufei

    Get the first-line indent of the paragraph, used in find_patterns and match function

    -para = paragraph

    return the indent in points (the `.pt` of first_line_indent), or None when
    first_line_indent is unset or otherwise falsy (a zero indent also yields None)
    '''
    first_line = para.paragraph_format.first_line_indent
    return first_line.pt if first_line else None

### bullet_list(para) Example
Change of control means that 

&nbsp;&nbsp;&nbsp;&nbsp;(a) any Person or group (within the meaning of Rule13d-5 of the SEC under the Exchange Act) shall become the Beneficial Owner of 20% or more of the Voting Equity Interests of the Borrower, or 

&nbsp;&nbsp;&nbsp;&nbsp;(b) a majority of the members of the Board of Directors of the Borrower shall cease to be Continuing Members.

#### Difference between bullet_list and isListPara: bullet_list detects bullets or numbering that were typed manually into the paragraph text, whereas isListPara detects paragraphs that use Microsoft Word's built-in "List Paragraph" style.


In [23]:
# Candidate bullet/numbering prefixes, tried in this order; the first one that
# matches wins. The order matters: e.g. "i)" is claimed by the lower-case-letter
# pattern before the roman-numeral pattern is reached.
_BULLET_PATTERNS = (
    '[·]',                                            # middle dot
    u'[\u2022]',                                      # bullet character
    r'[A-Z]+\)',                                      # A)
    r'[a-z]+\)',                                      # a)
    r'\d+\)',                                         # 1)
    r'\d+\.[^\d]+',                                   # 1. text
    r'[a-z]+\.',                                      # a.
    r'[A-Z]+\.',                                      # A.
    r'\d+\.\d+[^\d]+',                                # 1.1 text
    r'\d+\.\d+\.\d+[^\d]+',                           # 1.1.1 text
    r'\d+\.\d+\.\d+\.\d+[^\d]+',                      # 1.1.1.1 text
    r'[\(]?([vx]|i{1,3}|i[vx]|[vx]i{1,3})(\.|\))',    # (i) / iv. / x)
    # Fully parenthesised markers. They come after the roman-numeral pattern so
    # that "(i)", "(v)" ... keep resolving to the roman pattern.
    r'\([A-Z]+\)',                                    # (A)
    r'\([a-z]+\)',                                    # (a)
    r'\(\d+\)',                                       # (1)
)


def bullet_list(para):
    '''
    Author: Eden

    Check if the paragraph starts with a manually typed bullet or ordered-list
    marker, used in find_patterns and match function

    -para = paragraph

    Only the first 10 characters of the left-stripped paragraph text are examined.

    return the regex pattern (a string) of the first matching marker style, or None
    '''
    head = para.text.lstrip()[:10]
    for pattern in _BULLET_PATTERNS:
        if re.match(pattern, head):
            return pattern
    return None

In [24]:
# '''
# Testing Function
# '''

# # Input folder path
# directory_path = os.getcwd() + '/input/change_of_control+change_in_control' 
# fileName = 'test.docx'
# document = Document('/Users/yangyi/Desktop' + "/" +fileName)
#     #
# for p in document.paragraphs:
# #     print p.text
#     pattern = bullet_list(p)
#     if pattern:
#         print pattern
#         print p.text

### isListPara(para) Example
1. any Person or group (within the meaning of Rule13d-5 of the SEC under the Exchange Act) shall become the Beneficial Owner of 20% or more of the Voting Equity Interests of the Borrower, or 
2. a majority of the members of the Board of Directors of the Borrower shall cease to be Continuing Members.

In [25]:
def isListPara(para):
    '''
    Author: Alan

    Check if the paragraph uses the 'List Paragraph' style rather than a normal
    paragraph style, used in find_patterns and match function

    -para = paragraph

    return True/False
    '''
    return para.style.name == 'List Paragraph'

### isNormal(para) Example
any Person or group (within the meaning of Rule13d-5 of the SEC under the Exchange Act) shall become the Beneficial Owner of 20% or more of the Voting Equity Interests of the Borrower, or 

In [26]:
def isNormal(para):
    '''
    Author: Alan

    Check if the paragraph uses the 'Normal' style rather than a list paragraph
    style, used in match function

    -para = paragraph

    return True/False
    '''
    return para.style.name == 'Normal'

### isNormalFollowingListPara(paras, curIndex) Example
Change of control means that 
1. any Person or group (within the meaning of Rule13d-5 of the SEC under the Exchange Act) shall become the Beneficial Owner of 20% or more of the Voting Equity Interests of the Borrower, or 
2. a majority of the members of the Board of Directors of the Borrower shall cease to be Continuing Members.

In [27]:
def isNormalFollowingListPara(paras, curIndex):
    '''
    Author: Alan

    Check if a 'Normal' paragraph is immediately followed by a 'List Paragraph'
    paragraph, used in find_patterns function

    -paras = all the paragraphs
    -curIndex = the index of the current paragraph

    return True/False (always False for the last paragraph)
    '''
    current_style = paras[curIndex].style.name
    is_last = curIndex == len(paras) - 1
    if is_last:
        # nothing follows the last paragraph
        return False
    next_style = paras[curIndex + 1].style.name
    return next_style == 'List Paragraph' and current_style == 'Normal'

### bold(para) Example
**Change of control** means that (a) any Person or group (within the meaning of Rule13d-5 of the SEC under the Exchange Act) shall become the Beneficial Owner of 20% or more of the Voting Equity Interests of the Borrower, or (b) a majority of the members of the Board of Directors of the Borrower shall cease to be Continuing Members.

**Code** shall mean the Internal Revenue Code of 1986, as the same may be amended or supplemented from time to time, and any successor statute of similar import, and the rules and regulations thereunder, as from time to time in effect.

In [63]:
def bold(para):
    '''
    Author: Zheng

    Check if any of the first four runs of the paragraph is bold, used in match function

    -para = paragraph

    return True/False
    '''
    for position, run in enumerate(para.runs):
        if position > 3:
            # only the opening runs of the paragraph are inspected
            break
        if run.bold:
            return True
    return False

### is_bold(para, keywords) Example
**Change of control** means that (a) any Person or group (within the meaning of Rule13d-5 of the SEC under the Exchange Act) shall become the Beneficial Owner of 20% or more of the Voting Equity Interests of the Borrower, or (b) a majority of the members of the Board of Directors of the Borrower shall cease to be Continuing Members.

In [62]:
def is_bold(para, keywords):
    '''
    Author: Zheng

    Check if the paragraph contains the keywords (case-insensitive) and one of
    its first two runs is bold, used in find_patterns function

    -para = paragraph
    -keywords = search words

    return True/False
    '''
    if keywords.lower() not in para.text.lower():
        return False
    return any(
        bool(run.bold)
        for position, run in enumerate(para.runs)
        if position <= 1
    )

### is_italic(para, keywords) Example
_Change of control_ means that (a) any Person or group (within the meaning of Rule13d-5 of the SEC under the Exchange Act) shall become the Beneficial Owner of 20% or more of the Voting Equity Interests of the Borrower, or (b) a majority of the members of the Board of Directors of the Borrower shall cease to be Continuing Members.

In [61]:
def is_italic(para, keywords):
    '''
    Author: Zheng

    Check if the paragraph contains the keywords (case-insensitive) and one of
    its first two runs is italic, used in find_patterns function

    -para = paragraph
    -keywords = search words

    return True/False
    '''
    found_italic = False
    if keywords.lower() in para.text.lower():
        inspected = 0
        for run in para.runs:
            if inspected == 2:
                # only the two opening runs are considered
                break
            inspected += 1
            found_italic = found_italic or bool(run.italic)
    return found_italic

### italic(para) Example
_Code_ shall mean the Internal Revenue Code of 1986, as the same may be amended or supplemented from time to time, and any successor statute of similar import, and the rules and regulations thereunder, as from time to time in effect.

In [60]:
def italic(para):
    '''
    Author: Zheng

    Check if any of the first four runs of the paragraph is italic, used in match function

    -para = paragraph

    return True/False
    '''
    for position, run in enumerate(para.runs):
        if position > 3:
            # Only the opening runs are inspected; stop silently (no debug
            # output) once they are exhausted.
            break
        if run.italic:
            return True
    return False

### is_underline(para, keywords) Example
Change of control(Underline) means that (a) any Person or group (within the meaning of Rule13d-5 of the SEC under the Exchange Act) shall become the Beneficial Owner of 20% or more of the Voting Equity Interests of the Borrower, or (b) a majority of the members of the Board of Directors of the Borrower shall cease to be Continuing Members.

In [59]:
# Zheng
def is_underline(para, keywords):
    '''
    Author: Zheng

    Check if the paragraph contains the keywords (case-insensitive) and one of
    its first two runs is underlined, used in find_patterns function

    -para = paragraph
    -keywords = keywords

    return True/False
    '''
    needle = keywords.lower()
    haystack = para.text.lower()
    if needle not in haystack:
        return False
    for position, run in enumerate(para.runs):
        if position > 1:
            break
        if run.underline:
            return True
    return False

### underline(para) Example

Code(Underline) shall mean the Internal Revenue Code of 1986, as the same may be amended or supplemented from time to time, and any successor statute of similar import, and the rules and regulations thereunder, as from time to time in effect.

In [58]:
def underline(para):
    '''
    Author: Zheng

    Check if any of the first four runs of the paragraph is underlined, used in match function

    -para = paragraph

    return True/False
    '''
    underlined_runs = 0
    for position, run in enumerate(para.runs):
        if position >= 4:
            # runs past the fourth are ignored
            break
        if run.underline:
            underlined_runs += 1
    return underlined_runs != 0

### is_double_quotes(para, keywords, stop_set) Example:
"Change of control" means that (a) any Person or group (within the meaning of Rule13d-5 of the SEC under the Exchange Act) shall become the Beneficial Owner of 20% or more of the Voting Equity Interests of the Borrower, or (b) a majority of the members of the Board of Directors of the Borrower shall cease to be Continuing Members.

In [34]:
def is_double_quotes(para, keywords, stop_set):
    '''
    Author: Zoe

    Check if the keyword is double quoted at the start of the paragraph, used in
    find_patterns function

    -para = paragraph
    -keywords = search words
    -stop_set = stop words that are removed from keywords before the comparison

    return True when every remaining keyword word occurs (case-sensitively) in the
    first double-quoted phrase and that phrase opens the paragraph, else False
    '''
    # Split on the straight double quote and on the curly double quotes.
    words = re.split(u'[\"\u201c\u201d]', para.text)
    if len(words) == 1:  # no double quotes
        return False

    # Filter the caller's keywords, not a notebook-level global.
    removed_words = remove_stopwords(keywords, stop_set).split(" ")

    # words[1] is treated as the start of the paragraph when the text in front
    # of it (words[0]) is short and does not begin with letters or spaces, e.g.
    # it is empty or a marker such as "(a) ".
    leading = words[0]
    opens_paragraph = (
        len(leading.strip()) < 7
        and re.match(r'[a-zA-Z ]*[a-zA-Z]+[a-zA-Z ]*', leading) is None
    )
    return opens_paragraph and all(word in words[1] for word in removed_words)

### double_quotes(para) Example:

"Code" shall mean the Internal Revenue Code of 1986, as the same may be amended or supplemented from time to time, and any successor statute of similar import, and the rules and regulations thereunder, as from time to time in effect.


In [35]:
def double_quotes(para):
    '''
    Author: Zoe

    Check if the paragraph starts with a double-quoted phrase, used in match function

    -para = paragraph

    return True/False
    '''
    pieces = re.split(u'[\"\u201c\u201d]', para.text)
    if len(pieces) == 1:
        # no double quote of any kind in the paragraph
        return False
    # If the text in front of the first quote is short and does not begin with
    # letters or spaces, the quoted phrase is regarded as the start of the paragraph.
    lead = pieces[0]
    begins_with_words = re.match(r'[a-zA-Z ]*[a-zA-Z]+[a-zA-Z ]*', lead) is not None
    return len(lead.strip()) < 7 and not begins_with_words

### is_single_quotes(para, keywords, stop_set) Example:
'Change of control' means that (a) any Person or group (within the meaning of Rule13d-5 of the SEC under the Exchange Act) shall become the Beneficial Owner of 20% or more of the Voting Equity Interests of the Borrower, or (b) a majority of the members of the Board of Directors of the Borrower shall cease to be Continuing Members.


In [36]:
def is_single_quotes(para, keywords, stop_set):
    '''
    Author: Zoe

    Check if the keyword is single quoted at the start of the paragraph, used in
    find_patterns function

    -para = paragraph
    -keywords = search words
    -stop_set = stop words that are removed from keywords before the comparison

    return True when every remaining keyword word occurs (case-sensitively) in the
    first quoted phrase and that phrase opens the paragraph, else False
    '''
    # NOTE(review): the split class holds the straight double quote plus the
    # curly single quotes, but not the ASCII apostrophe; single_quotes() uses
    # the same class, so change both together if that is unintended.
    words = re.split(u'[\"\u2018\u2019]', para.text)
    if len(words) == 1:    # no single quotes
        return False

    # Filter the caller's keywords, not a notebook-level global.
    removed_words = remove_stopwords(keywords, stop_set).split(" ")

    # words[1] is treated as the start of the paragraph when the text in front
    # of it (words[0]) is short and does not begin with letters or spaces.
    leading = words[0]
    opens_paragraph = (
        len(leading.strip()) < 7
        and re.match(r'[a-zA-Z ]*[a-zA-Z]+[a-zA-Z ]*', leading) is None
    )
    return opens_paragraph and all(word in words[1] for word in removed_words)

### single_quotes(para) Example:

'Code' shall mean the Internal Revenue Code of 1986, as the same may be amended or supplemented from time to time, and any successor statute of similar import, and the rules and regulations thereunder, as from time to time in effect.


In [37]:
def single_quotes(para):
    '''
    Author: Zoe

    Check if the paragraph starts with a single-quoted phrase, used in match function

    -para = paragraph

    return True/False
    '''
    # NOTE(review): the split class holds the straight double quote plus the
    # curly single quotes, but not the ASCII apostrophe; is_single_quotes() uses
    # the same class, so change both together if that is unintended.
    pieces = re.split(u'[\"\u2018\u2019]', para.text)
    if len(pieces) == 1:
        return False
    # If the text in front of the first quote is short and does not begin with
    # letters or spaces, the quoted phrase is regarded as the start of the paragraph.
    lead = pieces[0]
    begins_with_words = re.match(r'[a-zA-Z ]*[a-zA-Z]+[a-zA-Z ]*', lead) is not None
    return len(lead.strip()) < 7 and not begins_with_words

### is_upper_case(para, keywords) Example:
CHANGE OF CONTROL shall mean the Internal Revenue Code of 1986, as the same may be amended or supplemented from time to time, and any successor statute of similar import, and the rules and regulations thereunder, as from time to time in effect.


In [38]:
def is_upper_case(para, keywords):
    '''
    Author: Zoe

    Check if the paragraph starts with an upper-case word, used in find_patterns function

    -para = paragraph
    -keywords = search words (accepted for a uniform is_* signature; not used)

    return True when the paragraph text begins with one or more A-Z letters
    immediately followed by whitespace, else False
    '''
    # Anchor the test at the start of the text. Searching the whole paragraph
    # for the first all-caps token and comparing it with the prefix reports
    # "Borrower ... Tranche B loans" as upper case, because "B" is a prefix.
    return re.match(r'[A-Z]+\s', para.text) is not None

### upper_case(para) Example:

UPPERCASE shall mean the Internal Revenue Code of 1986, as the same may be amended or supplemented from time to time, and any successor statute of similar import, and the rules and regulations thereunder, as from time to time in effect.


In [39]:
def upper_case(para):
    '''
    Author: Zoe

    Check if the paragraph starts with an upper-case word, used in match function

    -para = paragraph

    return True when the paragraph text begins with one or more A-Z letters
    immediately followed by whitespace, else False
    '''
    # Anchor the test at the start of the text. Searching the whole paragraph
    # for the first all-caps token and comparing it with the prefix reports
    # "Borrower ... Tranche B loans" as upper case, because "B" is a prefix.
    return re.match(r'[A-Z]+\s', para.text) is not None

#### Remove_Stopwords Test

In [40]:
# Testing cell: load the stop-word list and show the keyword with its stop words removed.
# The names assigned here are notebook-level globals; search_2 and match read
# `stop_set` when no stop-word set is passed to them.
directory_path = 'res'
fileName = 'stop_words.txt'
keyword = "Change of Control"
stop_set = read_stop_words('/'.join([directory_path, fileName]))
print(remove_stopwords(keyword, stop_set))

Change Control


In [57]:
def search_2(paras, keyword, stop_words=None):
    '''
    Author: Eden

    Still Testing

    Traverse the paragraphs and find those that contain all the keyword's
    non-stop-words within a sliding window (see contains_sliding_window)

    -paras = all paragraphs
    -keyword = search words
    -stop_words = stop words to strip from the keyword; when omitted, the
                  notebook-level `stop_set` is used

    return a list of paragraph index
    '''
    if stop_words is None:
        # Backward-compatible default: fall back to the notebook-level set.
        stop_words = stop_set
    target = remove_stopwords(keyword.lower(), stop_words)
    # The window reaches twice the keyword length to either side of the probe.
    window = 2 * len(keyword)
    return [
        position
        for position, para in enumerate(paras)
        if contains_sliding_window(para.text.lower(), target, window)
    ]

In [56]:
def contains_sliding_window(text, keyword, win_len):
    '''
    Author: Eden

    Still Testing

    Find the keywords by sliding windows so that even if the order of the
    keywords changes, we can still fetch the results

    -text = text to search
    -keyword = search words, separated by whitespace
    -win_len = number of characters the window extends to each side of the
               start of an occurrence of the first search word

    return True if, around some occurrence of the first search word, every
    search word occurs (as a substring, in any order) inside the window;
    otherwise False
    '''
    target_words = re.split(r'[\s]', keyword)
    probe = target_words[0]

    # Visit every occurrence of the probe in the ORIGINAL text. Advancing the
    # search position past each hit guarantees termination, and slicing the
    # window out of the untouched text keeps the full left-hand context.
    start = text.find(probe)
    while start != -1:
        window_str = text[max(0, start - win_len):min(len(text), start + win_len)]
        if all(target_word in window_str for target_word in target_words):
            return True
        start = text.find(probe, start + 1)
    return False

In [43]:
# Testing cell for contains_sliding_window.
# `keyword` is not defined in this cell; it must already exist from an earlier cell.
text = 'aaChange of Control means an event or series of events by which:'
if contains_sliding_window(text.lower(), 'change control', len(keyword) * 2):
    print(text + '1')

aaChange of Control means an event or series of events by which:1


### upper_camel_case(para, stop_set) Example:
Capitalized Lease Obligations means, with respect to any Person, all outstanding obligations of such Person in respect of Capital Leases, taken at the capitalized amount thereof accounted for as indebtedness in accordance with GAAP.

Change of Control means that (a) any Person or group (within the meaning of Rule13d-5 of the SEC under the Exchange Act) shall become the Beneficial Owner of 20% or more of the Voting Equity Interests of the Borrower, or (b) a majority of the members of the Board of Directors of the Borrower shall cease to be Continuing Members.

Code shall mean the Internal Revenue Code of 1986, as the same may be amended or supplemented from time to time, and any successor statute of similar import, and the rules and regulations thereunder, as from time to time in effect.

In [44]:
def upper_camel_case(para, stop_set):
    '''
    Author: Zoe and Zheng

    Check if the paragraph opens with words following the upper camel rule,
    used in match function

    -para = paragraph
    -stop_set = stop words

    How it works: stop words are removed, then the leading run of capitalised
    words (capital letter followed by lower-case letters, digits or apostrophes)
    is located; the last word of that run is the anchor. The result is True when
    the original space-separated words up to and including the anchor number
    more than one.

    return True/False
    '''
    raw_tokens = re.split(" ", para.text)
    stripped_text = remove_stopwords(para.text, stop_set)
    stripped_tokens = re.split(" ", stripped_text)
    capitalised = re.findall(r'([A-Z][a-z0-9\']+)[^A-Z]?\s', stripped_text)

    # Walk both sequences in step; the anchor is the last capitalised word that
    # still lines up with the stop-word-free token sequence.
    anchor = ""
    for cap_word, token in zip(capitalised, stripped_tokens):
        if cap_word != token:
            break
        anchor = cap_word

    # NOTE(review): when the paragraph does not open with a capitalised word the
    # anchor stays empty; unless the text contains an empty token, the prefix
    # then spans every token, so such paragraphs with two or more tokens yield
    # True. Confirm this is intended (is_upper_camel_case shares the approach).
    if anchor in raw_tokens:
        prefix_length = raw_tokens.index(anchor) + 1
    else:
        prefix_length = len(raw_tokens)
    return prefix_length > 1

### is_upper_camel_case(paragraph, keyword, stop_set) Example:
Change of Control means that (a) any Person or group (within the meaning of Rule13d-5 of the SEC under the Exchange Act) shall become the Beneficial Owner of 20% or more of the Voting Equity Interests of the Borrower, or (b) a majority of the members of the Board of Directors of the Borrower shall cease to be Continuing Members.


In [55]:
def is_upper_camel_case(paragraph, keyword, stop_set):
    '''
    Author: Zoe and Zheng

    Check if the keyword lies in the opening upper-camel-case phrase of the
    paragraph, used in find_patterns function

    -paragraph = paragraph
    -keyword = keyword
    -stop_set = stop words

    How it works: stop words are removed, then the leading run of capitalised
    words is located; the last word of that run is the anchor. The original
    words up to and including the anchor are joined with spaces and searched
    for the keyword, ignoring case.

    return True/False
    '''
    word_pattern = re.compile(r'\w+')
    original_words = word_pattern.findall(paragraph.text)
    stripped_text = remove_stopwords(paragraph.text, stop_set)
    stripped_words = word_pattern.findall(stripped_text)
    capitalised = re.findall(r'([A-Z][a-z0-9\']+)[^A-Z]?\s', stripped_text)

    # The anchor is the last capitalised word that still lines up with the
    # stop-word-free word sequence.
    anchor = ""
    for cap_word, word in zip(capitalised, stripped_words):
        if cap_word != word:
            break
        anchor = cap_word

    # NOTE(review): with no leading capitalised word the anchor stays empty and
    # the whole paragraph is searched for the keyword; confirm this is intended.
    if anchor in original_words:
        phrase_words = original_words[:original_words.index(anchor) + 1]
    else:
        phrase_words = original_words
    return keyword.lower() in " ".join(phrase_words).lower()

In [47]:
def search(paras, keywords):
    '''
    Author: Yufei

    Traverse the paragraphs and find those whose text contains the keywords
    (case-insensitive substring test)

    -paras = all paragraphs
    -keywords = search words

    return a list of paragraph index
    '''
    return [
        position
        for position, para in enumerate(paras)
        if keywords.lower() in para.text.lower()
    ]

In [48]:
def find_patterns(paras, index, keyword, stop_set):
    '''
    Author: Yufei

    Find the pattern of the paragraph containing the keywords by calling the
    is_* functions

    -paras = all paragraphs
    -index = paragraph index
    -keyword = search words
    -stop_set = stop words

    return a list with one entry per check, in this order:
    [bold, italic, underline, double_quotes, single_quotes, upper case,
     upper camel case, indent space, list paragraph,
     normal following list paragraph, bullet list]
    Most entries are True/False; the indent entry is a number or None and the
    bullet list entry is a regex pattern or None.
    '''
    target = paras[index]
    return [
        is_bold(target, keyword),
        is_italic(target, keyword),
        is_underline(target, keyword),
        is_double_quotes(target, keyword, stop_set),
        is_single_quotes(target, keyword, stop_set),
        is_upper_case(target, keyword),
        is_upper_camel_case(target, keyword, stop_set),
        indent(target),
        isListPara(target),
        isNormalFollowingListPara(paras, index),
        bullet_list(target),
    ]

In [49]:
def match(paras, targetPatterns, startIndex, stop_words=None):
    '''
    Author: Yufei

    Starting after the target paragraph, look for the next paragraph whose
    pattern list equals the target's, and collect everything in front of it

    -paras = all paragraphs
    -targetPatterns = the target pattern (as returned by find_patterns)
    -startIndex = index of the target paragraph
    -stop_words = stop words passed to upper_camel_case; when omitted, the
                  notebook-level `stop_set` is used

    return (content, endIndex): content is the list of paragraphs from
    startIndex up to, but not including, the first later paragraph with the same
    patterns (or up to the end of the document when there is none); endIndex is
    the index of the last paragraph in content
    '''
    if stop_words is None:
        # Backward-compatible default: fall back to the notebook-level set.
        stop_words = stop_set

    print('\tTarget Patterns: ' + str(parttern_translate(targetPatterns)))

    i = startIndex + 1
    while i < len(paras):
        p = paras[i]
        # Same checks, in the same order, as find_patterns, but without a keyword.
        patterns = [
            bold(p),
            italic(p),
            underline(p),
            double_quotes(p),
            single_quotes(p),
            upper_case(p),
            upper_camel_case(p, stop_words),
            indent(p),
            isListPara(p),
            isNormalFollowingListPara(paras, i),
            bullet_list(p),
        ]

        print('\n\tNext Paragraph:\t{0}\n\n\t\t{1}'.format(str(parttern_translate(patterns)), p.text.encode("utf-8")))

        if patterns == targetPatterns:
            print('Bingo Bingo Bingo Bingo')
            break
        i = i + 1

    content = [paras[x] for x in range(startIndex, i)]
    return content, i - 1

In [50]:
def parttern_translate(pattern):
    '''
    Author: Eden

    Translate a pattern list into the names of the patterns that are set

    -pattern = the pattern list (11 entries, in find_patterns order)

    return a list with the name of every truthy entry; for the indent entry
    (position 7) the indent value itself is listed instead of a name
    '''
    # Position 7 has no label: its value (the indent) is reported as-is.
    labels = (
        'bold',
        'italic',
        'underline',
        'double_quotes',
        'single_quotes',
        'upper_case',
        'upper_camel_case',
        None,
        'list_paragraph',
        'normal_following_list_paragraph',
        'bullet_list',
    )
    names = []
    for position, label in enumerate(labels):
        value = pattern[position]
        if not value:
            continue
        names.append(value if label is None else label)
    return names
    

In [51]:
# # Abhay
# '''
# Open the output file in append mode and pull related language from the files in the input folder
# '''

# ########################################################################
# out_filepath = 'Output/'
# out_filename = 'Output_search.csv'
# out_path = out_filepath + out_filename
# ########################################################################


# header = ['File Name', 'Keywords', 'Change of Control'] # Column names in output file 

# with open(out_path, 'wb') as f:
#     wr = csv.writer(f)
#     wr.writerow(header)

# with open(out_path, 'ab') as f:
#     # Input folder path
#     directory_path = 'input' 
    
#     # Parse through each file in the input folder path
#     for fileName in os.listdir(directory_path):
#         print fileName
        
#         # Create a Document object of each of the files
#         #document = Document(directory_path + "/" +fileName)
#         document = Document('input/upper_camel_case+underline+after_list_paragraph.docx')
        
        
#         # A list to add all the paragraphs in the document
#         paras = []
        
#         '''
#         The following for loop is used to add all the paragraphs in the document to the paras list object
#         '''
#         for p in document.paragraphs:        
#             paras.append(p)
            
#         paras_number = len(paras) # Find the number of paragraphs in the document
        
#         paras_toPrint = [] # A list to add all the paragraphs relevant to 'Change of Control' language in this case
        
#         keyword = "Change of Control"
#         keywordindex = search(paras, keyword)
        
#         print('keywordIndex:', keywordindex)
        
#         results = []
#         result = ""
#         for index in keywordindex:
#             print 'Target Para', paras[index].text
#             target = find_patterns(paras, index, keyword)
            
#             if index == paras_number - 1:
#                 result = paras[index]
#             else:
#                 result = match(paras, target, index)
            
#             results.append(result)

#         wr = csv.writer(f)
#         wr.writerow([fileName, keyword, results])
        
# print("end")
# f.close()

In [54]:

'''
Main Function.

02/19/2018 
1. Modified the System output to make programmers easier to debug.
2. Modified the output format in .csv files
3. encode all output string to be utf-8
4. Added one function to output the patterns name rather than simple true/false
5. improve the robustness of the functions
'''

########################################################################
out_filepath = 'output/'
out_filename = 'Output_search.csv'
out_path = out_filepath + out_filename
########################################################################


header = ['File Name', 'Keywords', 'Content'] # Column names in output file 

# Input folder path
directory_path = 'input/change_of_control+change_in_control'

# Search phrase and stop words; both are notebook-level names that the helper
# functions may fall back on.
keyword = "Change of Control"
stop_set = read_stop_words('res/stop_words.txt')

# The file is opened once: the header row and every result row go through the
# same writer, and the `with` block closes the file.
with open(out_path, 'wb') as f:
    wr = csv.writer(f)
    wr.writerow(header)

    # Parse through each file in the input folder path
    print('Keyword: ' + keyword + '\n')
    for fileName in os.listdir(directory_path):
        # Skip the '.DS_Store' entry and '~$' temporary files.
        if fileName == '.DS_Store' or fileName.startswith('~$'):
            continue
        print('-------------------------------------------------------------------------')
        print('File name: ' + fileName + '\n')

        # Create a Document object of each of the files
        document = Document(directory_path + "/" + fileName)

        # All the paragraphs in the document
        paras = list(document.paragraphs)
        paras_number = len(paras)  # Number of paragraphs in the document

        keywordindex = search(paras, keyword)

        # Each entry of `results` is either a single paragraph or a list of
        # paragraphs (the content returned by match).
        results = []
        endindex = -1
        for index in keywordindex:
            # A hit whose keyword is not within the first 100 characters is
            # emitted as a single paragraph, without pattern matching.
            # NOTE(review): this also runs for paragraphs already covered by a
            # previous match and moves `endindex` backwards; confirm intended.
            if not keyword.lower() in paras[index].text[:100].lower():
                results.append(paras[index])
                endindex = index
            print('\tTarget Paragraph: [{0}]\n\t\t{1}\n'.format(index, paras[index].text.encode("utf-8")))

            # Hits at or before `endindex` have already been emitted.
            if index <= endindex:
                continue

            target = find_patterns(paras, index, keyword, stop_set)
            if index == paras_number - 1:
                # Nothing follows the last paragraph, so there is nothing to match.
                result = paras[index]
            else:
                result, endindex = match(paras, target, index)

            results.append(result)

        # One CSV row per result.
        # Ignored unrecognized ascii characters that throw an encoding error. Need improvement
        for result in results:
            if hasattr(result, '__iter__'):
                # The loop variable must not be called `paras`: in Python 2 a list
                # comprehension variable leaks and would overwrite the paragraph list.
                text = [para.text.encode('ascii', 'ignore') for para in result]
                res = "\n".join(text)
            else:
                res = result.text.encode('ascii', 'ignore')

            wr.writerow([fileName, keyword, res])

        print('\n')
print("end")

Keyword: Change of Control

-------------------------------------------------------------------------
File name: 6R4lR5Afx2UHpGwugAeZxx.docx



-------------------------------------------------------------------------
File name: 1Y0ki1eVSBHxQNmWIfj6Or.docx

	Target Paragraph: [19]
		a. Termination Due to a Change in Control. If (A) Employer (either Northrim BanCorp, Inc. or Northrim Bank) is subjected to a Change of Control (as defined in Section 5(f)(i)), and (B) either Employer or its assigns terminates Executive’s employment without Cause (either during the annual term of this Agreement or by refusing to extend this Agreement when the annual termination occurs every December 31) or Executive terminates his employment for Good Reason within 730 days of such Change of Control, then Employer shall pay Executive in a lump sum: (i) all Base Salary earned and all reimbursable expenses incurred under this Agreement through such termination date; (ii) an amount equal to two (2) times Execut


	Next Paragraph:	['underline', 'upper_camel_case', 'bullet_list']

		(ii) Repayments Upon Sales of Assets. Subject to prior application in accordance with the terms of any documentation governing any Senior Debt, unless otherwise agreed by the Required Noteholders, on the fifth Business Day following the receipt of Net Proceeds (Asset Sales) in an aggregate amount greater than $15,000,000 for each fiscal year of the Issuer other than sale of inventory in the ordinary course of business, the Notes shall be repaid in an amount equal to such Net Proceeds (Asset Sales), together with any accrued interest on the portion of the Notes repaid; provided, however, that no such repayment shall be required if the Issuer notifies the Noteholders on or before the date such repayment would otherwise be required under this Section 2.01(b)(ii) that the Issuer or its Subsidiaries intend to use any or all of such Net Proceeds (Asset Sales) to invest in capital assets or Investments in the business of th

	Target Paragraph: [87]
		3.3           Change of Control. If Executive terminates this Agreement or his employment with the Company for Good Reason or if Executive’s employment with the Company is terminated by the Company for any reason other than for Cause, including non-renewal of this Agreement by the Company, and such termination occurs during a Change of Control Period, the Company shall pay to Executive a change of control indemnity of: (i) the Severance Indemnity as defined in Section 3.2; and (ii) a lump-sum payment equal to one hundred percent (100%) of the higher of: (A) the greater of (x) Executive’s target bonus as in effect for the fiscal year in which the Change of Control occurs or (y) Executive’s target bonus as in effect for the fiscal year in which Executive’s termination of employment occurs; or (B) Executive’s actual bonus for performance during the calendar year prior to the calendar year during which the termination of employment occurs. For avoidance of doubt, 

In [53]:
# Show the kernel's working directory; the relative 'res/', 'input/...' and
# 'output/' paths used in this notebook are resolved against it.
os.getcwd()

'/Users/shenglan/Documents/Capstone/Code/Capstone-Project-Mylan'