In [None]:
# Input file selection. Exactly one of the two file sets below should be
# active; switch between them by commenting/uncommenting.
#   eng1file / spa1file: the English text and its Spanish translation, which
#                        are read and aligned paragraph by paragraph further down
#   eng2file:            a second English text whose paragraphs are compared
#                        against the English 1 paragraphs

# Comparing King James Bible to Common English Bible
eng1file = "a1_KingJamesBible_English1.txt"
spa1file = "a1_KingJamesBible_Spanish1.txt"
eng2file = "a2_CommonEnglishBible_English2.txt"

# # Comparing Universal Declaration of Human Rights to International Covenant on Economic, Social and Cultural Rights
# eng1file = "b1_UDHR_English1.txt"
# spa1file = "b1_UDHR_Spanish1.txt"
# eng2file = "b2_ICESCR_English2.txt"


# Print the names of the selected files (nothing has been opened yet at this point)
print("Files imported:")
print("English 1:",eng1file,"\nSpanish 1:",spa1file,"\nEnglish 2:",eng2file)

import re

def readFile(file, encoding=None):
    """Read a text file and return its non-blank lines, cleaned up.

    Args:
        file: Path of the text file to read.
        encoding: Optional text encoding passed to open(). The default
            (None) keeps the platform default, exactly as before; pass
            e.g. "utf-8" when the file contains accented characters and
            the platform default does not match.

    Returns:
        A list of strings, one per line that is not entirely whitespace,
        with leading/trailing whitespace stripped and any remaining
        newline or tab characters replaced by a single space.
    """
    # 'with' closes the handle even if reading raises
    with open(file, 'r', encoding=encoding) as inFile:
        lines = inFile.readlines()

    cleaned = []
    for line in lines:
        #trim leading and trailing spaces
        st = line.strip()
        #exclude lines that are all white spaces (they strip down to '')
        if not st:
            continue
        #replace newline and tab characters with a space
        cleaned.append(re.sub(r'\n|\t', ' ', st))
    return cleaned

#check that English and Spanish have same number of paragraphs
def validate(eng,spa):
    """Report whether the two paragraph lists can be aligned one-to-one.

    Args:
        eng: List of English paragraphs.
        spa: List of Spanish paragraphs.

    Returns:
        True when both lists have the same length, False otherwise.
        A status message is printed in either case.
    """
    if len(eng) != len(spa):
        print("Files do not have the same number of paragraphs. Please fix.")
        # explicit False instead of implicitly falling off the end (None)
        return False
    print("\nSuccess! Files aligned.")
    return True

#remove punctuation/brackets, make lowercase, split into words (to be used for sets)
# Characters deleted by clean(): " ! : , . en-dash, curly double quotes,
# square brackets, parentheses, slash and the ASCII digits 0-9.
# Written as a raw string so the backslash escapes reach the regex engine
# unchanged, and compiled once instead of on every call.
_CLEAN_PATTERN = re.compile(r'["!:,.–“”\[\]()/0-9]')

def clean(text):
    """Normalize a paragraph into a list of lowercase words.

    Args:
        text: The paragraph to normalize.

    Returns:
        The whitespace-separated words of ``text`` after the characters
        matched by ``_CLEAN_PATTERN`` are removed and the remainder is
        lowercased. An empty or punctuation/digit-only input gives [].
    """
    return _CLEAN_PATTERN.sub('', text).lower().split()

#align English 1 and Spanish 1 by paragraph index numbers
def align(eng,spa):
    """Pair up English 1 and Spanish 1 paragraphs that share an index.

    Args:
        eng: List of English 1 paragraphs.
        spa: List of Spanish 1 paragraphs; indexed with every index of
            ``eng``, so a shorter list raises IndexError.

    Returns:
        A list with one entry per English paragraph, each entry being
        [0] the English paragraph, [1] the Spanish paragraph and
        [2] the set of cleaned, lowercase English words.
    """
    # first pass: build the [English, Spanish] pairs
    paired = [[english, spa[idx]] for idx, english in enumerate(eng)]

    # second pass: attach the normalized English word set as the 3rd item
    for entry in paired:
        entry.append(set(clean(entry[0])))
    return paired

def printPreview(a):
    """Print the first two aligned paragraph pairs as a sanity check.

    Args:
        a: Iterable of aligned entries where index 0 is the English
            paragraph and index 1 is the Spanish paragraph.
    """
    divider = "***************************"
    print("\nHere is a preview of the aligned text:\n" + divider)
    for shown, entry in enumerate(a, start=1):
        print("English:\t", entry[0])
        print("Spanish:\t", entry[1])
        print(divider)
        # stop once two pairs have been displayed
        if shown == 2:
            break

# For English 2 file, create list with [0] English lines, and [1] clean-lowercase English word set
def makeKeyWords(inputlist):
    """Attach a normalized word set to every English 2 paragraph.

    Args:
        inputlist: Iterable of English 2 paragraphs.

    Returns:
        A list of two-item lists: [0] the paragraph as given and
        [1] the set of words produced by running clean() on it
        (punctuation removed, lowercased, split on whitespace).
    """
    return [[paragraph, set(clean(paragraph))] for paragraph in inputlist]

def _prompt_minimum_overlap():
    """Ask on stdin until a whole number from 1-100 is entered; return it as a fraction."""
    while True:
        n = input("Enter number from 1-100 to set mimimum overlap score: ")
        #make sure they enter it in the correct format
        if re.search("^[1-9][0-9]?$|^100$", n):
            return int(n) / 100
        print("You did not enter a number from 1-100. Please try again")


def compare(aligned,neweng):
    """Print every English 1 paragraph that overlaps enough with an English 2 paragraph.

    The overlap score of a pair is
    [# of words shared by both word sets] / [# of words in the English 2 word set].
    The user is prompted for the minimum score (1-100, as a percentage).
    For each pair reaching it, the English 2 paragraph, the matching
    English 1 paragraph, its Spanish 1 counterpart and the overlap
    details are printed.

    Args:
        aligned: List of entries [English 1 paragraph, Spanish 1 paragraph,
            English 1 word set].
        neweng: List of entries [English 2 paragraph, English 2 word set].

    Returns:
        None; all results are printed.
    """
    print("\nNow we'll compare paragraphs in English 1 and English 2 to look for overlapping words.")
    print("If we find overlap, we'll print the related Spanish 1 paragraph.\n")
    print("We'll use an 'overlap score' for the English paragraph comparison defined as follows:")
    print("[# of intersecting words in English 1 & 2 word sets] / [Total # words in English 2 word set]")

    #allow user to enter the overlap score they want to test
    minimum = _prompt_minimum_overlap()
    print("\nMatches:\n")
    print("***************************")

    #loop through English 2 (paragraph numbers are displayed 1-based)
    for v2count, eng2pars in enumerate(neweng, start=1):
        eng2_words = eng2pars[1]
        d = len(eng2_words) #length of English 2 word set

        # Only English 2 word sets with more than 2 words are compared
        # (otherwise we get too many matches for things like "chapter x" / "article x").
        # Skipping here also rules out division by zero for an empty word set,
        # and the for-loops below always advance, so an empty set can no
        # longer stall the comparison.
        if d <= 2:
            continue

        for v1count, eng1pars in enumerate(aligned, start=1):
            #find intersection between English 1 and English 2 word sets
            matched_words = eng2_words & eng1pars[2]
            g = len(matched_words) #number of intersecting words

            # Show matches only if they meet the overlap threshold
            if g/d < minimum:
                continue

            #print the results
            print("English 2, paragraph ",v2count,":",sep="")
            print(eng2pars[0])

            print("\nEnglish 1, paragraph ",v1count," (match):",sep="")
            print(eng1pars[0],"\n")

            print("Spanish 1:")
            print(eng1pars[1],"\n")

            print("Overlapping words:",g,matched_words)
            print("English 2 Word Set Length:",d)
            print("Overlap score:",(g/d)*100,"\n")
            print("**************************************")
          
# Driver: runs the whole pipeline using the file names selected at the top.

#read in English and Spanish translation (non-blank, whitespace-trimmed lines of each file)
eng1 = readFile(eng1file)
spa1 = readFile(spa1file)

#check if English and Spanish can be aligned
# (validate only compares the two line counts and prints the outcome)
valid = validate(eng1,spa1)

# Everything below is skipped when the counts differ, because align pairs
# the paragraphs purely by index.
if valid:
    #align, create English 1 word set, preview alignment
    engSpa = align(eng1,spa1) 
    printPreview(engSpa)
    
    #read in 2nd (new) English file and make word sets
    eng2 = readFile(eng2file)
    eng2set = makeKeyWords(eng2)
    
    #run comparison (prompts for the minimum overlap score and prints the matches;
    # compare has no return statement, so runit ends up as None)
    runit = compare(engSpa,eng2set)


Files imported:
English 1: a1_KingJamesBible_English1.txt 
Spanish 1: a1_KingJamesBible_Spanish1.txt 
English 2: a2_CommonEnglishBible_English2.txt

Success! Files aligned.

Here is a preview of the aligned text:
***************************
English:	 Chapter 1
Spanish:	 Capítulo 1
***************************
English:	 In the beginning God created the heaven and the earth.
Spanish:	 EN el principio crió Dios los cielos y la tierra.
***************************

Now we'll compare paragraphs in English 1 and English 2 to look for overlapping words.
If we find overlap, we'll print the related Spanish 1 paragraph.

We'll use an 'overlap score' for the English paragraph comparison defined as follows:
[# of intersecting words in English 1 & 2 word sets] / [Total # words in English 2 word set]
Enter number from 1-100 to set mimimum overlap score: 80

Matches:

***************************
English 2, paragraph 7:
4 God saw how good the light was. God separated the light from the darkness.

Englis