In [2]:
import re, nltk
import numpy as np

In [3]:
with open("resources/summa.txt", encoding="utf8") as myfile:
    file1 = " ".join(line.rstrip() for line in myfile)
    
with open("resources/summa2.txt", encoding="utf8") as myfile:
    file2 = " ".join(line.rstrip() for line in myfile)

In [4]:
def clean(data):
    text = re.sub(r"[^a-zA-Z]+", " ", data)
    text = re.sub(r"^.*?(?=(QUESTION))", "", text)
    text = re.sub(r"(?<=(FOR EVER BLESSED Amen)).+", '', text)
    text = re.sub(r"ON FATE", "QUESTION", text)
    
    return text


def chapters_split(data):
    chapters = []
    chap = []

    cleaned = clean(data).split(" ")

    for i, word in enumerate(cleaned):

        if (word == "QUESTION") or (i==len(cleaned)-1):
            chapters.append(' '.join(chap))
            chap = []

        else:
            chap.append(word)

    chapters = [chap for chap in chapters[1:]]
    
    return chapters

In [37]:
def text_split(data, stopWords):
    text = re.sub(r"[^\w]+", " ", data).split(" ")
    
    words = [word.lower() for word in text if word not in stopWords and len(word) > 1 and word[0] != "'"]
                
    return words


def shingles(file1, k, stopWords):
    
    f1_words = text_split(file1, stopWords)
    
    f1_shingles = set(tuple(f1_words[i:i+k]) for i in range(len(f1_words)-k+1))
    
    return f1_shingles


def jaccard(f1, f2, k, stopWords):
    
    f1_shingles = shingles(f1, k, stopWords)
    f2_shingles = shingles(f2, k, stopWords)
    
    intersection = f1_shingles.intersection(f2_shingles)
    union = f1_shingles.union(f2_shingles)
        
    return 1 - len(intersection)/len(union)


In [6]:
stopWords = set(nltk.corpus.stopwords.words('english'))

# 1a

In [7]:
f1 = clean(file1)
f2 = clean(file2)

jaccard(f1, f2, 4, stopWords)

0.9959429683339193

# 1b

In [8]:
chapters = chapters_split(file1)
# stopWords = set()

In [9]:
JDDS = []
for i in range(len(chapters)-1):
    JDDS.append([])
    for j in range(i+1,len(chapters)):
        JD = jaccard(chapters[i], chapters[j], 7, stopWords)
        JDDS[i].append(JD)
#         print("Chapters {} and {} distance: {}".format(i, j, JD))

# Task2

In [66]:
def create_all_shingles(chapters, k, stopWords):
    all_set = set()
    
    for chap in chapters:
        all_set.update(shingles(chap, k, stopWords))
        
    return sorted(list(all_set))


def create_hash_family(all_shingles, n):
    length = len(all_shingles)
    
    perms = [np.random.permutation(length) for i in range(n)]
    
    return perms


def signatures(shingles1, shingles2, all_shingles, hash_family):
    
    signatures = np.zeros((len(hash_family), 2))
    
    for i, c_hash in enumerate(hash_family):
        s1_ready = s2_ready = False
        
        for j, index in enumerate(c_hash):
            if s1_ready == False and all_shingles[index] in shingles1:
                signatures[i,0] = j+1
                s1_ready = True
                
            if s2_ready == False and all_shingles[index] in shingles2:
                signatures[i,1] = j+1
                s2_ready = True
                
            if s1_ready and s2_ready:
                break
                
    return signatures

In [57]:
all_shingles = create_all_shingles(chapters, 7, stopWords)

In [74]:
n_hash = 10000
hash_family = create_hash_family(all_shingles, n_hash)

In [78]:
(shingle1, shingle2) = (shingles(chapters[99], 7, stopWords), shingles(chapters[100], 7, stopWords))

sig = signatures(shingle1, shingle2, all_shingles, hash_family)
sim = 1 - np.count_nonzero((sig[:,0] == sig[:,1]))/np.count_nonzero((sig[:,0] + sig[:,1] > 0))

In [79]:
sim

0.993

In [77]:
JDDS[99][0]

0.9922680412371134