In [1]:
import difflib
import re

# Load the raw text of the 1725 and 1728 editions, plus the list of speaker
# prefixes (dramatis personae) that is later used to split each edition into
# speeches.
# The encoding is stated explicitly: the "\ufeff" stripping below only works
# when the files are decoded as UTF-8, so the platform default must not be
# allowed to pick a different codec.
with open('text_files/1725 textfile.txt', 'r', encoding='utf-8') as file1:
    edition1_text = file1.read()

with open('text_files/1728 textfile.txt', 'r', encoding='utf-8') as file2:
    edition2_text = file2.read()

with open('text_files/Dramatis personae string list.txt', 'r', encoding='utf-8') as names_file:
    # One speaker prefix per line; `names` keeps the raw lines.
    names = names_file.readlines()

# Remove the line break and any byte-order mark from every entry.
names_list = [s.replace("\n", "").replace("\ufeff", "") for s in names]
# Drop blank lines: an empty name would turn into an empty regex alternative
# when the names are used to split the text.
names_list = [s for s in names_list if s]

In [2]:
# Display the parsed speaker prefixes as a visual sanity check.
# NOTE(review): the recorded output shows repeated entries ('ber.', 'sail.');
# confirm whether the duplicates in the source list are intentional.
names_list

['fran.',
 'ber.',
 'mar.',
 'mer.',
 'hor.',
 'ber.',
 'king.',
 'vol.',
 'volt.',
 'laer.',
 'ham.',
 'queen.',
 'all.',
 'both.',
 'oph.',
 'pol.',
 'ghost.',
 'rey.',
 'ros.',
 'rof.',
 'guil.',
 '1 clown.',
 '2 clown.',
 'clown.',
 'osr.',
 'ofr.',
 'priest.',
 'sail.',
 'mes.',
 'mef.',
 'play.',
 'for.',
 'fort.',
 'capt.',
 'cap.',
 'gentlemen within.',
 'gent.',
 'ser.',
 'sail.',
 'lord.',
 'amb.']

In [3]:
def data_clean(text):
    """Return a lower-cased copy of ``text`` with layout characters deleted.

    The characters removed are: line feed, backslash, forward slash, hyphen
    and the byte-order mark (U+FEFF). They are deleted outright rather than
    replaced by a space, so the text on either side of a removed character is
    joined directly.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The normalised text.
    """
    # Every character listed in the third argument is mapped to "delete".
    removal_table = str.maketrans("", "", "\n\\/-\ufeff")
    return text.lower().translate(removal_table)

In [4]:
# Normalise both editions with the same cleaning rules so that later
# comparisons are not affected by case, line breaks or hyphenation.
clean_text_1725 = data_clean(edition1_text)
clean_text_1728 = data_clean(edition2_text)

In [5]:
def split_text(hamlet_text, character_names):
    """Split a play text into one string per speech, keyed by speaker prefix.

    The text is cut at every occurrence of one of ``character_names``. Each
    occurrence starts a new entry of the form ``"<name>: <speech>"``; the
    speech is the whitespace-trimmed text up to the next occurrence. Any text
    before the first speaker prefix is discarded.

    Parameters
    ----------
    hamlet_text : str
        The (already cleaned) play text.
    character_names : iterable of str
        Speaker prefixes exactly as they appear in the text (e.g. ``"ham."``).
        Blank entries and duplicates are ignored.

    Returns
    -------
    list of str
        One entry per speaker-prefix occurrence, in text order. Empty when no
        usable name is supplied or none occurs in the text.

    Notes
    -----
    NOTE(review): matching is purely textual with no word-boundary anchor, so
    a prefix such as ``"king."`` also matches the tail of ``"speaking."``.
    Confirm whether that is acceptable for the source texts.
    """
    # Blank names would become an empty regex alternative that matches at
    # every position and shreds the text into single characters.
    unique_names = {name for name in character_names if name.strip()}
    if not unique_names:
        return []

    # Regex alternation takes the first alternative that matches, so longer
    # names go first: a name must never be shadowed by one of its own
    # prefixes. The secondary sort key only makes the pattern deterministic.
    ordered_names = sorted(unique_names, key=lambda name: (-len(name), name))
    pattern = "(" + "|".join(re.escape(name) for name in ordered_names) + ")"

    # With exactly one capturing group, re.split alternates
    # [text, name, text, name, ..., text]: odd positions are speaker prefixes.
    parts = re.split(pattern, hamlet_text)

    character_dialogues = []
    for position, part in enumerate(parts):
        if position % 2 == 1:
            character_dialogues.append(part.strip() + ":")
            continue

        speech = part.strip()
        # Skip empty stretches, and anything before the first speaker.
        if speech and character_dialogues:
            character_dialogues[-1] += " " + speech

    return character_dialogues



In [6]:
# Break each cleaned edition into a list of "<speaker>: <speech>" strings,
# using the same speaker-prefix list for both.
character_dialogues_1725 = split_text(clean_text_1725, names_list)
character_dialogues_1728 = split_text(clean_text_1728, names_list)

In [7]:
from fuzzywuzzy import fuzz

list1, list2 = character_dialogues_1725, character_dialogues_1728

# Score every 1725 speech against every 1728 speech (all pairs), in the same
# row-by-row order as a nested comprehension would produce.
similarity_scores = []
for speech_1725 in list1:
    similarity_scores.extend(
        fuzz.ratio(speech_1725, speech_1728) for speech_1728 in list2
    )

# Only the single best pair is reported; one identical speech in both
# editions is enough to reach the top score.
best_score = max(similarity_scores)
print("Levenshtein Similarity (max score):", best_score)




Levenshtein Similarity (max score): 100


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

list1, list2 = character_dialogues_1725, character_dialogues_1728

# Treat each edition as one document by joining its speeches with commas.
edition_documents = [','.join(speeches) for speeches in (list1, list2)]

# Note: despite its name, `vectorizer` holds the matrix returned by
# fit_transform, not the CountVectorizer object itself.
vectorizer = CountVectorizer().fit_transform(edition_documents)
vectors = vectorizer.toarray()

# Pairwise similarity matrix; entry [0, 1] compares the first document (1725)
# with the second (1728).
cosine_sim = cosine_similarity(vectors)
print("Cosine Similarity:", cosine_sim[0, 1])


Cosine Similarity: 0.9988749921370038


In [9]:
list1, list2 = character_dialogues_1725, character_dialogues_1728

# Compare the editions as sets of whole speech strings: a speech counts as
# shared only when it is character-for-character identical in both.
set1, set2 = set(list1), set(list2)

shared_speeches = set1 & set2
all_speeches = set1 | set2
jaccard_similarity = len(shared_speeches) / len(all_speeches)
print("Jaccard Similarity:", jaccard_similarity)


Jaccard Similarity: 0.23669623059866962


In [12]:
from fuzzywuzzy import fuzz

# Speech lists for the two editions being compared.
list1 = character_dialogues_1725
list2 = character_dialogues_1728

# Define a threshold for similarity (e.g., 80% similarity)
# NOTE(review): the visible call to similarity_threshold in a later cell
# passes 25 rather than this variable — confirm which value is intended.
threshold = 80
def similarity_threshold(list1, list2, threshold,
                         similarities_path="clean_files/similarities1.txt",
                         differences_path="clean_files/differences1.txt"):
    """Pair up similar strings from two lists and write the results to disk.

    Each string in ``list1`` is compared with the strings of ``list2`` in
    order, using ``fuzz.ratio``. The first ``list2`` string scoring at least
    ``threshold`` is recorded as its match; a ``list1`` string with no such
    match is recorded as a difference. Each ``list2`` string that no ``list1``
    string matches at the threshold is also recorded as a difference.

    Parameters
    ----------
    list1, list2 : list of str
        The two collections of strings to compare.
    threshold : int or float
        Minimum ``fuzz.ratio`` score for two strings to count as similar.
    similarities_path : str, optional
        Output file for matched pairs, one
        ``"<str1> and <str2> (Similarity: <score>%)"`` line per pair.
    differences_path : str, optional
        Output file for unmatched strings, one per line (``list1`` entries
        first, then ``list2`` entries).

    Returns
    -------
    None
        The results are only written to the two files.
    """
    # Local import so this cell stays runnable on its own.
    import os

    similarities = []
    differences = []

    # Pass 1: first sufficiently similar list2 entry for every list1 entry.
    for str1 in list1:
        for str2 in list2:
            similarity_score = fuzz.ratio(str1, str2)
            if similarity_score >= threshold:
                similarities.append((str1, str2, similarity_score))
                break
        else:
            # The inner loop finished without a match.
            differences.append(str1)

    # Pass 2: list2 entries that nothing in list1 resembles. The argument
    # order of fuzz.ratio is kept as (list1 item, list2 item) in both passes.
    for str2 in list2:
        if not any(fuzz.ratio(str1, str2) >= threshold for str1 in list1):
            differences.append(str2)

    # Make sure the output folders exist instead of failing on a fresh checkout.
    for output_path in (similarities_path, differences_path):
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8 so non-ASCII characters in the speeches can always be written.
    with open(similarities_path, "w", encoding="utf-8") as similarities_file:
        for str1, str2, score in similarities:
            similarities_file.write(f"{str1} and {str2} (Similarity: {score}%)\n")

    with open(differences_path, "w", encoding="utf-8") as differences_file:
        for diff in differences:
            differences_file.write(f"{diff}\n")


In [14]:
# Speech lists for the two editions being compared.
list1 = character_dialogues_1725
list2 = character_dialogues_1728

# Run the comparison with a threshold of 25; the function writes its results
# to the similarities/differences text files as a side effect.
similarity_threshold(list1,list2,25)