In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
### using cosine similarity 

In [3]:
def calculate_similarity(file1_path, file2_path):
    """Return the cosine similarity of two text files as a percentage.

    Each file is lower-cased and split into sentences with ``sent_tokenize``.
    Every distinct sentence found in either file becomes one feature, and
    each file is represented by how many times it contains each sentence.
    Only sentences that match exactly (after lower-casing) contribute to the
    score; no word-level comparison is made.

    Args:
        file1_path: Path of the first UTF-8 text file.
        file2_path: Path of the second UTF-8 text file.

    Returns:
        float: Cosine similarity of the two sentence-count vectors multiplied
        by 100. ``0.0`` when both files are empty or whitespace-only.

    Raises:
        OSError: If either file cannot be opened (e.g. ``FileNotFoundError``).
    """
    with open(file1_path, 'r', encoding='utf-8') as f:
        text1 = f.read()

    with open(file2_path, 'r', encoding='utf-8') as f:
        text2 = f.read()

    # Two blank files contain no sentences, which would produce a matrix with
    # zero feature columns; cosine_similarity rejects such input, so report
    # "no measurable similarity" instead of crashing.
    if not text1.strip() and not text2.strip():
        return 0.0

    # Sentence-level tokens: the unit of comparison is a whole sentence.
    sentence_counts1 = Counter(sent_tokenize(text1.lower()))
    sentence_counts2 = Counter(sent_tokenize(text2.lower()))

    # One shared, fixed column order keeps both count vectors aligned.
    vocabulary = sorted(set(sentence_counts1) | set(sentence_counts2))

    # Counter yields 0 for sentences a file does not contain.
    count_matrix = np.array([
        [sentence_counts1[sentence] for sentence in vocabulary],
        [sentence_counts2[sentence] for sentence in vocabulary],
    ])

    # cosine_similarity returns the full 2x2 matrix; [0][1] is file1 vs file2.
    return float(cosine_similarity(count_matrix)[0][1] * 100)

In [4]:
# Keyword-based approach: score keyword_output1.txt against the source text.
similarity_percentage = calculate_similarity(
    file1_path="pure_test_unstructured.txt",
    file2_path="keyword_output1.txt",
)
print(f"The similarity between the two files is {similarity_percentage:.2f}%")

FileNotFoundError: [Errno 2] No such file or directory: 'keyword_output1.txt'

In [None]:
# Regular-expression-based approach: score rule_output1.txt against the source text.
similarity_percentage = calculate_similarity(
    file1_path="pure_test_unstructured.txt",
    file2_path="rule_output1.txt",
)
print(f"The similarity between the two files is {similarity_percentage:.2f}%")

In [None]:
# Deep-learning-based approach: score dl_output1.txt against the source text.
similarity_percentage = calculate_similarity(
    file1_path="pure_test_unstructured.txt",
    file2_path="dl_output1.txt",
)
print(f"The similarity between the two files is {similarity_percentage:.2f}%")

In [None]:
### Jaccard similarity

In [None]:
def calculate_similarity(file1_path, file2_path, num_clusters=2):
    """Return a cluster-wise Jaccard similarity of two text files as a percentage.

    The lines of both files are pooled, vectorised with TF-IDF and grouped
    with K-means. Within each cluster the Jaccard index is computed between
    the set of distinct lines that occur in the first file and the set that
    occur in the second; the result is the mean of those per-cluster indices
    multiplied by 100. Lines only count as shared when their text is
    identical.

    NOTE: this definition reuses the name of the cosine-based
    ``calculate_similarity`` defined earlier in the notebook and replaces it
    once this cell has run.

    Args:
        file1_path: Path of the first UTF-8 text file.
        file2_path: Path of the second UTF-8 text file.
        num_clusters: Number of K-means clusters to request (default 2). It
            is capped at the total number of lines, because K-means cannot
            form more clusters than it has samples.

    Returns:
        float: Mean per-cluster Jaccard index times 100. ``0.0`` when neither
        file contains any non-blank line.

    Raises:
        OSError: If either file cannot be opened (e.g. ``FileNotFoundError``).
    """
    # Strip the line terminator so that a final line without a trailing
    # newline still equals the same text appearing mid-file elsewhere.
    with open(file1_path, 'r', encoding='utf-8') as file1:
        file1_data = [line.rstrip('\n') for line in file1]

    with open(file2_path, 'r', encoding='utf-8') as file2:
        file2_data = [line.rstrip('\n') for line in file2]

    combined_data = file1_data + file2_data

    # Nothing but blank lines (or no lines at all): there is no text to
    # vectorise, so report "no measurable similarity" instead of failing.
    if not any(line.strip() for line in combined_data):
        return 0.0

    tfidf_matrix = TfidfVectorizer().fit_transform(combined_data)

    # K-means needs at least as many samples as clusters.
    n_clusters = min(num_clusters, len(combined_data))
    labels = KMeans(n_clusters=n_clusters, random_state=42).fit(tfidf_matrix).labels_

    # Map each distinct line to a cluster label; labels are aligned with
    # combined_data (file 1 lines first, then file 2), and for a repeated
    # line the last occurrence wins.
    line_to_cluster = dict(zip(combined_data, labels))

    # Sets give O(1) membership instead of scanning the line lists each time.
    file1_lines = set(file1_data)
    file2_lines = set(file2_data)

    jaccard_similarities = []
    for cluster_label in set(labels):
        members = {
            line for line, label in line_to_cluster.items()
            if label == cluster_label
        }
        from_file1 = members & file1_lines
        from_file2 = members & file2_lines

        union_size = len(from_file1 | from_file2)
        if union_size == 0:
            # No line maps to this cluster any more; skip rather than divide by zero.
            continue
        jaccard_similarities.append(len(from_file1 & from_file2) / union_size)

    if not jaccard_similarities:
        return 0.0

    # Average over clusters and express as a percentage.
    return float(np.mean(jaccard_similarities) * 100)

In [None]:
# Keyword-based approach: score keyword_output1.txt against the source text.
similarity = calculate_similarity(
    file1_path="pure_test_unstructured.txt",
    file2_path="keyword_output1.txt",
)
print(f"The similarity between the files is {similarity:.2f}%")

In [None]:
# Regular-expression-based approach: score rule_output1.txt against the source text.
similarity = calculate_similarity(
    file1_path="pure_test_unstructured.txt",
    file2_path="rule_output1.txt",
)
print(f"The similarity between the files is {similarity:.2f}%")

In [None]:
# Deep-learning-based approach: score dl_output1.txt against the source text.
similarity = calculate_similarity(
    file1_path="pure_test_unstructured.txt",
    file2_path="dl_output1.txt",
)
print(f"The similarity between the files is {similarity:.2f}%")