1st idea: Measuring similarity between pairs of sentences in a web page

In [None]:
import html
import json
import os
import webbrowser
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK sentence-tokenizer data if not already downloaded.
# Older NLTK releases read the pickled 'punkt' models, while recent releases
# (3.9+) make sent_tokenize() load the 'punkt_tab' resource instead and raise
# LookupError without it, so fetch both to work on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

# List of file paths (UTF-8 text files whose sentences are compared)
file_paths = [
    "C:\\Users\\BCC\\OneDrive\\Desktop\\ba.txt", 
    "C:\\Users\\BCC\\OneDrive\\Desktop\\ba2.txt"
]

def read_text_files(file_paths):
    """Return the full contents of every file in ``file_paths``.

    Each file is opened in text mode and decoded as UTF-8. The returned list
    has one string per path, in the same order as ``file_paths``.
    """
    def _slurp(path):
        # Read one file completely and close it before moving on.
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read()

    return [_slurp(path) for path in file_paths]

def tokenize_sentences(texts):
    """Split every text into sentences and return them as one flat list.

    Sentences keep the order of ``texts`` and, within each text, the order
    produced by ``sent_tokenize``.
    """
    return [sentence for text in texts for sentence in sent_tokenize(text)]

def calculate_similarity(sentences):
    """Compute the pairwise TF-IDF cosine similarity between sentences.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A square array in which entry ``[i, j]`` is the cosine similarity
        between the TF-IDF vectors of sentences ``i`` and ``j``. An empty
        list or tuple yields an empty ``(0, 0)`` array.

    Raises:
        ValueError: Propagated from ``TfidfVectorizer`` when no vocabulary
            can be built from the given sentences.
    """
    if isinstance(sentences, (list, tuple)) and not sentences:
        # TfidfVectorizer cannot be fitted on an empty corpus; returning an
        # empty matrix lets the rest of the pipeline produce an empty report.
        return np.empty((0, 0))

    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    # cosine_similarity accepts the sparse TF-IDF matrix directly and returns
    # a dense array by default, so no intermediate dense copy is needed.
    return cosine_similarity(tfidf_matrix)

def generate_colors(similarity_matrix):
    """Pick one colour per sentence from the ``Blues`` colormap.

    The mean of each row of ``similarity_matrix`` is used as the colormap
    input, and only the first three components (RGB) of the resulting colour
    are kept. Returns a list with one colour per row.
    """
    return [
        plt.cm.Blues(np.mean(row))[:3]  # Use only RGB channels
        for row in similarity_matrix
    ]

def calculate_mean_similarity(similarity_matrix):
    """Return the mean of each row of ``similarity_matrix``.

    The result has one value per row: the average similarity of that
    sentence to every sentence in the matrix (itself included).
    """
    return np.mean(similarity_matrix, axis=1)

def sentences_to_html(sentences, colors, similarity_matrix, mean_similarities):
    """Build an interactive HTML page that colour-codes sentences by similarity.

    The page lists the mean similarity of every sentence, then shows each
    sentence on its own line with its background colour. Hovering a sentence
    shows its mean similarity in a floating box and highlights every other
    sentence whose similarity to it is greater than 0.5.

    Args:
        sentences: Sentence strings in display order. They are HTML-escaped
            before being embedded, so markup characters show up literally.
        colors: One RGB triple of floats per sentence; each channel is
            scaled by 255 and written as a two-digit hex byte, so values
            are expected to lie in 0.0-1.0.
        similarity_matrix: Square array-like; entry ``[i][j]`` is the
            similarity between sentences ``i`` and ``j``.
        mean_similarities: Array-like with one mean similarity per sentence.

    Returns:
        The complete HTML document as a string.
    """
    # json.dumps emits valid JavaScript literals (str() on a Python list
    # would write ``nan``/``inf``, which JavaScript does not define), and
    # np.asarray lets plain nested lists be passed as well as ndarrays.
    mean_values = np.asarray(mean_similarities).tolist()
    mean_js = json.dumps(mean_values)
    matrix_js = json.dumps(np.asarray(similarity_matrix).tolist())

    # The charset declaration matters because sentences may contain
    # non-ASCII text and the page is opened straight from disk, where no
    # HTTP header tells the browser which encoding to use.
    page_head = '''
    <html>
    <head>
    <meta charset="utf-8">
    <style>
    .sentence { display: block; padding: 2px; margin: 2px 0; cursor: pointer; }
    .highlighted { background-color: yellow !important; } /* Highlight color set to yellow */
    #similarity-box { 
        display: none; 
        position: fixed; 
        top: 10px; 
        left: 50%; 
        transform: translateX(-50%); 
        padding: 10px; 
        border: 1px solid #000; 
        background-color: #fff; 
        z-index: 1000;
    }
    </style>
    <script>
    // Similarity data is defined once at load time instead of being rebuilt
    // on every mouseover event.
    var meanSimilarities = ''' + mean_js + ''';
    var similarities = ''' + matrix_js + ''';

    function showSimilarity(event, sentenceIndex) {
        var similarityBox = document.getElementById('similarity-box');
        var similarityText = '<strong>Mean Similarity:</strong> ' + meanSimilarities[sentenceIndex].toFixed(2);
        similarityBox.innerHTML = similarityText;
        similarityBox.style.top = (event.clientY + 10) + 'px';
        similarityBox.style.left = (event.clientX + 10) + 'px';
        similarityBox.style.display = 'block';

        var sentences = document.getElementsByClassName('sentence');
        for (var i = 0; i < sentences.length; i++) {
            sentences[i].classList.remove('highlighted');
        }
        for (var i = 0; i < similarities[sentenceIndex].length; i++) {
            if (similarities[sentenceIndex][i] > 0.5 && i !== sentenceIndex) { // Adjust threshold as needed
                sentences[i].classList.add('highlighted');
            }
        }
    }

    document.onmousemove = function(event) {
        if (!event.target.classList.contains('sentence')) {
            document.getElementById('similarity-box').style.display = 'none';
            var sentences = document.getElementsByClassName('sentence');
            for (var i = 0; i < sentences.length; i++) {
                sentences[i].classList.remove('highlighted');
            }
        }
    };
    </script>
    </head>
    <body style="font-family: Arial, sans-serif; line-height: 1.5;">
    <div id="similarity-box"></div>
    <div>
    <strong>Mean Similarities:</strong><br>'''

    # Summary block: one "Sentence N: x.xx" row per sentence.
    summary_rows = ''.join(
        f'Sentence {idx + 1}: {mean_similarity:.2f}<br>'
        for idx, mean_similarity in enumerate(mean_values)
    )

    sentence_spans = []
    for idx, (sentence, color) in enumerate(zip(sentences, colors)):
        color_hex = '#%02x%02x%02x' % tuple(int(c * 255) for c in color)
        # Escape the text so characters such as '<' and '&' are displayed
        # literally instead of being parsed as markup.
        safe_text = html.escape(str(sentence))
        sentence_spans.append(
            f'<span class="sentence" style="background-color:{color_hex};" '
            f'onmouseover="showSimilarity(event, {idx})">{safe_text}</span>\n'
        )

    return ''.join(
        [page_head, summary_rows, '</div><br>', *sentence_spans, '</body></html>']
    )

# Read the input files, split them into sentences, score the pairwise
# similarity, and render the interactive HTML report.
texts = read_text_files(file_paths)
sentences = tokenize_sentences(texts)
cosine_matrix = calculate_similarity(sentences)
colors = generate_colors(cosine_matrix)
mean_similarities = calculate_mean_similarity(cosine_matrix)
html_output = sentences_to_html(sentences, colors, cosine_matrix, mean_similarities)

# Save the HTML output (relative to the current working directory)
output_file_path = 'output.html'
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.write(html_output)

# Open the HTML file in the default web browser.
# Path.as_uri() builds a well-formed, percent-encoded file:// URL; simply
# prefixing 'file://' to an absolute path yields 'file://C:\...' on Windows
# and leaves spaces unencoded.
report_uri = Path(output_file_path).resolve().as_uri()

# webbrowser.open() reports whether a browser was launched, so only claim
# success when it actually was.
if webbrowser.open(report_uri):
    print("HTML file generated and opened successfully.")
else:
    print(f"HTML file generated at {report_uri}, but no web browser could be opened.")
