In [1]:
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.colors as mcolors
import matplotlib.cm as cm

In [2]:
# This function reads and tokenizes sentences

def read_text_files(file_paths):
    sentences = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            sentences.extend(sent_tokenize(text))
    return sentences

In [3]:
# This function calculates the similarity between a list of sentences using TF-IDF vectorization and cosine similarity

def calculate_similarity(sentences):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix


In [4]:
# This function maps similarity values to colors using a colormap and returns the corresponding hex color codes

def map_similarity_to_color(similarity_matrix):
    norm = mcolors.Normalize(vmin=0, vmax=1)
    colormap = cm.get_cmap('hsv')
    
    colors = []
    for i in range(len(similarity_matrix)):
        avg_similarity = np.mean(similarity_matrix[i])
        color = colormap(norm(avg_similarity))
        colors.append(mcolors.rgb2hex(color[:3]))
    return colors


In [5]:
# This function takes a list of sentences and a list of colors, 
# and generates an HTML content with each sentence displayed with the corresponding background color

def generate_html(sentences, colors):
    html_content = "<html><body>"
    for sentence, color in zip(sentences, colors):
        html_content += f'<span style="background-color: {color};">{sentence} </span>'
    html_content += "</body></html>"
    return html_content

# This function saves the generated HTML content to a specified output path

def save_html(html_content, output_path):
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(html_content)


In [6]:
def main(file_paths, output_path):
    sentences = read_text_files(file_paths)
    similarity_matrix = calculate_similarity(sentences)
    colors = map_similarity_to_color(similarity_matrix)
    html_content = generate_html(sentences, colors)
    save_html(html_content, output_path)

In [7]:
file_paths = ['C:\\Users\\LENOVO\\Desktop\\file1.txt', 'C:\\Users\\LENOVO\\Desktop\\file2.txt']  
output_path = 'C:\\Users\\LENOVO\\Desktop\\Output.html'
main(file_paths, output_path)

  colormap = cm.get_cmap('hsv')
