In [None]:
# Install third-party dependencies for this notebook into the kernel's environment (run once per environment).
%pip install matplotlib transformers numpy sentence_transformers annoy nltk spacy graphviz

In [None]:
# Use the interpreter that runs this kernel so the package and model land in the same environment.
import sys
!{sys.executable} -m pip install spacy
# Download the small English pipeline that split_into_sentences loads with spacy.load('en_core_web_sm').
!{sys.executable} -m spacy download en_core_web_sm

In [44]:
from sentence_transformers import SentenceTransformer
from collections import OrderedDict
from annoy import AnnoyIndex
import re
import nltk
import spacy

def read_file(file_path, encoding='utf-8'):
    """Return the entire contents of a text file as a single string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding
            (the input text contains non-ASCII characters such as curly
            quotes and the trademark sign).

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()

def pre_process(sentences):
    """Normalize whitespace in each sentence, then drop blanks and duplicates.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space and leading/trailing whitespace is removed. Sentences that end up
    empty are discarded, and only the first occurrence of each remaining
    sentence is kept, in its original order.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list of unique, non-empty, whitespace-normalized sentences.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    normalized = (re.sub(r'\s+', ' ', sentence).strip() for sentence in sentences)
    # dict keeps insertion order, so fromkeys de-duplicates while preserving
    # first-seen order; the filter removes whitespace-only sentences.
    return [sentence for sentence in dict.fromkeys(normalized) if sentence]

def split_into_sentences_ntlk(text):
    """Split ``text`` into sentences with NLTK's English Punkt tokenizer.

    The Punkt data is downloaded only when it is not already available
    locally, so repeated calls do not trigger a download request each time.

    Args:
        text: The text to segment.

    Returns:
        The tokenizer's sentences after being passed through ``pre_process``.
    """
    try:
        # Raises LookupError when the resource is not installed locally.
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')  # Download the Punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(text)
    # Pre-process the sentences to standardize whitespace
    return pre_process(sentences)

def split_into_sentences(text):
    """Segment ``text`` into sentences using spaCy's ``en_core_web_sm`` pipeline.

    The pipeline is loaded on each call. The text of every detected sentence
    is stripped and the resulting list is passed through ``pre_process``.

    Args:
        text: The text to segment.

    Returns:
        The list returned by ``pre_process`` for the stripped sentence texts.
    """
    pipeline = spacy.load('en_core_web_sm')
    stripped = []
    for span in pipeline(text).sents:
        stripped.append(span.text.strip())
    return pre_process(stripped)

def generate_embeddings(sentences, model):
    """Encode ``sentences`` with ``model``.

    Args:
        sentences: The sentences to encode.
        model: Any object exposing an ``encode(sentences)`` method.

    Returns:
        Whatever ``model.encode(sentences)`` returns.
    """
    return model.encode(sentences)

def build_annoy_index(embeddings, file_name='annoy_index.ann', n_trees=10):
    """Build an angular-distance Annoy index over ``embeddings`` and save it.

    Item ``i`` of the index is ``embeddings[i]``.

    Args:
        embeddings: Non-empty sequence of equal-length vectors.
        file_name: Path the built index is written to.
        n_trees: Number of trees passed to ``AnnoyIndex.build``.

    Returns:
        The built ``AnnoyIndex``, so callers can query it without reloading
        the saved file.

    Raises:
        ValueError: If ``embeddings`` is empty (the vector dimension cannot be
            determined).
    """
    # len() works for both lists and numpy arrays; truth-testing an array does not.
    if len(embeddings) == 0:
        raise ValueError('embeddings is empty; cannot build an Annoy index')

    dimension = len(embeddings[0])
    annoy_index = AnnoyIndex(dimension, 'angular')
    for i, embedding in enumerate(embeddings):
        annoy_index.add_item(i, embedding)

    # Build the index
    annoy_index.build(n_trees)
    annoy_index.save(file_name)
    return annoy_index


In [45]:
# Input text, given as a path relative to the notebook's working directory
file_path = 'tom_sawyer.txt'
# Read the whole file into memory as one string
text = read_file(file_path)

# Split the text into sentences with the spaCy-based splitter (whitespace-normalized, de-duplicated)
sentences = split_into_sentences(text)

In [46]:
# Identifier of the pretrained sentence-transformers model to load
model_name = 'sentence-transformers/all-mpnet-base-v2'

# Load the transformer model
model = SentenceTransformer(model_name)

# Generate embeddings for the sentences; later cells index `embeddings` and `sentences` by the same position
embeddings = generate_embeddings(sentences, model)

In [47]:
# Build the Annoy index over the sentence embeddings and save it to the default file 'annoy_index.ann'
build_annoy_index(embeddings)

In [49]:
# Create an empty index with the embedding dimensionality and the same 'angular' metric used at build time
annoy_index = AnnoyIndex(embeddings[0].shape[0], 'angular')
annoy_index.load('annoy_index.ann')  # load saved index

True

In [50]:
def get_nearest_sentences(annoy_index, embeddings, num_nearest):
    """Collect the nearest neighbours of every indexed sentence.

    Args:
        annoy_index: Index exposing ``get_nns_by_item(i, n)`` and
            ``get_distance(i, j)``; item ``i`` corresponds to ``embeddings[i]``.
        embeddings: The indexed embeddings; only their count is used.
        num_nearest: Maximum number of neighbours to keep per sentence.

    Returns:
        A tuple ``(all_nearest_sentences, all_distances)``:
        ``all_nearest_sentences[i]`` lists up to ``num_nearest`` neighbour
        indices of sentence ``i`` (never ``i`` itself), and ``all_distances``
        is one flat list of the matching distances, in the same order as the
        concatenation of the neighbour lists.
    """
    all_nearest_sentences = []
    all_distances = []

    for i in range(len(embeddings)):
        # Request one extra result because an item is normally its own nearest neighbour.
        candidates = annoy_index.get_nns_by_item(i, num_nearest + 1)
        # The search is approximate, so the item itself is not guaranteed to be
        # in the result. Filtering (rather than list.remove, which raises
        # ValueError when i is absent) and capping keeps at most num_nearest
        # neighbours in either case.
        nearest_sentences = [j for j in candidates if j != i][:num_nearest]

        all_distances.extend(annoy_index.get_distance(i, j) for j in nearest_sentences)
        all_nearest_sentences.append(nearest_sentences)

    return all_nearest_sentences, all_distances

def plot_distance_distribution(all_distances, bins=30):
    """Show a histogram of neighbour distances.

    Args:
        all_distances: Flat sequence of distance values.
        bins: Number of histogram bins (passed to ``plt.hist``).
    """
    # Imported locally so the function works even if no earlier cell has
    # imported pyplot under the name `plt`.
    import matplotlib.pyplot as plt

    plt.hist(all_distances, bins=bins)
    plt.title('Distribution of distances to nearest sentences')
    plt.xlabel('Distance')
    plt.ylabel('Frequency')
    plt.show()
    
# Number of neighbours to retrieve per sentence
num_nearest = 42
# Per-sentence neighbour index lists, plus one flat list holding the corresponding distances
all_nearest_sentences, all_distances = get_nearest_sentences(annoy_index, embeddings, num_nearest)


In [None]:
import matplotlib.pyplot as plt

# Histogram of every sentence-to-neighbour distance in `all_distances`
plot_distance_distribution(all_distances)


In [None]:
# Flatten the per-sentence neighbour lists into (sentence, neighbour) index pairs.
# `all_distances` was accumulated in this same order, so position k in both lists
# refers to the same pair. Building the pairs explicitly avoids `k // num_nearest`
# arithmetic, which is only valid when every neighbour list has exactly
# `num_nearest` entries.
neighbor_pairs = [
    (sentence_index, nearest_sentence_index)
    for sentence_index, neighbors in enumerate(all_nearest_sentences)
    for nearest_sentence_index in neighbors
]
assert len(neighbor_pairs) == len(all_distances), "neighbour lists and distances are out of sync"

# Positions into `all_distances`, smallest distance (most similar pair) first
sorted_indexes = sorted(range(len(all_distances)), key=lambda k: all_distances[k])

# Print the most similar sentence pairs
num_examples = 150  # number of pairs to print
for i in sorted_indexes[:num_examples]:
    sentence_index, nearest_sentence_index = neighbor_pairs[i]

    print(f"Sentence: {sentences[sentence_index]}")
    print(f"Most similar sentence: {sentences[nearest_sentence_index]}")
    print(f"Distance: {all_distances[i]}")
    print("------------------------")

In [13]:
# Spot check: the sentence at position (2 % num_nearest) of sentence 1's neighbour list
sentences[all_nearest_sentences[1][2 % num_nearest]]

'• You comply with all other terms of this agreement for free\n        distribution of Project Gutenberg™ works.'

In [56]:
from graphviz import Digraph

def create_graph_file(sentences, all_nearest_sentences, annoy_index, threshold, filename='graph.dot'):
    """Write a directed sentence-similarity graph to ``filename``.

    Every sentence becomes a node named by its position in ``sentences`` and
    labelled with its text. An edge ``i -> j`` is added for each neighbour
    ``j`` listed in ``all_nearest_sentences[i]`` whose
    ``annoy_index.get_distance(i, j)`` is strictly below ``threshold``; the
    edge label is that distance as a string.

    Args:
        sentences: Sentence texts; position ``i`` is node ``i``.
        all_nearest_sentences: Per-sentence lists of neighbour indices.
        annoy_index: Index used to look up pairwise distances.
        threshold: Exclusive upper bound on the distance for drawing an edge.
        filename: Path the graph source is saved to.
    """
    graph = Digraph()

    # Nodes: one per sentence, keyed by its index.
    for node_id, text in enumerate(sentences):
        graph.node(str(node_id), text)

    # Edges: only neighbour pairs closer than the threshold.
    for source, neighbours in enumerate(all_nearest_sentences):
        for target in neighbours:
            dist = annoy_index.get_distance(source, target)
            if not dist < threshold:
                continue
            graph.edge(str(source), str(target), label=str(dist))

    graph.save(filename)

In [58]:
# Create the graph file (default name 'graph.dot'), keeping only edges with distance below 0.9
create_graph_file(sentences, all_nearest_sentences, annoy_index, threshold=0.9)


In [None]:
# For each sentence, plot its cosine similarity to a reference "happy" sentence.
# Uses `model` and `embeddings` created in the earlier cells.

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Define a happy sentence to compare with
happy_sentence = "I am very happy!"
happy_embedding = model.encode([happy_sentence])

# Cosine similarity between every sentence embedding and the happy embedding,
# computed in one vectorized call: the result has one row per sentence and a
# single column, which is flattened to a 1-D array.
# NOTE: despite its name, `distances` holds similarities (higher = closer);
# the name is kept because the following cell reads it.
distances = cosine_similarity(embeddings, happy_embedding)[:, 0]

# Plot the similarities in sentence order
plt.plot(distances)
plt.xlabel('Sentence Index')
plt.ylabel('Cosine Similarity to Happy Sentiment')
plt.title('Cosine Similarity Between Sentences and Happy Sentiment')
plt.show()


In [67]:
import numpy as np

# Work on an array copy of the scores so they can be ranked with argsort
distances_array = np.array(distances)

# argsort is ascending: reversing it puts the highest scores (nearest) first,
# while the unreversed head holds the lowest scores (farthest)
nearest_indices = np.argsort(distances_array)[::-1][:3]
farthest_indices = np.argsort(distances_array)[:3]

# Look up the sentence text for each selected position
nearest_sentences = [sentences[idx] for idx in nearest_indices]
farthest_sentences = [sentences[idx] for idx in farthest_indices]

# Print both groups, each under its own heading
for heading, group in (
    ("Top 3 Nearest Sentences:", nearest_sentences),
    ("\nTop 3 Farthest Sentences:", farthest_sentences),
):
    print(heading)
    for sentence in group:
        print(sentence)

Top 3 Nearest Sentences:
I’m satisfied with it.
Sid seemed satisfied.
The neighboring spectators shook with a gentle inward joy, several faces went behind fans and hand-kerchiefs, and Tom was entirely happy.

Top 3 Farthest Sentences:
She had sunk into a dreary apathy and would not be roused.
The choir always tittered and whispered all through service.
Their speed was slow, however, because pitfalls were somewhat common, and had to be guarded against.
