In [49]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, LSTM, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

In [50]:
import numpy as np


texts = [
    ["Soccer is a popular sport", "Basketball is a team sport"],
    ["The Eiffel Tower is in Paris", "The Statue of Liberty is in New York"],
    ["Mangoes are a tropical fruit", "Pineapples are also tropical fruits"],
    ["Hiking in the mountains is refreshing", "Swimming in the ocean is relaxing"],
    ["Computer science involves coding", "Biology focuses on living organisms"],
    ["The Earth orbits the Sun", "The Moon orbits the Earth"],
    ["Cats have whiskers", "Snakes are limbless reptiles"],
    ["Writing code can be challenging", "Reading books is a leisurely activity"],
    ["Chemistry deals with chemical reactions", "Physics studies the laws of nature"],
    ["Mount Everest is the world's tallest peak", "K2 is also a tall mountain"],
    ["Artists create visual masterpieces", "Musicians compose melodious tunes"],
    ["Summer is hot and sunny", "Winter is cold and snowy"],
    ["Insects have six legs", "Spiders are arachnids with eight legs"],
    ["London is the capital of England", "Tokyo is the capital of Japan"],
    ["Learning a new language is a valuable skill", "Cooking delicious food is an art"],
    ["Elephants are large mammals", "Kangaroos are marsupials"],
    ["Rivers flow downstream to the sea", "Waterfalls are breathtaking natural wonders"],
    ["Astronomy explores celestial objects", "Geology studies the Earth's structure"],
    ["Singing requires vocal talent", "Dancing showcases body movements"],
    ["Gardening is a peaceful hobby", "Extreme sports offer adrenaline rushes"]
]
labels = [1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
vocab_size = len(tokenizer.word_index) + 1

sequences = tokenizer.texts_to_sequences(texts)
max_seq_length = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_seq_length)

input_pairs = [(sequences[i], sequences[j]) for i in range(len(texts)) for j in range(len(texts)) if i != j]
labels = np.array(labels * (len(texts) - 1))  

indices = np.arange(len(input_pairs))
np.random.shuffle(indices)
input_pairs = np.array(input_pairs)[indices]
labels = labels[indices]


In [51]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K


input_a = Input(shape=(max_seq_length,))
input_b = Input(shape=(max_seq_length,))

embedding_layer = Embedding(input_dim=vocab_size, output_dim=128)

encoded_a = embedding_layer(input_a)
encoded_b = embedding_layer(input_b)

def cosine_similarity(vectors):
    x, y = vectors
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return K.sum(x * y, axis=-1)

similarity_layer = Lambda(cosine_similarity, output_shape=(1,))([encoded_a, encoded_b])

siamese_model = Model(inputs=[input_a, input_b], outputs=similarity_layer)


In [52]:

siamese_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

In [53]:

siamese_model.fit([input_pairs[:, 0], input_pairs[:, 1]], labels, epochs=10, batch_size=32)


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x184d17726e0>

In [54]:
ai_definition = "Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks requiring human intelligence. These tasks include learning from experience (machine learning), understanding natural language, recognizing patterns, and solving problems. AI can be broadly categorized into Narrow AI (specialized for specific tasks) and General AI (possessing human-like cognitive abilities across a range of tasks)."

sl_definition = "Supervised Learning is a type of machine learning where the algorithm is trained on a labeled dataset. In supervised learning, the algorithm is provided with input-output pairs, where the input data is labeled with the corresponding correct output. The algorithm learns to map the input data to the correct output by generalizing from the labeled examples. The goal is to make predictions or classifications on new, unseen data. Examples of supervised learning tasks include image classification, speech recognition, and regression analysis."

ul_definition = "Unsupervised Learning is a type of machine learning where the algorithm is given input data without explicit instructions on what to do with it. The system tries to learn the patterns and structure from the data without labeled outputs. The goal is often to discover hidden patterns, relationships, or groupings within the data. Clustering and dimensionality reduction are common tasks in unsupervised learning. Examples include clustering similar documents, identifying topics in a collection of articles, and reducing the dimensionality of data for visualization."


In [55]:
ai_definition_updated = "Artificial Intelligence (AI) encompasses the development of computer systems capable of emulating human intelligence. This includes tasks like learning from experience (machine learning), comprehending natural language, identifying patterns, and solving problems. AI is broadly categorized into Narrow AI (specialized tasks) and General AI (human-like cognitive abilities)."

sl_definition_updated = "In Supervised Learning, algorithms are trained on labeled datasets. Input-output pairs are provided, where input data is labeled with correct outputs. Algorithms learn to generalize from these labeled examples, making predictions or classifications on new, unseen data. Examples include image classification, speech recognition, and regression analysis."

ul_definition_updated = "Unsupervised Learning involves algorithms processing unlabeled data without explicit guidance. The goal is to discover patterns or relationships within the data. Common tasks include clustering similar data points and reducing dimensionality for visualization. Examples include grouping documents, identifying topics, and exploring data structure."


In [56]:
# Example new text pair
new_text_pair = [ai_definition, ai_definition_updated]


# Preprocess the new text pair (tokenization and padding)
new_sequences = tokenizer.texts_to_sequences(new_text_pair)
new_sequences = pad_sequences(new_sequences, maxlen=max_seq_length)

# Make predictions
similarity_score = siamese_model.predict([new_sequences[0], new_sequences[1]])

# Print the similarity score
print(f"Similarity Score: {similarity_score[0]}")

# You can define a threshold to decide if the texts are similar or dissimilar
threshold = 0.5  # Adjust this threshold as needed
if similarity_score[0] > threshold:
    print("The texts are similar.")
else:
    print("The texts are dissimilar.")


Similarity Score: 1.0
The texts are similar.


In [57]:
validation_pairs = [
    ("I love cats", "I adore felines"),
    ("Dogs are loyal animals", "Cats are independent creatures"),
    ("Apples are red", "Bananas are yellow"),
    ("Pizza is delicious", "Ice cream is sweet"),
    ("Python is a programming language", "Java is also a programming language"),
    ("The sun rises in the east", "The moon shines at night"),
]


validation_labels = [1, 0, 0, 1, 1, 0]

In [58]:
validation_sequences = tokenizer.texts_to_sequences([pair[0] for pair in validation_pairs])
validation_sequences = pad_sequences(validation_sequences, maxlen=max_seq_length)
validation_sequences_2 = tokenizer.texts_to_sequences([pair[1] for pair in validation_pairs])
validation_sequences_2 = pad_sequences(validation_sequences_2, maxlen=max_seq_length)

similarity_scores = siamese_model.predict([validation_sequences, validation_sequences_2])



In [59]:
threshold = 0.5

true_positives = 0
true_negatives = 0
false_positives = 0
false_negatives = 0


for i in range(len(validation_labels)):
    if validation_labels[i] == 1:
        if similarity_scores[i][0] >= threshold:
            true_positives += 1
        else:
            false_negatives += 1
    else:
        if similarity_scores[i][0] >= threshold:
            false_positives += 1
        else:
            true_negatives += 1


precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1_score = 2 * (precision * recall) / (precision + recall)

print(f"F1 Score: {f1_score:.2f}")


F1 Score: 0.67
