In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


loss calculation

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Lambda, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define Siamese network architecture
def create_siamese_network(input_shape, embedding_dim, vocab_size):
    input_text = Input(shape=input_shape)
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
    lstm_layer = LSTM(128)
    encoded_text = lstm_layer(embedding_layer(input_text))
    return Model(inputs=input_text, outputs=encoded_text)

# Contrastive loss function
def contrastive_loss(y_true, y_pred, margin=1):
    square_pred = K.square(y_pred)
    margin_square = K.square(K.maximum(margin - y_pred, 0))
    return K.mean(y_true * square_pred + (1 - y_true) * margin_square)

# Define inputs
max_length= 50
input_shape = (max_length,)
embedding_dim = 100
vocab_size = 10000

# Create Siamese network
siamese_network = create_siamese_network(input_shape, embedding_dim, vocab_size)



# input_text = tf.convert_to_tensor(["I am having fever"], dtype=tf.string)  # Shape: (1,)
# positive_texts = tf.convert_to_tensor(["I have fever", "I have insomnia", "Should I take medicine?"], dtype=tf.string)  # Shape: (3,)
# negative_texts = tf.convert_to_tensor(["I take medicine", "I feel exhausted", "Heart failure is very common between elderly people"], dtype=tf.string)  # Shape: (3,)

# Define your texts
texts = ["I am having fever",
         "I have fever",
         "I have insomnia",
         "Should I take medicine?",
         "I take medicine",
         "I feel exhausted",
         "Heart failure is very common between elderly people"]



# Initialize and fit the tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

# Convert texts to sequences of word indices
sequences = tokenizer.texts_to_sequences(texts)

# Pad sequences to the same length
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Now you can pass these sequences to your model
input_text = padded_sequences[0:1]  # First sentence
positive_texts = padded_sequences[1:4]  # Next three sentences
negative_texts = padded_sequences[4:]  # Last three sentences


# Calculate embeddings
text_embedding = siamese_network(input_text)
positive_embeddings = siamese_network(positive_texts)
negative_embeddings = siamese_network(negative_texts)

# Calculate distances
positive_distances = Lambda(lambda tensors: K.sqrt(K.sum(K.square(tensors[0] - tensors[1]), axis=1)))([text_embedding, positive_embeddings])
negative_distances = Lambda(lambda tensors: K.sqrt(K.sum(K.square(tensors[0] - tensors[1]), axis=1)))([text_embedding, negative_embeddings])

# Define label (1 for positive set, 0 for negative set)
positive_label = tf.ones_like(positive_distances)
negative_label = tf.zeros_like(negative_distances)

# Compute contrastive loss
positive_loss = contrastive_loss(positive_label, positive_distances)
negative_loss = contrastive_loss(negative_label, negative_distances)

print("Contrastive Loss (Positive Set):", positive_loss.numpy())
print("Contrastive Loss (Negative Set):", negative_loss.numpy())


Contrastive Loss (Positive Set): 1.1520394e-09
Contrastive Loss (Negative Set): 0.9998476


In [None]:
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
import os

def extract_text_from_directory(directory):
    positive_array = []

    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath) and filename.endswith('.txt'):
            with open(filepath, 'r') as file:
                text = file.read()
                positive_array.append(text)

    return positive_array

def process_folders(root_directory):
    positive_texts = []

    for foldername in range(0,1000):  # Assuming folders are named from 1 to 10
        folderpath = os.path.join(root_directory, str(foldername))
        if os.path.isdir(folderpath):
            texts_in_folder = extract_text_from_directory(folderpath)
            positive_texts.extend(texts_in_folder)

    return positive_texts

root_directory = '/content/drive/MyDrive/Colab_Notebooks/Positive'  # Change this to your actual directory path
positive_texts = process_folders(root_directory)


print(positive_texts)



In [None]:
import os

def extract_text_from_directory(directory):
    negative_array = []

    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath) and filename.endswith('.txt'):
            with open(filepath, 'r') as file:
                text = file.read()
                negative_array.append(text)

    return negative_array

def process_folders(root_directory):
    negative_texts = []

    for foldername in range(0,1000):  # Assuming folders are named from 1 to 10
        folderpath = os.path.join(root_dir, str(foldername))
        if os.path.isdir(folderpath):
            texts_in_folder = extract_text_from_directory(folderpath)
            negative_texts.extend(texts_in_folder)

    return negative_texts

root_dir = '/content/drive/MyDrive/Colab_Notebooks/Negative'  # Change this to your actual directory path
negative_texts = process_folders(root_dir)


print(negative_texts)

['Who manufactures familial dysautonomia?', 'Daytrana Who manufactures cetirizine?', 'Who manufactures cetirizine? skin creams', 'Who manufactures paralysis?', "sheehan's syndrome Who manufactures bromocriptine?", 'Who manufactures bromocriptine? assistive devices', 'Who makes documented, and where can I buy it?', 'eating Who makes nulytely, and where can I buy it?', 'Who makes nulytely, and where can I buy it? anosmia', "Where can I get genetic testing for william's syndrome?", "Where can I get amyotrophic lateral sclerosis for william's syndrome?", "Alkaline Phosphatase Where can I get genetic testing for william's syndrome?", "Where can I get genetic testing for william's syndrome? primary biliary cholangitis", 'Where can I get genetic testing for multiple myeloma, and what is the cost?', 'Where can I get genetic testing for gender, and what is the cost?', 'lumps Where can I get genetic testing for multiple myeloma, and what is the cost?', 'Where can I get genetic testing for multip

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Lambda, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity




# Define Siamese network architecture
def create_siamese_network(input_shape, embedding_dim, vocab_size):
    input_text = Input(shape=input_shape)
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
    lstm_layer = LSTM(128)
    encoded_text = lstm_layer(embedding_layer(input_text))
    return Model(inputs=input_text, outputs=encoded_text)

# Contrastive loss function
def contrastive_loss(y_true, y_pred, margin=1):
    square_pred = K.square(y_pred)
    margin_square = K.square(K.maximum(margin - y_pred, 0))
    return K.mean(y_true * square_pred + (1 - y_true) * margin_square)

# Define inputs
max_length= 50
input_shape = (max_length,)
embedding_dim = 100
vocab_size = 10000

# Create Siamese network
siamese_network = create_siamese_network(input_shape, embedding_dim, vocab_size)
predicted_text= ["Can you recommend a diet pill or shake that can help me lose 50 pounds in 2 months while working 16 hours a day?"]
positive_set = positive_texts
negative_set=  negative_texts


predicted_text_tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
predicted_text_tokenizer.fit_on_texts(predicted_text)
positive_tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
positive_tokenizer.fit_on_texts(positive_set)
negative_tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
negative_tokenizer.fit_on_texts(negative_set)

predicted_sequence= predicted_text_tokenizer.texts_to_sequences(predicted_text)
positive_sequence= positive_tokenizer.texts_to_sequences(positive_set)
negative_sequence= negative_tokenizer.texts_to_sequences(negative_set)

predicted_padded_sequences = pad_sequences(predicted_sequence, maxlen=max_length, padding='post')
positive_padded_sequences = pad_sequences(positive_sequence, maxlen=max_length, padding='post')
negative_padded_sequences = pad_sequences(negative_sequence, maxlen=max_length, padding='post')

text_embedding = siamese_network(predicted_padded_sequences)
positive_embeddings = siamese_network(positive_padded_sequences)
negative_embeddings = siamese_network(negative_padded_sequences)


# Calculate distances
positive_distances = Lambda(lambda tensors: K.sqrt(K.sum(K.square(tensors[0] - tensors[1]), axis=1)))([text_embedding, positive_embeddings])
negative_distances = Lambda(lambda tensors: K.sqrt(K.sum(K.square(tensors[0] - tensors[1]), axis=1)))([text_embedding, negative_embeddings])

# Define label (1 for positive set, 0 for negative set)
positive_label = tf.ones_like(positive_distances)
negative_label = tf.zeros_like(negative_distances)

# Compute contrastive loss
positive_loss = contrastive_loss(positive_label, positive_distances)
negative_loss = contrastive_loss(negative_label, negative_distances)


print("Contrastive Loss (Positive Set):", positive_loss.numpy())
print("Contrastive Loss (Negative Set):", negative_loss.numpy())


Contrastive Loss (Positive Set): 0.00078606146
Contrastive Loss (Negative Set): 0.9977978


In [None]:
# Calculate cosine similarity between predicted text and positive set
cos_sim_pos = cosine_similarity(text_embedding, positive_embeddings)

# Calculate cosine similarity between predicted text and negative set
cos_sim_neg = cosine_similarity(text_embedding, negative_embeddings)

print("Cosine similarity with positive set:", cos_sim_pos)
print("Cosine similarity with negative set:", cos_sim_neg)

Cosine similarity with positive set: [[0.99999994 1.0000001  1.0000001 ]]
Cosine similarity with negative set: [[0.99999994 1.         1.        ]]
