In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
dataset_a_path = 'BLIP_with_captions.csv'
dataset_b_path = 'dataset_with_captions.csv'

# Read datasets
data_a = pd.read_csv(dataset_a_path)
data_b = pd.read_csv(dataset_b_path)

# Extract captions (the two files use different column names)
captions_a = data_a['meaningful-caption'].tolist()
captions_b = data_b['caption'].tolist()

# Load a pre-trained text embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and fast

# Encode captions into vector space.
# The result is requested as NumPy arrays rather than torch tensors because
# scikit-learn's cosine_similarity operates on host-side arrays; a tensor that
# lives on a GPU cannot be converted implicitly and would raise here.
embeddings_a = model.encode(captions_a, convert_to_numpy=True)
embeddings_b = model.encode(captions_b, convert_to_numpy=True)

# Pairwise cosine similarity: one row per Dataset A caption,
# one column per Dataset B caption
similarity_matrix = cosine_similarity(embeddings_a, embeddings_b)

# Column index of the most similar Dataset B caption for every Dataset A caption
closest_matches = np.argmax(similarity_matrix, axis=1)

# One record per Dataset A caption: the caption, its best match and the score
matched_captions = [
    {
        'caption_a': caption,
        'caption_b': captions_b[best],
        'similarity': similarity_matrix[row, best],
    }
    for row, (caption, best) in enumerate(zip(captions_a, closest_matches))
]

# Convert matched results to a DataFrame
matched_df = pd.DataFrame(matched_captions)

# Save the results
matched_df.to_csv('matched_captions.csv', index=False)

print("Closest matches saved to 'matched_captions.csv'.")

In [2]:
# Store every caption's embedding next to its source row. Converting the whole
# embedding matrix at once yields one plain Python list of floats per row.
data_a['text2vec'] = embeddings_a.tolist()
data_b['text2vec'] = embeddings_b.tolist()

# Write both frames, embeddings included, to disk so they can be inspected
data_a.to_csv(
    'dataset_a_with_embeddings.csv',
    index=False,
)
data_b.to_csv(
    'dataset_b_with_embeddings.csv',
    index=False,
)

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from textblob import TextBlob
import nltk
import spacy
import matplotlib.pyplot as plt

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load a pre-trained language model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
nlp = spacy.load("en_core_web_sm")

# --- Feature Extraction ---

# NOTE: data_a, data_b, captions_a and captions_b are not created in this cell;
# they must already exist in the kernel before it runs.

# Text-to-vec embeddings
embeddings_a = model.encode(captions_a, convert_to_tensor=False)
embeddings_b = model.encode(captions_b, convert_to_tensor=False)
data_a['text2vec'] = [embedding.tolist() for embedding in embeddings_a]
data_b['text2vec'] = [embedding.tolist() for embedding in embeddings_b]

# Coerce every caption to str once, so that a non-string cell (e.g. a missing
# value read as float NaN) cannot break len(), .split(), TextBlob or spaCy.
# The tokens are computed once and shared by the length and diversity features.
caption_text = data_a['meaningful-caption'].map(str)
caption_tokens = caption_text.map(str.split)

# Length features
data_a['word_count'] = caption_tokens.map(len)
data_a['char_count'] = caption_text.map(len)

# Sentiment analysis (each caption is analysed once; both scores come from that result)
caption_sentiment = caption_text.map(lambda text: TextBlob(text).sentiment)
data_a['sentiment_polarity'] = caption_sentiment.map(lambda sentiment: sentiment.polarity)
data_a['sentiment_subjectivity'] = caption_sentiment.map(lambda sentiment: sentiment.subjectivity)

# Lexical diversity: unique tokens / total tokens, 0 for a caption without tokens
data_a['lexical_diversity'] = caption_tokens.map(
    lambda tokens: len(set(tokens)) / len(tokens) if tokens else 0
)

# Named Entity Recognition (NER): list of (entity text, entity label) pairs
data_a['entities'] = caption_text.map(
    lambda text: [(ent.text, ent.label_) for ent in nlp(text).ents]
)

# Topic modeling with LDA.
# Fitted on the same 'meaningful-caption' text as every other Dataset A feature;
# 'caption' is the column name used for Dataset B.
vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(caption_text)
lda = LatentDirichletAllocation(n_components=3, random_state=42)  # 3 topics
lda.fit(dtm)
data_a['topic'] = lda.transform(dtm).argmax(axis=1)

# Keyword extraction with TF-IDF: the vocabulary is capped at 10 terms, and each
# caption is labelled with the vocabulary terms that occur in it
tfidf_vectorizer = TfidfVectorizer(max_features=10)
tfidf_matrix = tfidf_vectorizer.fit_transform(caption_text)
keywords = tfidf_vectorizer.get_feature_names_out()
data_a['keywords'] = [
    ", ".join(keywords[i] for i in tfidf_matrix[row].indices)
    for row in range(tfidf_matrix.shape[0])
]

# --- Comparison Between Datasets ---

# Cosine similarity: rows follow captions_a, columns follow captions_b
similarity_matrix = cosine_similarity(embeddings_a, embeddings_b)

# Match closest captions: best Dataset B column for every Dataset A row
closest_matches = np.argmax(similarity_matrix, axis=1)
data_a['closest_caption'] = [captions_b[best] for best in closest_matches]
data_a['similarity_score'] = [
    similarity_matrix[row, best] for row, best in enumerate(closest_matches)
]

# Jaccard similarity
def jaccard_similarity(a, b):
    """Return the Jaccard similarity of two strings' whitespace-separated tokens.

    The score is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of
    tokens produced by ``str.split()``.

    Args:
        a: First text.
        b: Second text.

    Returns:
        A float in ``[0, 1]``. When neither text contains a token the union is
        empty and ``0.0`` is returned instead of dividing by zero.
    """
    set_a = set(a.split())
    set_b = set(b.split())
    union = set_a | set_b
    if not union:
        # Both inputs were empty or whitespace-only: no tokens to compare.
        return 0.0
    return len(set_a & set_b) / len(union)

# Highest token-overlap score of each Dataset A caption against all Dataset B captions
data_a['jaccard_with_b'] = data_a['meaningful-caption'].apply(
    lambda text: max(jaccard_similarity(text, other) for other in captions_b)
)

# Cluster analysis: group the Dataset A embeddings into five k-means clusters
kmeans = KMeans(random_state=42, n_clusters=5)
data_a['cluster'] = kmeans.fit(embeddings_a).labels_

# --- Visualization ---

# Histogram over every A-to-B pair in the similarity matrix
similarity_scores = similarity_matrix.flatten()
plt.hist(similarity_scores, bins=20)
plt.xlabel("Similarity")
plt.ylabel("Frequency")
plt.title("Distribution of Similarity Scores")
plt.show()

# --- Save Results ---

# Write out both datasets together with the feature columns added to them
data_a.to_csv(
    "dataset_a_features.csv",
    index=False,
)
data_b.to_csv(
    "dataset_b_features.csv",
    index=False,
)

# Write out, per Dataset A caption, its best Dataset B match and the score
matched_captions = pd.DataFrame({
    'caption_a': captions_a,
    'closest_caption_b': [captions_b[best] for best in closest_matches],
    'similarity_score': [
        similarity_matrix[row, best] for row, best in enumerate(closest_matches)
    ],
})
matched_captions.to_csv("matched_captions.csv", index=False)

print("Features and results saved to CSV files.")
