In [1]:
import pandas as pd
import os
import ast
import re
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from PIL import Image

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
from hdbscan import HDBSCAN
from sklearn.cluster import AgglomerativeClustering

In [3]:
# Pick the PyTorch device used for every tensor in this notebook:
# Apple MPS first, then CUDA, then CPU.
# `device` is always assigned, so the `.to(device)` call in create_embeddings
# does not raise NameError on a machine without an accelerator.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    print ("MPS and CUDA devices not found.")
    device = torch.device("cpu")

# Smoke test: allocate a one-element tensor on the selected device and show it.
x = torch.ones(1, device=device)
print (x)

tensor([1.], device='mps:0')


In [4]:
# Functions to save recommendations
def display_images(filepath, image_ids, nrow=5, save_images = False, image_dir="../../GPT/images"):
    """Tile artwork images into a single grid image and optionally save it.

    Images are laid out left-to-right, top-to-bottom. Each grid column is as
    wide as its widest image and each grid row as tall as its tallest image;
    every image is pasted at the top-left corner of its cell.

    Args:
        filepath: Destination of the grid image. Only written when
            ``save_images`` is true; missing parent directories are created.
        image_ids: Iterable of image ids. Each id is loaded from
            ``<image_dir>/<image_id>.jpg``.
        nrow: Number of images per grid row (i.e. the number of columns).
        save_images: When true, save the grid to ``filepath``.
        image_dir: Directory containing the ``.jpg`` files. The default is the
            path that used to be hard-coded in this function.

    Returns:
        None.

    Raises:
        ValueError: If ``nrow`` is smaller than 1.
    """
    # Materialise first: callers pass NumPy arrays, whose truth value is ambiguous.
    image_ids = list(image_ids)
    if not image_ids:
        # A 0x0 grid cannot be saved; report and stop instead of failing later.
        print("No images to display.")
        return
    if nrow < 1:
        raise ValueError(f"nrow must be at least 1, got {nrow}")

    # Load images. Copy the pixel data and close each file right away so that
    # no file handles stay open once the grid has been built.
    images = []
    for image_id in image_ids:
        with Image.open(os.path.join(image_dir, f"{image_id}.jpg")) as img:
            images.append(img.copy())

    # Calculate the size of the grid
    nrows = (len(images) + nrow - 1) // nrow
    max_widths = [0] * nrow
    max_heights = [0] * nrows
    for i, img in enumerate(images):
        row, col = divmod(i, nrow)
        max_widths[col] = max(max_widths[col], img.width)
        max_heights[row] = max(max_heights[row], img.height)

    # Create a new blank image for the grid
    grid_image = Image.new("RGB", (sum(max_widths), sum(max_heights)))

    # Paste images into the grid
    y_offset = 0
    for row in range(nrows):
        x_offset = 0
        for col in range(nrow):
            position = row * nrow + col
            if position < len(images):
                grid_image.paste(images[position], (x_offset, y_offset))
            x_offset += max_widths[col]
        y_offset += max_heights[row]

    if save_images:
        out_dir = os.path.dirname(filepath)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        grid_image.save(filepath)
        # Only claim the file was saved when it actually was.
        print(f"Image saved as {filepath}")

In [5]:
# Project-relative locations used by the notebook.
DATA_DIR = "data"  # get_metadata() reads tags_replaced.csv from here
# NOTE(review): IMAGE_DIR is not referenced by any visible cell; display_images
# loads from "../../GPT/images" instead — confirm which directory is correct.
IMAGE_DIR = "../Embedding/images"
OUTPUT_DIR = "output"  # destination of the saved image grids

# Load embedding model (used as the default `model` of create_embeddings and
# to encode the artwork descriptions further down)
model = SentenceTransformer('all-MiniLM-L6-v2')



In [6]:
# a function to get metadata
def get_metadata():
    """Load the artwork metadata table from ``DATA_DIR/tags_replaced.csv``.

    The first CSV column becomes the index. The ``tags`` column, stored in the
    file as the text of a Python literal, is parsed with ``ast.literal_eval``.

    Returns:
        pandas.DataFrame: The metadata with the ``tags`` column parsed.
    """
    csv_path = os.path.join(DATA_DIR, "tags_replaced.csv")
    frame = pd.read_csv(csv_path, index_col=0)
    parsed_tags = frame["tags"].apply(ast.literal_eval)
    frame["tags"] = parsed_tags
    return frame

# a function to convert string of list to actual list
def string_to_list(string):
    """Convert the text form of a list of strings into an actual list.

    Text that looks like a Python list literal of strings (for example
    ``"['Impressionism', 'Realism']"``) is parsed with ``ast.literal_eval``,
    so apostrophes inside items (``"O'Keeffe"``), double-quoted items and
    missing spaces after commas are handled correctly. Anything else falls
    back to the original heuristic: drop ``[``, ``]`` and ``'`` and split on
    ``", "``.

    Args:
        string: Text to convert.

    Returns:
        list[str]: The items. An empty list literal yields ``[""]``, exactly
        as the original heuristic did, so callers always get one item or more.
    """
    candidate = string.strip()
    if candidate.startswith("[") and candidate.endswith("]"):
        try:
            parsed = ast.literal_eval(candidate)
        except (ValueError, SyntaxError):
            parsed = None
        # Only trust the parse when it is a flat list of strings; otherwise the
        # legacy heuristic below keeps its historical output.
        if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
            return parsed if parsed else [""]

    # Legacy heuristic for text that is not a clean list-of-strings literal.
    cleaned = string.replace("[", "").replace("]", "").replace("'", "")
    return cleaned.split(", ")

# a function to remove parentheses from a string
def remove_parentheses(text):
    """Delete every ``(...)`` group from ``text`` and trim surrounding whitespace.

    Whitespace that was on either side of a removed group inside the string
    is left as it is.
    """
    without_groups = re.sub(r'\([^)]*\)', '', text)
    return without_groups.strip()

# This function takes the metadata and returns a list with one dictionary per row, holding the cleaned artist name and the style, theme, and movement tags
def get_artwork_tags(df):
    """Build one tag dictionary per metadata row.

    Args:
        df: DataFrame with the columns ``artist_display``, ``style_tags``,
            ``theme_tags`` and ``movement``.

    Returns:
        list[dict]: In row order, dictionaries with the keys ``artists`` (a
        one-element list holding the artist name without parenthesised
        parts), ``styles``, ``themes`` and ``movements`` (each converted
        with ``string_to_list``).
    """
    def _tags_for(record):
        # Same key order and evaluation order as the row-by-row loop this replaces.
        return {
            "artists": [remove_parentheses(record['artist_display'])],
            "styles": string_to_list(record['style_tags']),
            "themes": string_to_list(record['theme_tags']),
            "movements": string_to_list(record['movement']),
        }

    return [_tags_for(record) for _, record in df.iterrows()]

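# a function to embed an artwork (or a user profile) as four vectors: the mean embedding of its artist, style, theme, and movement tags (the weighting is applied later, in calculate_tensor_similarity)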
def create_embeddings(object, model = model):
    """Embed a tag dictionary as one mean vector per tag category.

    Args:
        object: Mapping with the keys ``artists``, ``styles``, ``themes`` and
            ``movements``, each holding a list of strings.
        model: Encoder whose ``encode`` method turns a list of strings into
            an array of embeddings; defaults to the module-level ``model``.

    Returns:
        torch.Tensor: Four stacked rows, in the order artists, styles,
        themes, movements. Each row is the mean over the embeddings of that
        category's strings. The tensor is moved to the module-level
        ``device``.
    """
    # Look up all four categories before encoding anything.
    tag_groups = [object[field] for field in ('artists', 'styles', 'themes', 'movements')]
    # Mean-pool the embeddings of each category.
    pooled = [np.mean(model.encode(tags), axis=0) for tags in tag_groups]
    return torch.tensor(np.array(pooled)).to(device)

# a function to get all tags of an artwork
def get_new_tags(row):
    """Return all tags of an artwork as ``[artist, medium, *tags]``.

    Args:
        row: Mapping or row with ``artist_display``, ``medium`` and ``tags``.

    Returns:
        list: A new list; ``row['tags']`` itself is not modified. The artist
        name has its parenthesised parts removed.
    """
    artist = remove_parentheses(row['artist_display'])
    medium = row['medium']
    return [artist, medium, *row['tags']]
    
# calculate the similarity between two tensors using GPU with either cosine similarity, euclidean distance, or dot product
def calculate_tensor_similarity(tensor1, tensor2, method = 'cosine', weights = (0.1, 0.3, 0.3, 0.3)):
    """Weighted similarity between two stacks of category embeddings.

    Row ``i`` of ``tensor1`` is compared with row ``i`` of ``tensor2``, and
    the per-row scores are combined into one number with a weighted average.

    Args:
        tensor1: 2-D tensor with one row per category, as produced by
            ``create_embeddings``.
        tensor2: Tensor of the same shape as ``tensor1``.
        method: ``'cosine'`` for cosine similarity, ``'euclidean'`` for the
            negated Euclidean (L2) distance so that larger still means more
            similar; any other value selects the dot product.
        weights: One weight per row. The default keeps the previously
            hard-coded weighting: 0.1 for the first row and 0.3 for each of
            the other three.

    Returns:
        numpy floating scalar: The weighted average of the per-row scores.
    """
    if method == 'cosine':
        scores = torch.nn.functional.cosine_similarity(tensor1, tensor2)
    elif method == 'euclidean':
        # p=2 is the Euclidean norm; this branch used p=1 (Manhattan distance) before.
        scores = -torch.nn.functional.pairwise_distance(tensor1, tensor2, p=2)
    else:
        # Row-wise dot product in one vectorised operation.
        scores = (tensor1 * tensor2).sum(dim=1)
    result = scores.detach().cpu().numpy()
    return np.average(result, weights = weights, axis=0)

# find the top k most similar artworks to the new user
def get_top_k_similar_artworks(user_embeddings, artwork_embeddings, k = 10, method = 'cosine'):
    """Rank artworks by similarity to a user and keep the best ``k``.

    Args:
        user_embeddings: Embedding tensor of the user.
        artwork_embeddings: Iterable of embedding tensors, one per artwork.
        k: Number of results to return.
        method: Similarity measure passed to ``calculate_tensor_similarity``.

    Returns:
        list[tuple]: Up to ``k`` pairs ``(position, similarity)``, where
        ``position`` is the artwork's position in ``artwork_embeddings``,
        ordered from most to least similar.
    """
    scored = [
        (position, calculate_tensor_similarity(user_embeddings, candidate, method))
        for position, candidate in enumerate(artwork_embeddings)
    ]
    # Highest similarity first; ties keep their original relative order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:k]

In [7]:
# Load metadata
metadata = get_metadata()

# Create sample users
old_user_log = pd.DataFrame(
        {
            "object_id": [1258, 146, 150, 1155],
            "timestamp": [
                "2024-04-28 23:12:04.378821",
                "2024-04-28 23:12:10.378821",
                "2024-04-28 23:13:04.378821",
                "2024-04-28 23:13:10.378821",
            ],
        }
    )
# favorite artist, Favorite Art Format, Favorite Mediums, Favorite Art Style, Favorite Genre, Favorite Period, Favorite Color, Cultural Interests
# sample_new_user_log = ['Claude Monet', 'Paintings', 'Oil on canvas', 'Impressionism', 'Landscape', 'Late 19th Century', 'Blue', 'European']
sample_new_user_log = {
    "artists": ["Claude Monet", "Auguste Renoir"],
    "styles": ["Impressionism", "Brushwork", "Clarity"],
    "themes": ["Nature", "landscapes", "Tranquility"],
    "movements": ["Impressionism", "Dutch Golden Age"]
}

sample_new_user_log2 = {
    "artists": [""],
    "styles": ["Baroque", "Realism", "Portraiture"],
    "themes": ['Nobility', 'Status', 'Youth', 'Aristocracy', 'Individuality'],
    "movements": ['Baroque', 'Northern Renaissance']
}

In [8]:
# clean the artist names
remove_parentheses(metadata['artist_display'].iloc[0])

# get the tags of the artwork
metadata_tags = get_artwork_tags(metadata)

# get the embeddings of the artwork
artwork_embeddings = list(map(create_embeddings, metadata_tags))

In [9]:
# get the embeddings of the sample user
sample_new_user_embeddings = create_embeddings(sample_new_user_log)

# get the top 50 recommendations for the sample user based on consine similarity
top_10_results = get_top_k_similar_artworks(sample_new_user_embeddings, artwork_embeddings, 50)

# save the image of the top 50 most similar artworks
display_images(os.path.join(OUTPUT_DIR, "top_50_similar_artworks_user1_consine.jpg"), metadata.iloc[[x[0] for x in top_10_results]]['image_id'].values, nrow=5, save_images = True)

# get the top 50 recommendations for the sample user based on euclidean similarity
top_10_results = get_top_k_similar_artworks(sample_new_user_embeddings, artwork_embeddings, 50, 'euclidean')

# save the image of the top 50 most similar artworks
display_images(os.path.join(OUTPUT_DIR, "top_50_similar_artworks_user1_euclidean.jpg"), metadata.iloc[[x[0] for x in top_10_results]]['image_id'].values, nrow=5, save_images = True)

# get the top 50 recommendations for the sample user based on dot product similarity
top_10_results = get_top_k_similar_artworks(sample_new_user_embeddings, artwork_embeddings, 50, 'dot')

# save the image of the top 50 most similar artworks
display_images(os.path.join(OUTPUT_DIR, "top_50_similar_artworks_user1_dot.jpg"), metadata.iloc[[x[0] for x in top_10_results]]['image_id'].values, nrow=5, save_images = True)

Image saved as output/top_50_similar_artworks_user1_consine.jpg
Image saved as output/top_50_similar_artworks_user1_euclidean.jpg
Image saved as output/top_50_similar_artworks_user1_dot.jpg


In [10]:
# get the embeddings of another sample user
sample_new_user_embeddings2 = create_embeddings(sample_new_user_log2)

# get the top 50 recommendations for the second sample user based on consine similarity
top_10_results2 = get_top_k_similar_artworks(sample_new_user_embeddings2, artwork_embeddings, 50)

# save the image of the top 50 most similar artworks
display_images(os.path.join(OUTPUT_DIR, "top_10_similar_artworks_user2_cosine.jpg"), metadata.iloc[[x[0] for x in top_10_results2]]['image_id'].values, nrow=5, save_images = True)

# get the top 50 recommendations for the second sample user based on euclidean similarity
top_10_results2 = get_top_k_similar_artworks(sample_new_user_embeddings2, artwork_embeddings, 50, 'euclidean')

# save the image of the top 50 most similar artworks
display_images(os.path.join(OUTPUT_DIR, "top_10_similar_artworks_user2_euclidean.jpg"), metadata.iloc[[x[0] for x in top_10_results2]]['image_id'].values, nrow=5, save_images = True)

# get the top 50 recommendations for the second sample user based on dot product similarity
top_10_results2 = get_top_k_similar_artworks(sample_new_user_embeddings2, artwork_embeddings, 50, 'dot')

# save the image of the top 50 most similar artworks
display_images(os.path.join(OUTPUT_DIR, "top_10_similar_artworks_user2_dot.jpg"), metadata.iloc[[x[0] for x in top_10_results2]]['image_id'].values, nrow=5, save_images = True)

Image saved as output/top_10_similar_artworks_user2_cosine.jpg
Image saved as output/top_10_similar_artworks_user2_euclidean.jpg
Image saved as output/top_10_similar_artworks_user2_dot.jpg


In [11]:
# Third sample user: a broader profile with 8 artists, 8 styles, 11 themes and
# 8 movements, using the same four keys that create_embeddings reads.
sample_new_user_log3 = {
    "artists": ["Claude Monet", "Nicolaes Maes", "Franco-Flemish 15th Century", "Rembrandt", "Pablo Picasso", "Vincent van Gogh", "Leonardo da Vinci", "Georgia O'Keeffe"],
    "styles": ["Impressionism", "Realism", "Chiaroscuro", "Early Renaissance", "Cubism", "Post-Impressionism", "Surrealism", "Modernism"],
    "themes": ["Nature", "landscapes", "Tranquility", "Everyday Life", "Aging", "Religious Themes", "Abstract", "Still Life", "Portraits", "Fantasy", "Urban Life"],
    "movements": ["Impressionism", "Dutch Golden Age", "Northern Renaissance", "Early Netherlandish", "Modern Art", "Post-Impressionism", "Renaissance", "American Modernism"]
}


In [12]:
# get the embeddings of another sample user
sample_new_user_embeddings3 = create_embeddings(sample_new_user_log3)

# get the top 50 recommendations for the second sample user based on consine similarity
top_10_results3 = get_top_k_similar_artworks(sample_new_user_embeddings3, artwork_embeddings, 50)

# save the image of the top 50 most similar artworks
display_images(os.path.join(OUTPUT_DIR, "top_50_similar_artworks_user3_cosine.jpg"), metadata.iloc[[x[0] for x in top_10_results3]]['image_id'].values, nrow=5, save_images = True)

# get the top 50 recommendations for the second sample user based on euclidean similarity
top_10_results3 = get_top_k_similar_artworks(sample_new_user_embeddings3, artwork_embeddings, 50, 'euclidean')

# save the image of the top 50 most similar artworks
display_images(os.path.join(OUTPUT_DIR, "top_50_similar_artworks_user3_euclidean.jpg"), metadata.iloc[[x[0] for x in top_10_results3]]['image_id'].values, nrow=5, save_images = True)

# get the top 50 recommendations for the second sample user based on dot product similarity
top_10_results3 = get_top_k_similar_artworks(sample_new_user_embeddings3, artwork_embeddings, 50, 'dot')

# save the image of the top 50 most similar artworks
display_images(os.path.join(OUTPUT_DIR, "top_50_similar_artworks_user3_dot.jpg"), metadata.iloc[[x[0] for x in top_10_results3]]['image_id'].values, nrow=5, save_images = True)

Image saved as output/top_50_similar_artworks_user3_cosine.jpg
Image saved as output/top_50_similar_artworks_user3_euclidean.jpg
Image saved as output/top_50_similar_artworks_user3_dot.jpg


In [13]:
# a function to merge the intro, overview, style, and theme fields of a metadata row into one paragraph, separated by newlines
def merge_intro_overview_style(row):
    """Join a row's ``intro``, ``overview``, ``style`` and ``theme`` texts.

    Args:
        row: Mapping or row holding those four string fields.

    Returns:
        str: The four texts in that order, separated by single newlines.
    """
    sections = [row['intro'], row['overview'], row['style'], row['theme']]
    return '\n'.join(sections)

In [14]:
# Metadata rows of user 3's most recent ranking (the 'dot' one), in rank order.
# Copied so that adding columns does not touch `metadata`.
top_50_results_df = metadata.iloc[[position for position, _score in top_10_results3]].copy()
# One text per artwork: intro, overview, style and theme joined by newlines.
top_50_results_df['description'] = top_50_results_df.apply(merge_intro_overview_style, axis=1)

In [15]:
# Array of the merged description texts of user 3's recommendations
# (despite the name, these are descriptions, not tags).
top_50_tags3 = top_50_results_df['description'].values

In [16]:
# Encode every description with the sentence-embedding model loaded earlier.
top_50_embeddings3 = model.encode(top_50_tags3)

In [17]:
# Sanity check: there should be one embedding row per description.
top_50_embeddings3.shape, top_50_tags3.shape

((50, 384), (50,))

In [18]:
# Density-based clustering of the description embeddings.
# NOTE(review): `cluster_labels` is not used by any later visible cell; the
# exhibition grids are built from the agglomerative clustering instead.
clusterer = HDBSCAN(min_cluster_size=2, min_samples=1, metric='chebyshev')
cluster_labels = clusterer.fit_predict(top_50_embeddings3)

In [19]:
# Agglomerative clustering of user 3's description embeddings. With
# n_clusters=None the number of clusters is determined by distance_threshold
# instead of being fixed up front.
# NOTE(review): 1.5 looks hand-tuned — confirm how it was chosen.
clustering_model = AgglomerativeClustering(
    n_clusters=None, distance_threshold=1.5
)  # , affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(top_50_embeddings3)
# One cluster label per description, in the same order as top_50_embeddings3.
cluster_assignment = clustering_model.labels_

In [20]:
# Attach each artwork's cluster label; the labels are in the same order as the
# rows because the embeddings were computed from this frame's descriptions.
top_50_results_df['cluster_label'] = cluster_assignment

In [21]:
# Save one image grid ("exhibition") per cluster for user 3, visiting the
# clusters in order of first appearance in the ranking.
exhibition_labels = top_50_results_df['cluster_label']
for cluster_id in exhibition_labels.unique():
    result = top_50_results_df.loc[exhibition_labels == cluster_id]
    exhibition_path = os.path.join(OUTPUT_DIR, f"user3_exhibition_{cluster_id}.jpg")
    display_images(exhibition_path, result['image_id'].values, nrow=5, save_images = True)

Image saved as output/user3_exhibition_2.jpg
Image saved as output/user3_exhibition_1.jpg
Image saved as output/user3_exhibition_0.jpg


In [22]:
# Fourth sample user: the widest profile, with 15 artists, 15 styles, 18 themes
# and 15 movements, using the same four keys that create_embeddings reads.
sample_new_user_log4 = {
    "artists": [
        "Claude Monet", "Nicolaes Maes", "Franco-Flemish 15th Century", 
        "Rembrandt", "Pablo Picasso", "Vincent van Gogh", "Leonardo da Vinci", 
        "Georgia O'Keeffe", "Jackson Pollock", "Frida Kahlo", "Salvador Dali", 
        "Andy Warhol", "Henri Matisse", "Edvard Munch", "Gustav Klimt"
    ],
    "styles": [
        "Impressionism", "Realism", "Chiaroscuro", "Early Renaissance", 
        "Cubism", "Post-Impressionism", "Surrealism", "Modernism", 
        "Abstract Expressionism", "Pop Art", "Expressionism", "Symbolism", 
        "Art Nouveau", "Baroque", "Fauvism"
    ],
    "themes": [
        "Nature", "landscapes", "Tranquility", "Everyday Life", "Aging", 
        "Religious Themes", "Abstract", "Still Life", "Portraits", "Fantasy", 
        "Urban Life", "Mythology", "Dreams", "Identity", "Conflict", 
        "Love", "Death", "Political Commentary"
    ],
    "movements": [
        "Impressionism", "Dutch Golden Age", "Northern Renaissance", 
        "Early Netherlandish", "Modern Art", "Post-Impressionism", "Renaissance", 
        "American Modernism", "Abstract Expressionism", "Pop Art", "Surrealism", 
        "Expressionism", "Symbolism", "Art Nouveau", "Baroque"
    ]
}

In [23]:
# get the embeddings of another sample user
sample_new_user_embeddings4 = create_embeddings(sample_new_user_log4)

# get the top 50 recommendations for the second sample user based on consine similarity
top_10_results4 = get_top_k_similar_artworks(sample_new_user_embeddings4, artwork_embeddings, 55)

# save the image of the top 50 most similar artworks
display_images(os.path.join(OUTPUT_DIR, "top_55_similar_artworks_user4_cosine.jpg"), metadata.iloc[[x[0] for x in top_10_results4]]['image_id'].values, nrow=5, save_images = True)

# get the top 50 recommendations for the second sample user based on euclidean similarity
top_10_results4 = get_top_k_similar_artworks(sample_new_user_embeddings4, artwork_embeddings, 55, 'euclidean')

# save the image of the top 50 most similar artworks
display_images(os.path.join(OUTPUT_DIR, "top_55_similar_artworks_user4_euclidean.jpg"), metadata.iloc[[x[0] for x in top_10_results4]]['image_id'].values, nrow=5, save_images = True)

# get the top 50 recommendations for the second sample user based on dot product similarity
top_10_results4 = get_top_k_similar_artworks(sample_new_user_embeddings4, artwork_embeddings, 55, 'dot')

# save the image of the top 50 most similar artworks
display_images(os.path.join(OUTPUT_DIR, "top_55_similar_artworks_user4_dot.jpg"), metadata.iloc[[x[0] for x in top_10_results4]]['image_id'].values, nrow=5, save_images = True)

Image saved as output/top_55_similar_artworks_user4_cosine.jpg
Image saved as output/top_55_similar_artworks_user4_euclidean.jpg
Image saved as output/top_55_similar_artworks_user4_dot.jpg


In [24]:
# Metadata rows of user 4's most recent ranking (the 'dot' one), in rank order.
# NOTE: this rebinds `top_50_results_df`, replacing user 3's frame, and it holds
# 55 rows despite the "top_50" in its name.
top_50_results_df = metadata.iloc[[position for position, _score in top_10_results4]].copy()
# One text per artwork: intro, overview, style and theme joined by newlines.
top_50_results_df['description'] = top_50_results_df.apply(merge_intro_overview_style, axis=1)
# Description texts as an array, then one embedding per description.
top_50_tags4 = top_50_results_df['description'].values
top_50_embeddings4 = model.encode(top_50_tags4)

In [29]:
# Agglomerative clustering of user 4's description embeddings. With
# n_clusters=None the number of clusters is determined by distance_threshold
# instead of being fixed up front.
# NOTE(review): 1.3 differs from the 1.5 used for user 3 and looks hand-tuned — confirm.
clustering_model2 = AgglomerativeClustering(
    n_clusters=None, distance_threshold=1.3
)  # , affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model2.fit(top_50_embeddings4)
# Rebinds `cluster_assignment` (previously user 3's labels) to user 4's labels
# and attaches them to user 4's frame.
cluster_assignment = clustering_model2.labels_
top_50_results_df['cluster_label'] = cluster_assignment

In [30]:
# Save one image grid ("exhibition") per cluster for user 4, visiting the
# clusters in order of first appearance in the ranking.
exhibition_labels = top_50_results_df['cluster_label']
for cluster_id in exhibition_labels.unique():
    result = top_50_results_df.loc[exhibition_labels == cluster_id]
    exhibition_path = os.path.join(OUTPUT_DIR, f"user4_exhibition_{cluster_id}.jpg")
    display_images(exhibition_path, result['image_id'].values, nrow=5, save_images = True)

Image saved as output/user4_exhibition_2.jpg
Image saved as output/user4_exhibition_1.jpg
Image saved as output/user4_exhibition_4.jpg
Image saved as output/user4_exhibition_3.jpg
Image saved as output/user4_exhibition_0.jpg
Image saved as output/user4_exhibition_5.jpg
