In [1]:
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
!pip install gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from gensim.models import KeyedVectors

# Download the pre-trained word2vec model from Google's Word2Vec
word2vec_file_path = './word2vec_file.bin'

# Downloading the model (example for a 300-dimensional model)
import gensim.downloader as api
model = api.load('word2vec-google-news-300')

# Save the model in the binary format
model.save_word2vec_format(word2vec_file_path, binary=True)



In [4]:
from gensim.models import KeyedVectors

def import_word2vec_embeddings(file_path):
    # Load the pre-trained word2vec model
    model = KeyedVectors.load_word2vec_format(file_path, binary=True)
    
    # Create an empty dictionary to store the word embeddings
    word_embeddings = {}

    # Populate the dictionary with word embeddings
    for word in model.key_to_index:
        word_embeddings[word] = model.get_vector(word)
    
    return word_embeddings

# Provide the file path to your word2vec file
word2vec_file_path = './word2vec_file.bin'

# Call the function to import word2vec embeddings into a dictionary
word_embeddings = import_word2vec_embeddings(word2vec_file_path)

In [5]:
print(len(word_embeddings.keys()))

3000000


In [6]:
import random

def pick_random_pairs(dictionary, num_pairs):
    keys = list(dictionary.keys())
    random.shuffle(keys)
    random_pairs = {}
    for key in keys[:num_pairs]:
        random_pairs[key] = dictionary[key]
    return random_pairs

random_embeddings = pick_random_pairs(word_embeddings,1000000)

In [7]:
print(len(random_embeddings.keys()))

1000000


In [8]:
import numpy as np
from sklearn.preprocessing import normalize

def normalize_word_embeddings(word_embeddings):
    # Extract the word vectors and store them in a numpy array
    embeddings = np.array(list(word_embeddings.values()))

    # Normalize the word embeddings
    normalized_embeddings = normalize(embeddings)

    # Update the normalized embeddings back in the dictionary
    for i, word in enumerate(word_embeddings.keys()):
        word_embeddings[word] = normalized_embeddings[i]

    return word_embeddings

In [9]:
# Call the function to get normalized word embeddings
normalized_embeddings = normalize_word_embeddings(random_embeddings)

In [10]:
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [11]:
import numpy as np
import faiss

'''
num_clusters = 256  
quantizer = faiss.IndexFlatL2(embedding_vectors.shape[1])
index = faiss.IndexIVFFlat(quantizer, embedding_vectors.shape[1], num_clusters)
index.train(embedding_vectors)
index.add(embedding_vectors)
_, similar_indices = index.search(embedding_vectors, num_similar)

'''

def calculate_fractal_value(embeddings, box_size, k):
    
    embedding_vectors = np.array(list(embeddings.values()))
    
    # Initialize Faiss index
    embedding_dim = embedding_vectors.shape[1]
    index = faiss.IndexFlatL2(embedding_dim)

    # Add embeddings to the Faiss index
    index.add(embedding_vectors)

    # Search for the nearest neighbors of all vectors
    neighbor_distances , neighbor_indices = index.search(embedding_vectors, k)
    
    # Calculating similarity values using the neighbor_distances and updating it.
    for i in range(neighbor_distances.shape[0]):
        for j in range(neighbor_distances.shape[1]):
            neighbor_distances[i][j] = 1/(1+neighbor_distances[i][j])
    
    print(neighbor_distances)

    # Apply box counting algorithm for fractal value calculation
    num_boxes = 0
    num_filled_boxes = 0

    # Iterate over the similarity scores in chunks of box_size
    for i in range(0, embedding_vectors.shape[0], box_size):
        box_scores = neighbor_distances[i:i+box_size, :]

        for i in range(box_scores.shape[0]):
            for j in range(box_scores.shape[1]):
                if(box_scores[i][j]>0.5):
                    num_filled_boxes += 1

        num_boxes+=1
        
    print(num_filled_boxes,num_boxes)
    fractal_dimension = np.log(num_filled_boxes) / np.log(num_boxes)
    return fractal_dimension

In [12]:
%%time
fractal_dim = calculate_fractal_value(normalized_embeddings,10,1000)
print("Fractal Dimension:", fractal_dim)

[[0.9999999  0.53470135 0.52227837 ... 0.45049554 0.45048887 0.45048872]
 [1.         0.547726   0.5393408  ... 0.4523802  0.45237556 0.45236173]
 [1.         0.7051065  0.68094784 ... 0.5217019  0.5216767  0.52166843]
 ...
 [0.9999999  0.65995324 0.6590021  ... 0.52847683 0.5284419  0.52831197]
 [0.99999976 0.5050519  0.5028653  ... 0.45966485 0.4596595  0.45965248]
 [0.99999976 0.6395914  0.62621284 ... 0.5427284  0.54272753 0.5426817 ]]
442208944 100000
Fractal Dimension: 1.729125504451146
CPU times: user 8h 29min 58s, sys: 42.7 s, total: 8h 30min 40s
Wall time: 4h 22min 9s
