In [1]:
import gzip
import gensim 
import logging

# Configure the root logger once for the whole notebook: every record is
# rendered as "<timestamp> : <level> : <message>" and records below INFO
# are dropped.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
!pip install fasttext



In [3]:
!pip install gdown

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Download the FastText model file if it doesn't exist
import os
import shutil

import gdown

if not os.path.exists('cc.en.300.bin'):
    url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz'
    output = 'cc.en.300.bin.gz'
    gdown.download(url, output, quiet=False)

    # Decompress in fixed-size chunks instead of reading the whole archive
    # into memory: the decompressed model is several gigabytes.
    # Write to a temporary name and rename only after the copy has finished,
    # so an interrupted run never leaves a truncated 'cc.en.300.bin' that the
    # existence check above would accept on the next run.
    partial_output = 'cc.en.300.bin.part'
    with gzip.open(output, 'rb') as f_in, open(partial_output, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out, 16 * 1024 * 1024)
    os.replace(partial_output, 'cc.en.300.bin')

Downloading...
From: https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
To: /content/cc.en.300.bin.gz
100%|██████████| 4.50G/4.50G [00:26<00:00, 169MB/s]


In [4]:
import fasttext.util

# Path of the decompressed fastText binary model produced by the download cell.
model_path = 'cc.en.300.bin'

# Load the model from disk.
model = fasttext.load_model(model_path)

# Get the word embeddings
# NOTE(review): this is the model's raw input matrix. It may contain more rows
# than there are vocabulary words (presumably subword/n-gram buckets), and a
# raw row may differ from `model.get_word_vector(word)` -- confirm which of
# the two representations the analysis is meant to use.
embeddings = model.get_input_matrix()

# Create a dictionary to store the word embeddings
# (`documents` is not used by any cell visible in this notebook.)
documents = []
word_embeddings = {}

# Populate the dictionary with word embeddings
# zip() pairs the i-th word with the i-th matrix row and stops at the shorter
# of the two sequences, so any extra matrix rows are simply not mapped.
for word, vector in zip(model.get_words(), embeddings):
    word_embeddings[word] = vector



In [5]:
# Number of words that received an embedding.
print(len(word_embeddings))

2000000


In [68]:
import random

def pick_random_pairs(dictionary, num_pairs, seed=None):
    """Return a new dict holding a random subset of ``dictionary``'s items.

    Args:
        dictionary: Mapping to sample from. It is not modified.
        num_pairs: Maximum number of (key, value) pairs to keep. If it exceeds
            ``len(dictionary)``, every pair is returned (in shuffled order).
        seed: Optional seed for a private ``random.Random`` generator, making
            the selection reproducible. When ``None`` (the default) the
            module-level ``random`` state is used, exactly as before.

    Returns:
        A dict with at most ``num_pairs`` entries; values are the same objects
        as in ``dictionary`` (no copies are made).

    Raises:
        ValueError: If ``num_pairs`` is negative. A negative count would
            otherwise be interpreted as a slice "all but the last n keys".
    """
    if num_pairs < 0:
        raise ValueError(f"num_pairs must be non-negative, got {num_pairs!r}")

    keys = list(dictionary.keys())
    if seed is None:
        random.shuffle(keys)
    else:
        # A dedicated generator keeps the global random state untouched.
        random.Random(seed).shuffle(keys)

    return {key: dictionary[key] for key in keys[:num_pairs]}

# Keep a random subset of 1,000,000 (word, vector) pairs for the analysis.
random_embeddings = pick_random_pairs(word_embeddings, 1_000_000)

In [69]:
# Number of sampled (word, vector) pairs.
print(len(random_embeddings))

1000000


In [70]:
import numpy as np
from sklearn.preprocessing import normalize

def normalize_word_embeddings(word_embeddings):
    """Rescale every embedding vector with ``sklearn.preprocessing.normalize``.

    With the defaults used here each row is scaled to unit L2 norm.

    The dictionary is updated **in place** and also returned, so the argument
    and the return value are the same object.

    Args:
        word_embeddings: dict mapping word -> 1-D numeric vector. All vectors
            are assumed to have the same length so that they stack into a
            single 2-D array.

    Returns:
        The same dict, with every value replaced by the corresponding row of
        the normalized matrix. An empty dict is returned unchanged.
    """
    # Nothing to do for an empty mapping; stacking zero vectors would produce
    # a 1-D array, which `normalize` rejects.
    if not word_embeddings:
        return word_embeddings

    # Stack the vectors into one (n_words, dim) array. np.array always builds
    # a fresh array here, so the caller's original vectors are not aliased.
    embeddings = np.array(list(word_embeddings.values()))

    # Because `embeddings` is private to this call, it can be normalized in
    # place (copy=False), avoiding a second full-size copy of the matrix.
    normalized_embeddings = normalize(embeddings, copy=False)

    # Write the rows back. Only existing keys are reassigned, so the dict's
    # size and key order do not change while iterating.
    for word, row in zip(word_embeddings, normalized_embeddings):
        word_embeddings[word] = row

    return word_embeddings

In [71]:
# Call the function to get normalized word embeddings.
# normalize_word_embeddings() updates the dict it receives and returns that
# same dict, so after this line `normalized_embeddings` and
# `random_embeddings` refer to one and the same (normalized) mapping.
normalized_embeddings = normalize_word_embeddings(random_embeddings)

In [7]:
conda install -c conda-forge faiss

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.0



## Package Plan ##

  environment location: /Users/akallaku/anaconda3

  added / updated specs:
    - faiss


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    faiss-1.7.4                |py310h2129542_0_cpu         1.2 MB  conda-forge
    libblas-3.9.0              |16_osx64_openblas          13 KB  conda-forge
    libcxx-16.0.4              |       hd57cbcb_0         1.1 MB  conda-forge
    libfaiss-avx2-1.7.4        |   h1234567_0_cpu         1.4 MB  conda-forge
    liblapack-3.9.0            |16_osx64_openblas          13 KB  conda-forge
    llvm-openmp-16.0.4         |

In [72]:
import numpy as np
import faiss

# Alternative, approximate-search index setup (IVF). Not executed; kept for
# reference only. `embedding_vectors` and `num_similar` are not defined at
# this point in the notebook.
#
# num_clusters = 256
# quantizer = faiss.IndexFlatL2(embedding_vectors.shape[1])
# index = faiss.IndexIVFFlat(quantizer, embedding_vectors.shape[1], num_clusters)
# index.train(embedding_vectors)
# index.add(embedding_vectors)
# _, similar_indices = index.search(embedding_vectors, num_similar)

def calculate_fractal_value(embeddings, box_size, k):
    """Compute a box-counting style value from k-nearest-neighbour similarities.

    Steps:
      1. Stack all vectors and index them in an exact FAISS ``IndexFlatL2``.
      2. Query the index with the same vectors to get, for each vector, the
         distances to its ``k`` nearest neighbours. ``IndexFlatL2`` reports
         *squared* Euclidean distances, and because the queries are the
         indexed vectors themselves, each vector is normally its own first
         neighbour (distance ~0).
      3. Turn every distance ``d`` into a similarity ``1 / (1 + d)``.
      4. Count the similarities greater than 0.5 (``num_filled_boxes``) and
         the number of row chunks of ``box_size`` vectors (``num_boxes``).
      5. Return ``log(num_filled_boxes) / log(num_boxes)``.

    The similarity matrix and the two counts are printed as a side effect.

    Args:
        embeddings: Non-empty dict mapping word -> 1-D vector; all vectors
            must have the same length.
        box_size: Positive number of consecutive rows that make up one box.
        k: Positive number of nearest neighbours to retrieve per vector.

    Returns:
        ``np.log(num_filled_boxes) / np.log(num_boxes)``. When there is only
        one box, ``log(num_boxes)`` is 0 and NumPy yields ``inf``/``nan`` with
        a runtime warning rather than raising.

    Raises:
        ValueError: If ``box_size`` or ``k`` is not positive, if ``embeddings``
            is empty, or if the vectors do not stack into a 2-D array.
    """
    # Validate up front: the search below can take a very long time, so bad
    # arguments should fail before it starts, not after.
    if box_size <= 0:
        raise ValueError(f"box_size must be positive, got {box_size!r}")
    if k <= 0:
        raise ValueError(f"k must be positive, got {k!r}")
    if not embeddings:
        raise ValueError("embeddings must contain at least one vector")

    # FAISS works on C-contiguous float32 matrices; this is a no-op when the
    # stacked vectors already have that layout.
    embedding_vectors = np.ascontiguousarray(
        list(embeddings.values()), dtype=np.float32
    )
    if embedding_vectors.ndim != 2:
        raise ValueError("all embedding vectors must be 1-D and of equal length")

    # Initialize Faiss index and add the embeddings to it.
    embedding_dim = embedding_vectors.shape[1]
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(embedding_vectors)

    # Search for the nearest neighbors of all vectors. The neighbour indices
    # are not needed for the computation below.
    neighbor_distances, _ = index.search(embedding_vectors, k)

    # similarity = 1 / (1 + distance), computed in place on the distance
    # buffer so no additional n-by-k temporaries are allocated.
    similarities = neighbor_distances
    similarities += 1.0
    np.reciprocal(similarities, out=similarities)

    print(similarities)

    # Rows are grouped into consecutive chunks of `box_size`; the last chunk
    # may be shorter, hence the ceiling division.
    num_rows = embedding_vectors.shape[0]
    num_boxes = -(-num_rows // box_size)

    # Every similarity above the threshold counts once, regardless of which
    # chunk its row belongs to, so a single vectorized count is equivalent to
    # scanning the chunks one by one.
    num_filled_boxes = int(np.count_nonzero(similarities > 0.5))

    print(num_filled_boxes, num_boxes)
    fractal_dimension = np.log(num_filled_boxes) / np.log(num_boxes)

    return fractal_dimension

In [73]:
%%time
# Positional arguments: box_size=10 rows per box, k=1000 nearest neighbours
# retrieved for every vector.
fractal_dim = calculate_fractal_value(normalized_embeddings,10,1000)
print("Fractal Dimension:", fractal_dim)

[[1.         0.7011524  0.685969   ... 0.5567718  0.55668056 0.55666715]
 [0.99999976 0.7343499  0.7195218  ... 0.5137563  0.51374036 0.513722  ]
 [1.         0.5822523  0.57866114 ... 0.50839394 0.5083907  0.50836855]
 ...
 [0.99999976 0.7501614  0.69768    ... 0.5790898  0.5790858  0.5790703 ]
 [0.99999976 0.57441515 0.5616058  ... 0.501563   0.50154746 0.50149417]
 [0.99999976 0.60877454 0.5941156  ... 0.5035223  0.5035097  0.50345963]]
778697431 100000
Fractal Dimension: 1.7782737483835949
CPU times: user 8h 48min 28s, sys: 27min 34s, total: 9h 16min 2s
Wall time: 3h 51min 58s
