In [1]:
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
conda install -c conda-forge fasttext
#use (!pip install fasttext) with collab

Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install gdown



In [4]:
# Download the FastText model file if it doesn't exist
import os
import gdown
if not os.path.exists('cc.en.300.bin'):
        url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz'
        output = 'cc.en.300.bin.gz'
        gdown.download(url, output, quiet=False)

        with gzip.open(output, 'rb') as f_in:
            with open('cc.en.300.bin', 'wb') as f_out:
                f_out.write(f_in.read())

In [5]:
import fasttext.util

model_path = 'cc.en.300.bin'

model = fasttext.load_model(model_path)

# Get the word embeddings
embeddings = model.get_input_matrix()

# Create a dictionary to store the word embeddings
documents = []
word_embeddings = {}

# Populate the dictionary with word embeddings
for word, vector in zip(model.get_words(), embeddings):
    word_embeddings[word] = vector



In [6]:
print(len(word_embeddings.keys()))

2000000


In [9]:
def pick_word_embeddings(non_polysemous):
    non_polysemous_embeddings = {}
    for word in non_polysemous:
        if(word in word_embeddings):
            non_polysemous_embeddings[word] = word_embeddings[word]
    
    return non_polysemous_embeddings

In [10]:
non_polysemous = [
    "banana", "guitar", "elephant", "chair", "diamond", "piano", "lemon", "mountain", "book", "sun",
    "umbrella", "river", "butterfly", "tree", "carrot", "moon", "flower", "ocean", "computer", "lamp",
    "coffee", "bird", "bicycle", "cookie", "beach", "dog", "rainbow", "camera", "island", "hat",
    "turtle", "clock", "socks", "candle", "fire", "garden", "orange", "star", "bridge", "key",
    "castle", "shoe", "dolphin", "planet", "spoon", "feather", "butter", "rocket", "pillow", "chocolate",
    "honey", "volcano", "whale", "moonlight", "wallet", "pineapple", "flag", "fountain", "tiger", "map",
    "sweater", "music", "airplane", "globe", "painting", "toothbrush", "helicopter", "snail", "statue", "cupcake",
    "seashell", "peacock", "drum", "cloud", "cactus", "feather", "balloon", "kangaroo", "moonshine", "mailbox",
    "raincoat", "pinecone", "lighthouse", "tornado", "volleyball", "seagull", "whistle", "accordion", "tadpole", "giraffe",
    "typewriter", "caterpillar", "chimney", "waffle", "suitcase", "butterfly", "dragonfly", "toothpaste", "saxophone", "doorknob"
]

non_polysemous_embeddings = pick_word_embeddings(non_polysemous)

In [12]:
len(non_polysemous_embeddings.keys())

98

In [15]:
import numpy as np
from sklearn.preprocessing import normalize

def normalize_word_embeddings(word_embeddings):
    # Extract the word vectors and store them in a numpy array
    embeddings = np.array(list(word_embeddings.values()))

    # Normalize the word embeddings
    normalized_embeddings = normalize(embeddings)

    # Update the normalized embeddings back in the dictionary
    for i, word in enumerate(word_embeddings.keys()):
        word_embeddings[word] = normalized_embeddings[i]

    return word_embeddings

In [16]:
# Call the function to get normalized word embeddings
normalized_embeddings = normalize_word_embeddings(non_polysemous_embeddings)

In [17]:
conda install -c conda-forge faiss
#use (!pip install faiss-gpu) with collab

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [21]:
import numpy as np
import faiss

'''
num_clusters = 256  
quantizer = faiss.IndexFlatL2(embedding_vectors.shape[1])
index = faiss.IndexIVFFlat(quantizer, embedding_vectors.shape[1], num_clusters)
index.train(embedding_vectors)
index.add(embedding_vectors)
_, similar_indices = index.search(embedding_vectors, num_similar)

'''

def calculate_fractal_value(embeddings, box_size, k):
    
    embedding_vectors = np.array(list(embeddings.values()))
    
    # Initialize Faiss index
    embedding_dim = embedding_vectors.shape[1]
    index = faiss.IndexFlatL2(embedding_dim)

    # Add embeddings to the Faiss index
    index.add(embedding_vectors)

    # Search for the nearest neighbors of all vectors
    neighbor_distances , neighbor_indices = index.search(embedding_vectors, k)
    
    # Calculating similarity values using the neighbor_distances and updating it.
    for i in range(neighbor_distances.shape[0]):
        for j in range(neighbor_distances.shape[1]):
            neighbor_distances[i][j] = 1/(1+neighbor_distances[i][j])
    
    #print(neighbor_distances)

    #Resultant fractal dimension array
    fractal_dimensions = []

    for i in range(embedding_vectors.shape[0]):
        num_boxes = 0
        num_filled_boxes = 0

        # Iterate over the similarity scores of each vector in chunks of box_size
        for j in range(0,neighbor_distances.shape[1],box_size):
            box_scores = neighbor_distances[i,j:j+box_size]
            
            for score in box_scores:
                if(score>0.5):
                    num_filled_boxes += 1

            num_boxes+=1

        fractal_dimension = np.log(num_filled_boxes) / np.log(num_boxes)
        fractal_dimensions.append(fractal_dimension)

    return fractal_dimensions

In [22]:
%%time
fractal_dimensions = calculate_fractal_value(normalized_embeddings,10,10)
print("Fractal Dimension:", fractal_dimensions)

Fractal Dimension: [inf, inf, inf, nan, nan, inf, inf, inf, nan, inf, inf, inf, inf, inf, inf, inf, inf, inf, nan, inf, inf, inf, nan, inf, inf, nan, nan, nan, inf, inf, inf, nan, inf, inf, nan, inf, inf, nan, inf, nan, inf, inf, inf, inf, nan, inf, inf, nan, nan, inf, inf, inf, inf, inf, inf, inf, nan, inf, inf, nan, inf, inf, inf, inf, nan, inf, inf, inf, inf, inf, inf, inf, nan, nan, inf, nan, nan, nan, nan, inf, inf, inf, nan, inf, inf, nan, inf, inf, inf, nan, inf, nan, nan, inf, inf, inf, inf, inf]
CPU times: user 17 ms, sys: 4.5 ms, total: 21.4 ms
Wall time: 22.2 ms


  fractal_dimension = np.log(num_filled_boxes) / np.log(num_boxes)
  fractal_dimension = np.log(num_filled_boxes) / np.log(num_boxes)


In [23]:
with open('Fractal_dimension_non_polysemous_words_100_10_10', 'w') as f:
    for dimension in fractal_dimensions:
        f.write(str(dimension) + '\n')

In [26]:
print(len(open('Fractal_dimension_non_polysemous_words_100_10_10', 'r').readlines()))

98
