In [1]:
import gzip
import gensim 
import logging

# Configure logging once for the notebook: INFO and above, each record shown
# as "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
conda install -c conda-forge fasttext
#use (!pip install fasttext) with collab

Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install gdown



In [4]:
# Download the FastText model file if it doesn't exist
import os
import shutil
import gdown
if not os.path.exists('cc.en.300.bin'):
    url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz'
    output = 'cc.en.300.bin.gz'
    gdown.download(url, output, quiet=False)

    # Decompress in chunks: the model is multi-gigabyte, so reading the whole
    # decompressed stream with f_in.read() would hold all of it in RAM at once.
    partial_path = 'cc.en.300.bin.part'
    with gzip.open(output, 'rb') as f_in, open(partial_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    # Publish under the final name only after a complete extraction, so an
    # interrupted run is not mistaken for a finished model by the check above.
    os.replace(partial_path, 'cc.en.300.bin')

In [5]:
import fasttext.util

model_path = 'cc.en.300.bin'
model = fasttext.load_model(model_path)

# Raw rows of the model's input matrix.
# NOTE(review): fastText's get_word_vector() also averages in subword n-gram
# rows, so these raw rows may differ from the vectors it returns -- confirm
# which of the two this analysis is meant to use.
embeddings = model.get_input_matrix()

documents = []

# Pair every vocabulary word with the matrix row at the same position
# (zip stops at the shorter of the two sequences).
word_embeddings = dict(zip(model.get_words(), embeddings))



In [6]:
# Number of words that received an embedding.
print(len(word_embeddings))

2000000


In [9]:
def pick_word_embeddings(non_polysemous, embeddings=None):
    """Select the embeddings of the requested words.

    Parameters
    ----------
    non_polysemous : iterable of str
        Words to look up. A word listed more than once yields a single entry.
    embeddings : mapping, optional
        Word -> vector lookup table. When omitted, the notebook-level
        ``word_embeddings`` dictionary is used, so existing one-argument
        calls behave exactly as before.

    Returns
    -------
    dict
        ``word -> vector`` for every requested word found in the lookup
        table, in first-seen order. Words without an embedding are skipped
        silently.
    """
    if embeddings is None:
        # Fall back to the notebook-wide table built when the model was loaded.
        embeddings = word_embeddings

    return {word: embeddings[word] for word in non_polysemous if word in embeddings}

In [10]:
# Words treated as non-polysemous for this experiment.
# NOTE(review): "feather" and "butterfly" each appear twice, so the 100 entries
# below contain only 98 distinct words.
non_polysemous = [
    "banana", "guitar", "elephant", "chair", "diamond", "piano", "lemon", "mountain", "book", "sun",
    "umbrella", "river", "butterfly", "tree", "carrot", "moon", "flower", "ocean", "computer", "lamp",
    "coffee", "bird", "bicycle", "cookie", "beach", "dog", "rainbow", "camera", "island", "hat",
    "turtle", "clock", "socks", "candle", "fire", "garden", "orange", "star", "bridge", "key",
    "castle", "shoe", "dolphin", "planet", "spoon", "feather", "butter", "rocket", "pillow", "chocolate",
    "honey", "volcano", "whale", "moonlight", "wallet", "pineapple", "flag", "fountain", "tiger", "map",
    "sweater", "music", "airplane", "globe", "painting", "toothbrush", "helicopter", "snail", "statue", "cupcake",
    "seashell", "peacock", "drum", "cloud", "cactus", "feather", "balloon", "kangaroo", "moonshine", "mailbox",
    "raincoat", "pinecone", "lighthouse", "tornado", "volleyball", "seagull", "whistle", "accordion", "tadpole", "giraffe",
    "typewriter", "caterpillar", "chimney", "waffle", "suitcase", "butterfly", "dragonfly", "toothpaste", "saxophone", "doorknob"
]

# Keep only the listed words that have an entry in `word_embeddings`.
non_polysemous_embeddings = pick_word_embeddings(non_polysemous)

In [27]:
# How many distinct words were actually found.
len(non_polysemous_embeddings)

98

In [15]:
import numpy as np
from sklearn.preprocessing import normalize

def normalize_word_embeddings(word_embeddings):
    """Return a new dictionary whose vectors have been rescaled by ``normalize``.

    The vectors are stacked into one matrix (one row per word), passed through
    scikit-learn's ``normalize`` with its default settings, and mapped back to
    their words.

    Unlike the previous version, the input dictionary is left untouched, so
    re-running the calling cell always starts from the same un-normalized
    data.

    Parameters
    ----------
    word_embeddings : dict
        ``word -> vector``; all vectors must have the same length.

    Returns
    -------
    dict
        ``word -> normalized vector`` in the same key order as the input.
        An empty input yields an empty dictionary.
    """
    # An empty dict would stack into a 1-D array, which normalize() rejects.
    if not word_embeddings:
        return {}

    words = list(word_embeddings)
    matrix = np.array([word_embeddings[word] for word in words])

    normalized_matrix = normalize(matrix)

    # Row i of the normalized matrix belongs to words[i].
    return dict(zip(words, normalized_matrix))

In [16]:
# Normalize the selected word vectors; the fractal computation further down
# works on `normalized_embeddings`.
normalized_embeddings = normalize_word_embeddings(non_polysemous_embeddings)

In [17]:
conda install -c conda-forge faiss
#use (!pip install faiss-gpu) with collab

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [30]:
import numpy as np
import faiss

# Disabled experiment, kept as a bare string literal so it has no effect when
# the cell runs: a neighbour search through a 256-cluster IndexIVFFlat.
# NOTE(review): it refers to `embedding_vectors` and `num_similar`, neither of
# which is defined at notebook level here -- it cannot be re-enabled as is.
'''
num_clusters = 256  
quantizer = faiss.IndexFlatL2(embedding_vectors.shape[1])
index = faiss.IndexIVFFlat(quantizer, embedding_vectors.shape[1], num_clusters)
index.train(embedding_vectors)
index.add(embedding_vectors)
_, similar_indices = index.search(embedding_vectors, num_similar)

'''

def calculate_fractal_value(embeddings, box_size, k, threshold=0.5):
    """Compute a box-counting style score for every embedding.

    Each vector is searched against an exact Faiss L2 index built from all
    vectors, so its ``k`` nearest neighbours normally include the vector
    itself. Every returned distance ``d`` is converted to a similarity
    ``1 / (1 + d)`` and the score of a vector is::

        log(number of similarities > threshold) / log(number of boxes)

    where the number of boxes is how many ``box_size``-wide chunks the ``k``
    neighbours split into.

    NOTE(review): the numerator counts individual similarity scores, not
    boxes, so the score can exceed 1 -- confirm this is the intended
    definition. Faiss documents IndexFlatL2 distances as *squared* L2.

    Parameters
    ----------
    embeddings : dict
        ``word -> vector``; all vectors must share one dimensionality.
    box_size : int
        Number of neighbours per box.
    k : int
        Number of nearest neighbours retrieved per vector.
    threshold : float, optional
        A similarity strictly above this value counts as "filled".
        Defaults to 0.5, the value that used to be hard-coded.

    Returns
    -------
    list of numpy.float64
        One score per embedding, in dictionary order. The value is non-finite
        (numpy emits a RuntimeWarning) when there is a single box or when no
        similarity exceeds the threshold. An empty input yields ``[]``.
    """
    if not embeddings:
        return []

    # Faiss only accepts C-contiguous float32 matrices.
    embedding_vectors = np.ascontiguousarray(list(embeddings.values()), dtype=np.float32)

    # Exact (brute-force) L2 index over all vectors.
    index = faiss.IndexFlatL2(embedding_vectors.shape[1])
    index.add(embedding_vectors)

    # Nearest neighbours of every vector; the neighbour indices are not needed.
    neighbor_distances, _ = index.search(embedding_vectors, k)

    # Distance -> similarity in (0, 1], computed for the whole matrix at once.
    similarities = 1.0 / (1.0 + neighbor_distances)

    print(similarities)

    # Same count the chunked loop produced: ceil(k / box_size) boxes per vector.
    num_boxes = len(range(0, similarities.shape[1], box_size))

    # Per vector, how many neighbour similarities clear the threshold.
    num_filled = np.count_nonzero(similarities > threshold, axis=1)

    fractal_dimensions = np.log(num_filled) / np.log(num_boxes)
    return list(fractal_dimensions)

In [31]:
%%time
fractal_dimensions = calculate_fractal_value(normalized_embeddings,5,10)

[[0.9999999  0.60603386 0.5211427  0.50815773 0.50703686 0.49982756
  0.4698076  0.46579275 0.4645673  0.46301308]
 [1.         0.6581317  0.65242666 0.56353515 0.52065456 0.4974467
  0.45414054 0.43591365 0.4229159  0.4202017 ]
 [1.         0.6208895  0.57823974 0.52924216 0.52513665 0.4989824
  0.47959894 0.47592697 0.4739656  0.4606309 ]
 [0.99999976 0.47525698 0.47057778 0.4590307  0.44728136 0.4374813
  0.42956063 0.4291568  0.4261069  0.42476687]
 [1.         0.44931605 0.4448735  0.43613252 0.43193793 0.4314114
  0.4256567  0.42453325 0.41904935 0.41862467]
 [1.         0.6581356  0.6581317  0.58750373 0.54582864 0.47780707
  0.47352046 0.443944   0.42881244 0.4279795 ]
 [1.         0.55838406 0.53358525 0.49982756 0.49786347 0.48422334
  0.46267027 0.45796233 0.4527005  0.4475706 ]
 [0.99999976 0.5043197  0.49549836 0.47671738 0.46288696 0.45786965
  0.4500409  0.44903585 0.44875526 0.44683307]
 [1.         0.43504474 0.4242664  0.42384756 0.42142937 0.41945818
  0.41491842 0.4

In [32]:
# Show the score of every word, in the order of `normalized_embeddings`.
print(f"Fractal Dimension: {fractal_dimensions}")

Fractal Dimension: [2.321928094887362, 2.321928094887362, 2.321928094887362, 0.0, 0.0, 2.321928094887362, 1.5849625007211563, 1.0, 0.0, 1.5849625007211563, 1.0, 1.5849625007211563, 2.321928094887362, 1.0, 1.0, 1.5849625007211563, 2.584962500721156, 2.807354922057604, 0.0, 1.0, 1.0, 2.807354922057604, 0.0, 1.0, 2.584962500721156, 0.0, 0.0, 0.0, 2.0, 1.5849625007211563, 3.3219280948873626, 0.0, 1.5849625007211563, 1.0, 0.0, 1.0, 1.5849625007211563, 0.0, 1.0, 0.0, 1.0, 1.0, 3.0, 1.0, 0.0, 1.5849625007211563, 1.5849625007211563, 0.0, 0.0, 2.321928094887362, 1.0, 1.0, 2.807354922057604, 1.5849625007211563, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 2.0, 2.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.5849625007211563, 1.0, 1.5849625007211563, 2.584962500721156, 2.0, 0.0, 0.0, 1.5849625007211563, 0.0, 0.0, 0.0, 0.0, 2.321928094887362, 2.584962500721156, 2.0, 0.0, 1.0, 2.584962500721156, 0.0, 2.0, 2.321928094887362, 2.584962500721156, 0.0, 2.0, 0.0, 0.0, 1.5849625007211563, 3.3219280948873626, 1.0, 2.321928094887362, 1.0

In [33]:
# Persist the scores as plain text, one value per line.
with open('Fractal_dimension_non_polysemous_words_100_5_10', 'w') as f:
    f.writelines(str(value) + '\n' for value in fractal_dimensions)

In [34]:
# Sanity check: count the lines that were written. The context manager closes
# the file handle, which the bare open(...).readlines() never did.
with open('Fractal_dimension_non_polysemous_words_100_5_10', 'r') as f:
    print(sum(1 for _ in f))

98
