In [1]:
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
#use (!pip install fasttext) with collab
conda install -c conda-forge fasttext

Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install gdown



In [4]:
# Download the FastText model file if it doesn't exist
import os
import gdown
if not os.path.exists('cc.en.300.bin'):
        url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz'
        output = 'cc.en.300.bin.gz'
        gdown.download(url, output, quiet=False)

        with gzip.open(output, 'rb') as f_in:
            with open('cc.en.300.bin', 'wb') as f_out:
                f_out.write(f_in.read())

In [3]:
import fasttext.util

model_path = 'cc.en.300.bin'

model = fasttext.load_model(model_path)

# Get the word embeddings
embeddings = model.get_input_matrix()

# Create a dictionary to store the word embeddings
documents = []
word_embeddings = {}

# Populate the dictionary with word embeddings
for word, vector in zip(model.get_words(), embeddings):
    word_embeddings[word] = vector



In [4]:
print(len(word_embeddings.keys()))

2000000


In [5]:
def pick_word_embeddings(non_polysemous):
    non_polysemous_embeddings = {}
    for word in non_polysemous:
        if(word in word_embeddings):
            non_polysemous_embeddings[word] = word_embeddings[word]
    
    return non_polysemous_embeddings

In [6]:
non_polysemous = [
    "banana", "guitar", "elephant", "chair", "diamond", "piano", "lemon", "mountain", "book", "sun",
    "umbrella", "river", "butterfly", "tree", "carrot", "moon", "flower", "ocean", "computer", "lamp",
    "coffee", "bird", "bicycle", "cookie", "beach", "dog", "rainbow", "camera", "island", "hat",
    "turtle", "clock", "socks", "candle", "fire", "garden", "orange", "star", "bridge", "key",
    "castle", "shoe", "dolphin", "planet", "spoon", "feather", "butter", "rocket", "pillow", "chocolate",
    "honey", "volcano", "whale", "moonlight", "wallet", "pineapple", "flag", "fountain", "tiger", "map",
    "sweater", "music", "airplane", "globe", "painting", "toothbrush", "helicopter", "snail", "statue", "cupcake",
    "seashell", "peacock", "drum", "cloud", "cactus", "feather", "balloon", "kangaroo", "moonshine", "mailbox",
    "raincoat", "pinecone", "lighthouse", "tornado", "volleyball", "seagull", "whistle", "accordion", "tadpole", "giraffe",
    "typewriter", "caterpillar", "chimney", "waffle", "suitcase", "butterfly", "dragonfly", "toothpaste", "saxophone", "doorknob"
]

non_polysemous_embeddings = pick_word_embeddings(non_polysemous)

In [7]:
len(non_polysemous_embeddings.keys())

98

In [8]:
import numpy as np
from sklearn.preprocessing import normalize

def normalize_word_embeddings(word_embeddings):
    # Extract the word vectors and store them in a numpy array
    embeddings = np.array(list(word_embeddings.values()))

    # Normalize the word embeddings
    normalized_embeddings = normalize(embeddings)

    # Update the normalized embeddings back in the dictionary
    for i, word in enumerate(word_embeddings.keys()):
        word_embeddings[word] = normalized_embeddings[i]

    return word_embeddings

In [9]:
# Call the function to get normalized word embeddings
normalized_embeddings = normalize_word_embeddings(non_polysemous_embeddings)

In [17]:
conda install -c conda-forge faiss
#use (!pip install faiss-gpu) with collab

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

def embedding_cosine_similarity(embedding1,embedding2):
    embedding1 = embedding1.reshape(1, -1)
    embedding2 = embedding2.reshape(1, -1)
    return cosine_similarity(embedding1,embedding2)

In [13]:
import numpy as np
import faiss

'''
num_clusters = 256  
quantizer = faiss.IndexFlatL2(embedding_vectors.shape[1])
index = faiss.IndexIVFFlat(quantizer, embedding_vectors.shape[1], num_clusters)
index.train(embedding_vectors)
index.add(embedding_vectors)
_, similar_indices = index.search(embedding_vectors, num_similar)

'''

def calculate_fractal_value(embeddings, box_size, k):
    
    embedding_vectors = np.array(list(embeddings.values()))
    
    # Initialize Faiss index
    embedding_dim = embedding_vectors.shape[1]
    index = faiss.IndexFlatL2(embedding_dim)

    # Add embeddings to the Faiss index
    index.add(embedding_vectors)

    # Search for the nearest neighbors of all vectors
    neighbor_distances , neighbor_indices = index.search(embedding_vectors, k)
    
    print(neighbor_indices)
    
    # Calculating cosine similarity values using the neighbor_distances and updating it.
    for i in range(neighbor_distances.shape[0]):
        for j in range(neighbor_distances.shape[1]):
            neighbor_distances[i][j] = embedding_cosine_similarity(embedding_vectors[i],embedding_vectors[neighbor_indices[i][j]])
    
    print(neighbor_distances)

    #Resultant fractal dimension array
    fractal_dimensions = []

    for i in range(embedding_vectors.shape[0]):
        num_boxes = 0
        num_filled_boxes = 0

        # Iterate over the similarity scores of each vector in chunks of box_size
        for j in range(0,neighbor_distances.shape[1],box_size):
            box_scores = neighbor_distances[i,j:j+box_size]
            
            for score in box_scores:
                if(score>0.5):
                    num_filled_boxes += 1

            num_boxes+=1

        fractal_dimension = np.log(num_filled_boxes) / np.log(num_boxes)
        fractal_dimensions.append(fractal_dimension)

    return fractal_dimensions

In [14]:
%%time
fractal_dimensions = calculate_fractal_value(normalized_embeddings,5,10)

[[ 0 55 14 36 49  6 20 46 69 50]
 [ 1  5 96 86 61 72 89 83 22 38]
 [ 2 88 58 52 42 30 76 70 71 25]
 [ 3 48 10 19 97 93 89 60 22 18]
 [ 4 37 70 16 55 45 15 41 69 94]
 [ 5 96  1 86 61 72 89 64 83 70]
 [ 6 36 55  0 14 50 46 74 49 69]
 [ 7 51 11 17 28 22 66 24 40 74]
 [ 8 62 89 59 48 93  3 60 64 20]
 [ 9 15 53 24 17 43 35 26 19 74]
 [10 79  3 29 26 93  9 73 48 19]
 [11 17 38  7 24 57 53 28 40 26]
 [12 94 16 90 21 80 71 87 45 30]
 [13 80 74 35 16 91  7 33 21 88]
 [14  0 55  6 36 74 69 76 49 23]
 [15  9 53 43 17 47 26 84 37 94]
 [16 74 12 35 80 94 70 69 55 71]
 [17 24 11 70 52 28 42 15  7 43]
 [18 89 62 27 61  3 97 31 20  5]
 [19 33 97  3 48 81 80 41 91 53]
 [20 49  0 69 92 55 50 77 46 33]
 [21 84 94 30 71 45 12 80 52 42]
 [22 62 41  7 79 93 89 65 66 35]
 [23 69 49 46 92 14 80 44  0 20]
 [24 17 28 70 81 83 11  9 84 42]
 [25 84 30 21  2 58 42 76 88 52]
 [26 36 15 71 94 12 16 11 73 10]
 [27 79 18 66 62 93 54 24 84 53]
 [28 81 24 17 51 40  7 43 11 55]
 [29 60 79 32 45 41 69 93 80 10]
 [30 42 67

In [16]:
print("Fractal Dimension:", fractal_dimensions)

Fractal Dimension: [2.321928094887362, 2.321928094887362, 2.321928094887362, 0.0, 0.0, 2.321928094887362, 1.5849625007211563, 1.0, 0.0, 1.5849625007211563, 1.0, 1.5849625007211563, 2.321928094887362, 1.0, 1.0, 1.5849625007211563, 2.584962500721156, 2.807354922057604, 0.0, 1.0, 1.0, 2.807354922057604, 0.0, 1.0, 2.584962500721156, 0.0, 0.0, 0.0, 2.0, 1.5849625007211563, 3.3219280948873626, 0.0, 1.5849625007211563, 1.0, 0.0, 1.0, 1.5849625007211563, 0.0, 1.0, 0.0, 1.0, 1.0, 3.0, 1.0, 0.0, 1.5849625007211563, 1.5849625007211563, 0.0, 0.0, 2.321928094887362, 1.0, 1.0, 2.807354922057604, 1.5849625007211563, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 2.0, 2.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.5849625007211563, 1.0, 1.5849625007211563, 2.584962500721156, 2.0, 0.0, 0.0, 1.5849625007211563, 0.0, 0.0, 0.0, 0.0, 2.321928094887362, 2.584962500721156, 2.0, 0.0, 1.0, 2.584962500721156, 0.0, 2.0, 2.321928094887362, 2.584962500721156, 0.0, 2.0, 0.0, 0.0, 1.5849625007211563, 3.3219280948873626, 1.0, 2.321928094887362, 1.0

In [17]:
with open('Fractal_dimension_non_polysemous_words_cosine_100_5_10', 'w') as f:
    for dimension in fractal_dimensions:
        f.write(str(dimension) + '\n')

In [19]:
print(len(open('Fractal_dimension_non_polysemous_words_cosine_100_5_10', 'r').readlines()))

98
