In [1]:
import gzip
import gensim 
import logging

# Configure the root logger: INFO level and above, each record rendered as
# "<timestamp> : <level name> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
conda install -c conda-forge fasttext

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.0



## Package Plan ##

  environment location: /Users/akallaku/anaconda3

  added / updated specs:
    - fasttext


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2023.5.7   |       h8857fd0_0         145 KB  conda-forge
    certifi-2023.5.7           |     pyhd8ed1ab_0         149 KB  conda-forge
    fasttext-0.9.2             |  py310ha23aa8a_5         445 KB  conda-forge
    openssl-1.1.1u             |       h8a1eda9_0         1.7 MB  conda-forge
    pybind11-2.10.4            |  py310ha23aa8a_0         176 KB  conda-forge
    pybind11-global-2.10.4     

In [3]:
!pip install gdown

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
Successfully installed gdown-4.7.1


In [4]:
# Download the FastText model file if it doesn't exist
import os
import shutil
import gdown

if not os.path.exists('cc.en.300.bin'):
    url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz'
    output = 'cc.en.300.bin.gz'
    gdown.download(url, output, quiet=False)

    # Decompress in fixed-size chunks instead of f_in.read(): the compressed
    # archive alone is several gigabytes, so materialising the decompressed
    # bytes in memory can exhaust RAM.
    # Write to a temporary name and rename at the end so that an interrupted
    # run never leaves a truncated 'cc.en.300.bin' that the existence check
    # above would mistake for a complete model.
    partial_output = 'cc.en.300.bin.part'
    with gzip.open(output, 'rb') as f_in, open(partial_output, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial_output, 'cc.en.300.bin')

Downloading...
From: https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
To: /Users/akallaku/Downloads/cc.en.300.bin.gz
100%|██████████████████████████████████████| 4.50G/4.50G [01:42<00:00, 44.1MB/s]


In [1]:
import fasttext.util

# Path of the decompressed FastText binary produced by the download cell.
model_path = 'cc.en.300.bin'
model = fasttext.load_model(model_path)

# Input matrix of the model; its rows are paired positionally with
# model.get_words() below.
# NOTE(review): assumes row i of the matrix belongs to the i-th word returned
# by get_words() -- confirm against the fastText documentation.
embeddings = model.get_input_matrix()

documents = []

# Map each vocabulary word to its row. zip() stops at the shorter sequence,
# so any matrix rows beyond the vocabulary length are ignored.
word_embeddings = dict(zip(model.get_words(), embeddings))



In [2]:
# Number of words for which an embedding was collected.
print(len(word_embeddings))

2000000


In [29]:
import random

def pick_random_pairs(dictionary, num_pairs, seed=None):
    """Return a new dict holding a random subset of ``dictionary``'s items.

    All keys are shuffled and the first ``num_pairs`` of them are kept, so the
    result's iteration order is the shuffled order. Values are not copied; the
    returned dict references the same value objects as ``dictionary``.

    Args:
        dictionary: Mapping to sample from. It is not modified.
        num_pairs: Number of items to keep. If it exceeds ``len(dictionary)``
            every item is returned.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the selection is reproducible and the module-level RNG
            state is left untouched. When ``None`` (default) the module-level
            ``random`` generator is used, as before.

    Returns:
        dict with at most ``num_pairs`` key/value pairs.

    Raises:
        ValueError: If ``num_pairs`` is negative. (A negative value would
            otherwise silently slice ``keys[:-n]`` and return almost the whole
            dictionary.)
    """
    if num_pairs < 0:
        raise ValueError("num_pairs must be non-negative, got %r" % (num_pairs,))

    keys = list(dictionary.keys())
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(keys)
    return {key: dictionary[key] for key in keys[:num_pairs]}

# Seed the module-level RNG first so the same 1,000,000-word sample is drawn
# every time this cell runs; pick_random_pairs shuffles with `random`.
random.seed(42)
random_embeddings = pick_random_pairs(word_embeddings, 1000000)

In [30]:
# Sanity check: size of the random sample.
print(len(random_embeddings))

1000000


In [31]:
import numpy as np
from sklearn.preprocessing import normalize

def normalize_word_embeddings(word_embeddings):
    """Normalize every vector in ``word_embeddings`` and store it back in place.

    The vectors are stacked into one 2-D array, passed through
    ``sklearn.preprocessing.normalize`` with its default arguments (L2 row
    normalization per the scikit-learn documentation), and each row is written
    back under its original key.

    Note:
        The input dict is MUTATED and the very same object is returned; no
        copy of the mapping is made. Keep a separate copy beforehand if the
        un-normalized vectors are still needed.

    Args:
        word_embeddings: dict mapping a word to its vector. All vectors are
            expected to have the same length so they can be stacked.

    Returns:
        The same dict object, now holding the normalized vectors. An empty
        dict is returned unchanged.
    """
    # An empty dict would become a 1-D array of shape (0,), which is not the
    # 2-D input `normalize` works on; there is nothing to do anyway.
    if not word_embeddings:
        return word_embeddings

    # Extract the word vectors and store them in a numpy array
    embeddings = np.array(list(word_embeddings.values()))

    # Normalize the word embeddings
    normalized_embeddings = normalize(embeddings)

    # Write each row back under its key. Only values of existing keys are
    # replaced, so the dict's size and key order are unaffected while iterating.
    for word, normalized_vector in zip(word_embeddings.keys(), normalized_embeddings):
        word_embeddings[word] = normalized_vector

    return word_embeddings

In [32]:
# Call the function to get normalized word embeddings.
# normalize_word_embeddings writes the normalized rows back into the dict it
# receives and returns that same dict, so after this call `normalized_embeddings`
# and `random_embeddings` are two names for one (now normalized) object.
normalized_embeddings = normalize_word_embeddings(random_embeddings)

In [7]:
conda install -c conda-forge faiss

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [33]:
import numpy as np
import faiss

'''
num_clusters = 256  
quantizer = faiss.IndexFlatL2(embedding_vectors.shape[1])
index = faiss.IndexIVFFlat(quantizer, embedding_vectors.shape[1], num_clusters)
index.train(embedding_vectors)
index.add(embedding_vectors)
_, similar_indices = index.search(embedding_vectors, num_similar)

'''

def calculate_fractal_value(embeddings, box_size, k, threshold=0.5):
    """Compute a per-vector "fractal dimension" score from nearest-neighbour similarities.

    For every vector the ``k`` nearest neighbours (the vector itself included,
    since the index is queried with its own contents) are found with an exact
    Faiss ``IndexFlatL2`` search. Each returned distance ``d`` is turned into a
    similarity ``1 / (1 + d)``. The score for a vector is then::

        log(number of its neighbours with similarity > threshold)
        ---------------------------------------------------------
              log(number of box_size-wide chunks covering k)

    NOTE(review): the numerator counts individual neighbours above the
    threshold, not chunks ("boxes") that contain at least one such neighbour,
    so the score can exceed 1. This matches the previous implementation and is
    kept as is; confirm it is the intended definition.

    Args:
        embeddings: dict mapping a word to its vector. All vectors must share
            one dimensionality. Values are converted to a contiguous float32
            array, the input type Faiss works with.
        box_size: Width, in neighbours, of one chunk of the similarity row.
            Must be >= 1.
        k: Number of nearest neighbours to retrieve per vector. Must be >= 1.
        threshold: Similarity a neighbour must strictly exceed to be counted.
            Defaults to 0.5, the value previously hard-coded.

    Returns:
        list of ``numpy.float64``, one per entry of ``embeddings`` in the
        dict's iteration order. An empty ``embeddings`` yields ``[]``.
        If no neighbour passes the threshold the score is ``-inf``; if
        ``k <= box_size`` there is a single chunk, ``log(1) == 0`` and the
        score is ``inf``/``nan`` (NumPy emits a RuntimeWarning in both cases).

    Raises:
        ValueError: If ``box_size`` or ``k`` is smaller than 1.
    """
    if box_size < 1:
        raise ValueError("box_size must be a positive integer, got %r" % (box_size,))
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    if not embeddings:
        return []

    embedding_vectors = np.ascontiguousarray(list(embeddings.values()), dtype=np.float32)

    # Exact (brute-force) L2 index over all vectors.
    index = faiss.IndexFlatL2(embedding_vectors.shape[1])
    index.add(embedding_vectors)

    # Only the distances are needed; the neighbour ids are discarded.
    # (Faiss documents IndexFlatL2 distances as squared Euclidean distances.)
    neighbor_distances, _ = index.search(embedding_vectors, k)

    # Distance -> similarity in (0, 1], computed for the whole matrix at once
    # instead of element by element in Python.
    similarities = 1.0 / (1.0 + neighbor_distances)

    print(similarities)

    # Number of box_size-wide chunks needed to cover one row (last one may be short).
    num_boxes = len(range(0, similarities.shape[1], box_size))

    # Per-row count of neighbours whose similarity exceeds the threshold.
    filled_counts = np.count_nonzero(similarities > threshold, axis=1)

    fractal_dimensions = np.log(filled_counts) / np.log(num_boxes)
    return list(fractal_dimensions)

In [34]:
%%time
fractal_dimensions = calculate_fractal_value(normalized_embeddings,10,1000)
print("Fractal Dimension:", fractal_dimensions)

[[1.         0.55930865 0.5567445  ... 0.49860162 0.49857584 0.49856958]
 [1.         0.60488784 0.597303   ... 0.49126327 0.4912624  0.49124846]
 [0.9999995  0.67742634 0.66604745 ... 0.54839164 0.5483559  0.5483506 ]
 ...
 [1.         0.6356718  0.58729595 ... 0.47331423 0.4732174  0.4732162 ]
 [1.         0.8936764  0.86792403 ... 0.55056363 0.5505435  0.5505313 ]
 [1.         0.71451384 0.6806422  ... 0.55842215 0.55840087 0.5583906 ]]


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [35]:
# Persist the scores as plain text, one value per line, in computation order.
with open('Fractal_dimension_polysemous_words_1M_1000_10', 'w') as out_file:
    out_file.writelines(str(value) + '\n' for value in fractal_dimensions)

In [37]:
# Count the lines written. The context manager closes the file handle, and
# iterating the file avoids loading every line into memory at once.
with open('Fractal_dimension_polysemous_words_1M_1000_10', 'r') as result_file:
    print(sum(1 for _ in result_file))

1000000
