In [1]:
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
!pip install fasttext



In [3]:
pip install gdown

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Download the FastText model file if it doesn't exist
import os
import shutil
import gdown
if not os.path.exists('cc.en.300.bin'):
    url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz'
    output = 'cc.en.300.bin.gz'
    gdown.download(url, output, quiet=False)

    # Decompress in fixed-size chunks instead of f_in.read(): the archive is
    # several GB, so materialising the whole decompressed payload in memory
    # can exhaust RAM and kill the kernel.
    partial_output = 'cc.en.300.bin.part'
    with gzip.open(output, 'rb') as f_in, open(partial_output, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    # Rename only after a complete decompression, so an interrupted run never
    # leaves a truncated 'cc.en.300.bin' that the existence check above would
    # wrongly accept on the next run.
    os.replace(partial_output, 'cc.en.300.bin')

Downloading...
From: https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
To: /content/cc.en.300.bin.gz
100%|██████████| 4.50G/4.50G [00:26<00:00, 169MB/s]


In [1]:
import fasttext.util

# Load the binary fastText model from the working directory
model_path = 'cc.en.300.bin'
model = fasttext.load_model(model_path)

# Input matrix of the loaded model (indexable row by row)
embeddings = model.get_input_matrix()

# Placeholder list; nothing in this cell fills it
documents = []

# Map every vocabulary word to the matrix row at the same position.
# NOTE(review): zip() stops at the shorter of the two sequences, so this
# assumes row i of the input matrix belongs to word i of get_words() — confirm.
word_embeddings = dict(zip(model.get_words(), embeddings))



In [2]:
# Number of words that received an embedding
print(len(word_embeddings))

2000000


In [3]:
# Disabled (kept only as a string literal): the kernel crashes when normalizing the fastText word embeddings with the pure-Python loop below.
'''import fasttext.util
import gdown,gzip
import os
import math

def get_normalized_word_embeddings(word_embeddings):
    word_embedding_strings=[]
    
    for word in word_embeddings:
        vector = word_embeddings[word]
        squared_sum = sum(val ** 2 for val in vector)
        normalization_factor = math.sqrt(squared_sum)
        normalized_vector = [val / normalization_factor for val in vector]
        word_embeddings[word] = normalized_vector
        vector_str = ' '.join(str(val) for val in normalized_vector)
        word_embedding_strings.append(word+" "+vector_str)

    return word_embedding_strings

documents = get_normalized_word_embeddings(word_embeddings)'''

'import fasttext.util\nimport gdown,gzip\nimport os\nimport math\n\ndef get_normalized_word_embeddings(word_embeddings):\n    word_embedding_strings=[]\n    \n    for word in word_embeddings:\n        vector = word_embeddings[word]\n        squared_sum = sum(val ** 2 for val in vector)\n        normalization_factor = math.sqrt(squared_sum)\n        normalized_vector = [val / normalization_factor for val in vector]\n        word_embeddings[word] = normalized_vector\n        vector_str = \' \'.join(str(val) for val in normalized_vector)\n        word_embedding_strings.append(word+" "+vector_str)\n\n    return word_embedding_strings\n\ndocuments = get_normalized_word_embeddings(word_embeddings)'

In [4]:
'''import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopword = stopwords.words('english')

temp_embeddings = word_embeddings.copy()
for key in word_embeddings.keys():
    if key in stopword:
        del temp_embeddings[key]'''

"import nltk\nnltk.download('stopwords')\nfrom nltk.corpus import stopwords\nstopword = stopwords.words('english')\n\ntemp_embeddings = word_embeddings.copy()\nfor key in word_embeddings.keys():\n    if key in stopword:\n        del temp_embeddings[key]"

In [5]:
import random

def pick_random_pairs(dictionary, num_pairs, seed=None):
    """Return a new dict containing a random subset of ``dictionary``'s items.

    The keys are shuffled and the first ``num_pairs`` of them are kept, so the
    result has ``min(num_pairs, len(dictionary))`` entries when ``num_pairs``
    is non-negative. The input dictionary is not modified; the values are
    shared with it, not copied.

    Parameters
    ----------
    dictionary : dict
        Mapping to sample from.
    num_pairs : int
        Number of key/value pairs to keep.
    seed : int or None, optional
        When given, a private ``random.Random(seed)`` generator is used so the
        same subset is returned on every call. When ``None`` (the default) the
        module-level ``random`` generator is used, exactly as before, so
        existing callers that rely on ``random.seed(...)`` are unaffected.

    Returns
    -------
    dict
        The sampled key/value pairs, in shuffled order.
    """
    keys = list(dictionary.keys())
    # A dedicated generator keeps a seeded call from disturbing global RNG state.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(keys)
    return {key: dictionary[key] for key in keys[:num_pairs]}

random.seed(42)  # fixed seed so the same random subset is drawn on every Restart & Run All
random_embeddings = pick_random_pairs(word_embeddings, 100000)

In [6]:
# Number of words kept in the random sample
print(len(random_embeddings))

100000


In [7]:
import numpy as np
from sklearn.preprocessing import normalize

def normalize_word_embeddings(word_embeddings):
    """Normalize every vector stored in ``word_embeddings`` with sklearn's ``normalize``.

    The dictionary is updated IN PLACE: each value is replaced by the
    corresponding row of the normalized matrix, and the very same dictionary
    object is returned, so the caller's original mapping no longer holds the
    un-normalized vectors afterwards.

    Parameters
    ----------
    word_embeddings : dict
        Mapping from word to vector; all vectors are stacked into one array,
        so they are expected to have the same length.

    Returns
    -------
    dict
        ``word_embeddings`` itself, now holding the normalized rows.
    """
    # Freeze the key order once so rows and words stay aligned
    words = list(word_embeddings.keys())
    matrix = np.array([word_embeddings[word] for word in words])

    # Row-wise normalization with normalize()'s default settings
    unit_rows = normalize(matrix)

    # Write each normalized row back under its word
    for word, row in zip(words, unit_rows):
        word_embeddings[word] = row

    return word_embeddings

In [8]:
# Call the function to get normalized word embeddings.
# normalize_word_embeddings() overwrites the values of the dict it is given and
# returns that same dict, so `normalized_embeddings` and `random_embeddings`
# refer to one object after this line.
normalized_embeddings = normalize_word_embeddings(random_embeddings)

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

'''def embeddings_to_cosine_sim_matrix(E):
    dot = E @ E.t()
    norm = torch.norm(E, 2, 1)
    x = torch.div(dot, norm)
    x = torch.div(x, torch.unsqueeze(norm, 0))
    return x

def embeddings_to_cosine_sim_matrix(E, batch_size=50000):
    num_embeddings = E.shape[0]
    num_batches = (num_embeddings - 1) // batch_size + 1

    cosine_sim_matrices = []

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, num_embeddings)

        batch_embeddings = E[start_idx:end_idx]
        dot = torch.matmul(batch_embeddings, E.t())
        norm = torch.norm(E, 2, 1)
        x = torch.div(dot, norm[:, None])
        x = torch.div(x, norm[start_idx:end_idx].view(-1, 1))
        cosine_sim_matrices.append(x)

    return torch.cat(cosine_sim_matrices)'''

def embeddings_to_cosine_sim_matrix(E):
    """Return the pairwise cosine-similarity matrix of the rows of ``E``.

    Thin wrapper around ``sklearn.metrics.pairwise.cosine_similarity`` called
    with ``E`` as both arguments, so entry ``[i, j]`` compares row ``i`` with
    row ``j`` of the same input.
    """
    return cosine_similarity(E, E)

In [11]:
import torch

In [12]:
# Stack the normalized vectors into a single array, one row per word
word_embeddings_array = np.stack([vector for vector in normalized_embeddings.values()])

# Wrap the NumPy array as a torch tensor
word_embeddings_tensor = torch.from_numpy(word_embeddings_array)

# Pairwise cosine similarities between all sampled word vectors
cosine_sim_matrix = embeddings_to_cosine_sim_matrix(word_embeddings_tensor)

In [16]:
# if needed
# cosi = torch.load('./cosine')

In [13]:
# This is the modification based on the Paus paper. Essentially, I am finding the max
# number of boxes counted with each new box size where values are larger than 0 (because the
# vectors were normalized before computing the dot product) and smaller than the size of the
# area of the box. Once the max is found, the weights are added according to the paper where:
# <= 0 == 0
# >0 & <= max/3 == 1 * vector[index]
# > max/3 & <= max/2 == 2 * vector[index]
# > max/2 == 3 * vector[index]
#
# The result is then summed, and the log of that sum is plotted against the log of 1 over the
# side length of the box (1/sizes).
# Then, I used linear regression to calculate the r^2 value and plot the log-log correlation,
# and took the slope of the fitted line as the fractal dimension.

import numpy as np
import scipy
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

import statsmodels.formula.api as smf

def fractal_dimension_weighted(Z, threshold=0.9, plot=True):
    """Estimate a weighted box-counting dimension of a 2-D array.

    ``Z`` is binarised as ``Z < threshold``. For each box side length ``k``
    (powers of two, from the largest power of two not exceeding the smaller
    array dimension down to 4), the array is tiled into k-by-k boxes and the
    number of ``True`` cells in each box (its occupancy) is computed. Only
    boxes that are neither empty nor completely filled are kept. With ``m``
    the largest kept occupancy for that ``k``, each kept occupancy ``v``
    contributes:

    * ``1 * v`` if ``v <= m / 3``
    * ``2 * v`` if ``m / 3 < v <= m / 2``
    * ``3 * v`` if ``v > m / 2``

    The contributions are summed to give the weighted count for that ``k``.
    A straight line is fitted to ``log(count)`` against ``log(1 / k)``; its
    slope is returned and its r-squared is printed.

    NOTE(review): the weights are applied to box occupancies, as described by
    the comment that accompanies this function. The previous code applied them
    to the row indices returned by ``np.where``, which depend only on where a
    box sits in the matrix. Confirm this matches the referenced paper.

    Parameters
    ----------
    Z : array-like, 2-D
        Input values (e.g. a similarity matrix).
    threshold : float, optional
        Cells strictly below this value are treated as occupied.
    plot : bool, optional
        When True (default) the log-log curve is drawn with matplotlib.

    Returns
    -------
    float
        Slope of the fitted log-log line.

    Raises
    ------
    ValueError
        If ``Z`` is not 2-D, or if fewer than two box sizes produce a
        positive count (a line cannot be fitted).
    """

    def boxcount(Z, k):
        """Weighted sum of the occupancies of the partially filled k-by-k boxes."""
        row_edges = np.arange(0, Z.shape[0], k)
        col_edges = np.arange(0, Z.shape[1], k)
        # Occupancy (number of True cells) of every box
        S = np.add.reduceat(
            np.add.reduceat(Z, row_edges, axis=0, dtype=np.int64),
            col_edges, axis=1)

        # Real area of every box: boxes clipped by the array edge are smaller
        # than k*k and must be compared against their own area, otherwise a
        # completely filled edge box would be counted as partially filled.
        heights = np.diff(np.append(row_edges, Z.shape[0]))
        widths = np.diff(np.append(col_edges, Z.shape[1]))
        area = np.outer(heights, widths)

        # Occupancies of the non-empty, non-full boxes
        occupancy = S[(S > 0) & (S < area)]
        if occupancy.size == 0:
            return 0

        # Weight classes relative to the largest kept occupancy
        peak = occupancy.max()
        weights = np.where(occupancy <= peak / 3, 1,
                           np.where(occupancy <= peak / 2, 2, 3))
        return int((weights * occupancy).sum())

    Z = np.asarray(Z)
    if Z.ndim != 2:
        raise ValueError("fractal_dimension_weighted expects a 2-D array")

    # Transform Z utilizing the threshold
    Z = (Z < threshold)

    # Minimal dimension of matrix
    p = min(Z.shape)

    # Exponent of the greatest power of 2 less than or equal to p
    # (integer arithmetic avoids floating-point rounding in log(p)/log(2))
    n = int(p).bit_length() - 1

    # Build successive box sizes (from 2**n down to 2**2)
    sizes = 2 ** np.arange(n, 1, -1)

    # Actual box counting with decreasing size
    counts = np.array([boxcount(Z, size) for size in sizes])

    # log(0) is undefined, so sizes with a zero count are left out of the fit
    usable = counts > 0
    if np.count_nonzero(usable) < 2:
        raise ValueError(
            "need at least two box sizes with a positive count to fit a line")

    x = np.log(1 / sizes[usable])
    y = np.log(counts[usable])

    # Plot outputs: log(N(s)) against log(1/s)
    if plot:
        plt.plot(x, y, color="g")
        plt.xlabel('log(1/s)')
        plt.ylabel('log(N(s))')
        plt.show()

    # Linear fit: the slope is the dimension estimate, rvalue**2 the r^2
    fit = stats.linregress(x, y)

    print("r-squared:", fit.rvalue ** 2)

    return fit.slope

In [14]:
# This is the standard box counting method with no weights added: it counts only the boxes whose
# sum falls strictly between 0 and k^2, where k is the side length of the box.
def fractal_dimension(Z, threshold=0.9, plot=True):
    """Estimate the box-counting (Minkowski–Bouligand) dimension of a 2-D array.

    ``Z`` is binarised as ``Z < threshold``. For each box side length ``k``
    (powers of two, from the largest power of two not exceeding the smaller
    array dimension down to 4), the array is tiled into k-by-k boxes and the
    boxes that are neither empty nor completely filled are counted. A straight
    line is fitted to ``log(count)`` against ``log(1 / k)``; its slope is
    returned and its r-squared is printed.

    Parameters
    ----------
    Z : array-like, 2-D
        Input values (e.g. a similarity matrix).
    threshold : float, optional
        Cells strictly below this value are treated as occupied.
    plot : bool, optional
        When True (default) the log-log curve is drawn with matplotlib.

    Returns
    -------
    float
        Slope of the fitted log-log line.

    Raises
    ------
    ValueError
        If ``Z`` is not 2-D, or if fewer than two box sizes produce a
        positive count (a line cannot be fitted).
    """

    def boxcount(Z, k):
        """Number of k-by-k boxes that are neither empty nor completely filled."""
        row_edges = np.arange(0, Z.shape[0], k)
        col_edges = np.arange(0, Z.shape[1], k)
        # Occupancy (number of True cells) of every box
        S = np.add.reduceat(
            np.add.reduceat(Z, row_edges, axis=0, dtype=np.int64),
            col_edges, axis=1)

        # Real area of every box: boxes clipped by the array edge are smaller
        # than k*k and must be compared against their own area.
        heights = np.diff(np.append(row_edges, Z.shape[0]))
        widths = np.diff(np.append(col_edges, Z.shape[1]))
        area = np.outer(heights, widths)

        # Count the qualifying boxes. Summing np.where(...)[0] would add up
        # their row indices instead of counting them.
        return int(np.count_nonzero((S > 0) & (S < area)))

    Z = np.asarray(Z)
    if Z.ndim != 2:
        raise ValueError("fractal_dimension expects a 2-D array")

    # Transform Z into a binary array
    Z = (Z < threshold)

    # Minimal dimension of matrix
    p = min(Z.shape)

    # Exponent of the greatest power of 2 less than or equal to p
    # (integer arithmetic avoids floating-point rounding in log(p)/log(2))
    n = int(p).bit_length() - 1

    # Build successive box sizes (from 2**n down to 2**2)
    sizes = 2 ** np.arange(n, 1, -1)

    # Actual box counting with decreasing size
    counts = np.array([boxcount(Z, size) for size in sizes])

    # log(0) is undefined, so sizes with a zero count are left out of the fit
    usable = counts > 0
    if np.count_nonzero(usable) < 2:
        raise ValueError(
            "need at least two box sizes with a positive count to fit a line")

    x = np.log(1 / sizes[usable])
    y = np.log(counts[usable])

    # Plot outputs: log(N(s)) against log(1/s)
    if plot:
        plt.plot(x, y, color="g")
        plt.xlabel('log(1/s)')
        plt.ylabel('log(N(s))')
        plt.show()

    # Linear fit: the slope is the dimension estimate, rvalue**2 the r^2
    fit = stats.linregress(x, y)

    print("r-squared:", fit.rvalue ** 2)

    return fit.slope

In [None]:
%%time
# Short alias for the pairwise cosine-similarity matrix built in an earlier cell
I = cosine_sim_matrix

# Unweighted box-counting estimate (the function also plots and prints r-squared)
print("Minkowski–Bouligand dimension (computed): ", fractal_dimension(I))

# Weighted box-counting estimate on the same matrix
print("Minkowski–Bouligand dimension weighted(computed): ", fractal_dimension_weighted(I))