In [None]:
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting jedi>=0.16 (from ipython->ipython-autotime)
  Downloading jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl (7.0 kB)
Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, ipython-autotime
Successfully installed ipython-autotime-0.3.2 jedi-0.19.1
time: 459 µs (started: 2024-11-10 17:40:07 +00:00)


In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0
time: 17.5 s (started: 2024-11-10 17:40:07 +00:00)


In [None]:
import numpy as np
import requests
import io

# URL of the Simple English word vectors
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec"

print("Downloading word vectors...")
# Bound the connect/read waits and fail on an HTTP error status right away;
# otherwise an error page would be parsed as if it were the vector file and
# surface as a confusing ValueError on the header line below.
response = requests.get(url, timeout=(10, 120))
response.raise_for_status()
content = io.StringIO(response.content.decode('utf-8'))

# The first line is a header: "<number of words> <vector dimension>".
num_words, vector_size = map(int, next(content).split())

# Every remaining line is "<word> <v1> <v2> ... <vN>" separated by single spaces.
# Vectors are stored as float32: that is the precision the later cells cast to
# before handing data to FAISS, and it halves the memory held by the dictionary.
word_embeddings = {}
for line in content:
    tokens = line.rstrip().split(' ')
    word = tokens[0]
    word_embeddings[word] = np.array(tokens[1:], dtype=np.float32)

print(f"Total number of words: {len(word_embeddings)}")
print(f"Dimension of each word vector: {vector_size}")

# Example: Print the first 5 words and their vector dimensions
for word in list(word_embeddings.keys())[:5]:
    print(f"{word}: {word_embeddings[word].shape}")

Downloading word vectors...
Total number of words: 111051
Dimension of each word vector: 300
</s>: (300,)
.: (300,)
,: (300,)
the: (300,)
of: (300,)
time: 19.5 s (started: 2024-11-10 17:40:24 +00:00)


In [None]:
# Cluster analysis for each word: the estimated fractal dimension is computed separately for every number of clusters considered.


import numpy as np
import faiss
import matplotlib.pyplot as plt
import requests
import io
from collections import defaultdict


# Report what the loading cell left in `word_embeddings` / `vector_size`.
print(f"Total number of words: {len(word_embeddings)}")
print(f"Dimension of each word vector: {vector_size}")

# Create FAISS index
print("Creating FAISS index...")
# Flat L2 index holding every vector; row i of the index is the i-th value of
# the dictionary in insertion order.
index = faiss.IndexFlatL2(vector_size)
index.add(np.stack(list(word_embeddings.values())))

def cluster_fractal_analysis(data, max_clusters, min_clusters=2, niter=300):
    """Run k-means for a range of k and derive size-based dimension estimates.

    For every k in ``min_clusters..max_clusters`` (inclusive) the data is
    clustered with ``faiss.Kmeans`` and each point is assigned to its nearest
    centroid. From the resulting cluster sizes ``s_i`` two quantities are
    computed:

    * per cluster: ``log(s_i) / log(k)``
    * overall:     ``log(sum_i s_i * log(s_i)) / log(k)``

    Args:
        data: 2-D array of shape (n_samples, n_features).
        max_clusters: Largest number of clusters to try.
        min_clusters: Smallest number of clusters to try.
        niter: Number of k-means iterations passed to ``faiss.Kmeans``.

    Returns:
        A list with one dict per k, with keys ``'num_clusters'``,
        ``'cluster_sizes'``, ``'cluster_fractal_dims'`` and
        ``'total_fractal_dim'``. When ``log(k)`` is zero (k == 1) both
        dimension entries are NaN.
    """
    n_samples, n_features = data.shape
    data = data.astype(np.float32)

    results = []

    for k in range(min_clusters, max_clusters + 1):
        kmeans = faiss.Kmeans(d=n_features, k=k, niter=niter, verbose=False)
        kmeans.train(data)
        # Nearest centroid for every point = its cluster label.
        _, labels = kmeans.index.search(data, 1)

        # np.unique only reports labels that actually occur, so every count is
        # at least 1 and no further filtering of empty clusters is needed.
        _, cluster_sizes = np.unique(labels, return_counts=True)

        log_k = np.log(k)
        log_sizes = np.log(cluster_sizes)

        if log_k != 0:
            cluster_fractal_dims = log_sizes / log_k
            total_dimension = np.log(np.sum(cluster_sizes * log_sizes)) / log_k
        else:
            # k == 1: the ratio is undefined. Guard the per-cluster values the
            # same way as the total instead of dividing by zero.
            cluster_fractal_dims = np.full(cluster_sizes.shape, np.nan)
            total_dimension = np.nan

        results.append({
            'num_clusters': k,
            'cluster_sizes': cluster_sizes,
            'cluster_fractal_dims': cluster_fractal_dims,
            'total_fractal_dim': total_dimension
        })

    return results

def cluster_fractal_dimension(data, max_clusters, min_clusters=2):
    """Return the overall cluster-size dimension estimate for each cluster count.

    This is the ``'total_fractal_dim'`` series of ``cluster_fractal_analysis``;
    the clustering and the formula live there so the two functions cannot
    drift apart.

    Args:
        data: 2-D array of shape (n_samples, n_features).
        max_clusters: Largest number of clusters to try.
        min_clusters: Smallest number of clusters to try.

    Returns:
        ``(cluster_counts, dimensions)`` where ``cluster_counts`` is
        ``range(min_clusters, max_clusters + 1)`` and ``dimensions`` is a list
        holding one estimate per entry of ``cluster_counts``.
    """
    cluster_counts = range(min_clusters, max_clusters + 1)
    analysis = cluster_fractal_analysis(data, max_clusters, min_clusters)
    dimensions = [entry['total_fractal_dim'] for entry in analysis]
    return cluster_counts, dimensions

def progressive_fractal_dimension(data, max_clusters, min_clusters=2):
    """Alias for ``cluster_fractal_dimension``.

    Forwards all three arguments unchanged and hands back the same
    ``(cluster_counts, dimensions)`` pair.
    """
    return cluster_fractal_dimension(data, max_clusters, min_clusters)

# List of words to analyze
words_to_analyze = [
    "Set", "Run", "Go", "Take", "Get", "Make", "Put", "Stand", "Hold", "Turn",
    "Break", "Fall", "Cut", "Pass", "Bring", "Do", "Catch", "Draw", "Leave", "Keep",
    "Pay", "Build", "Lead", "Raise", "Move", "Close", "Meet", "Reach", "Send", "Drive",
    "Throw", "Read", "Walk"
]

# Size of the neighbourhood gathered around each word, capped by the number of
# vectors in the index so the search never has to return more ids than exist.
n_neighbors = min(10000, index.ntotal)
max_clusters = min(50, n_neighbors // 2)

# Materialise the vocabulary once. Row i is the i-th dictionary value, the same
# order in which the vectors were added to `index`, so FAISS ids can be used
# directly as row numbers. (Rebuilding list(word_embeddings.values()) for every
# neighbour id made the gather quadratic.)
embedding_matrix = np.array(list(word_embeddings.values()), dtype=np.float32)

# cluster count -> list of dimension estimates, one per analysed word
fractal_dimensions = defaultdict(list)

for word in words_to_analyze:
    key = word.lower()
    if key not in word_embeddings:
        print(f"\nWord '{word}' not found in the embeddings.")
        continue

    print(f"\nAnalyzing word: {word}")

    # Find the closest words
    query_vector = word_embeddings[key].reshape(1, -1)
    _, indices = index.search(query_vector, n_neighbors)
    closest_words_embeddings = embedding_matrix[indices[0]]

    # One k-means sweep feeds both the plot and the printed table; the total
    # dimension stored in each result uses the same formula as
    # cluster_fractal_dimension, so a second sweep would only repeat the work.
    results = cluster_fractal_analysis(closest_words_embeddings, max_clusters)
    cluster_counts = [result['num_clusters'] for result in results]
    progressive_dims = [result['total_fractal_dim'] for result in results]

    # Store fractal dimensions for each cluster count
    for count, dim in zip(cluster_counts, progressive_dims):
        fractal_dimensions[count].append(dim)

    # Plot results
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(cluster_counts, progressive_dims, 'bo-')
    ax.set_xscale('log')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Estimated Fractal Dimension')
    ax.set_title(f'Progressive Fractal Dimension Analysis for "{word}"')
    ax.grid(True)

    final_dim = progressive_dims[-1]
    ax.axhline(y=final_dim, color='r', linestyle='--', label=f'Final Dimension: {final_dim:.2f}')
    ax.legend()

    fig.savefig(f'{word}_fractal_dimension.png')
    plt.close(fig)

    # Print results
    for result in results:
        print(f"\nNumber of clusters: {result['num_clusters']}")
        print(f"Total fractal dimension: {result['total_fractal_dim']:.4f}")

# Plot average fractal dimensions
if fractal_dimensions:
    sorted_counts = sorted(fractal_dimensions)
    avg_dims = [np.mean(fractal_dimensions[count]) for count in sorted_counts]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(sorted_counts, avg_dims, 'ro-')
    ax.set_xscale('log')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Average Estimated Fractal Dimension')
    ax.set_title('Average Progressive Fractal Dimension Analysis')
    ax.grid(True)

    final_avg_dim = avg_dims[-1]
    ax.axhline(y=final_avg_dim, color='b', linestyle='--', label=f'Final Average Dimension: {final_avg_dim:.2f}')
    ax.legend()

    fig.savefig('average_fractal_dimension.png')
    plt.close(fig)
else:
    # Nothing was analysed (no word was in the vocabulary): there is no
    # series to average, so skip the plot instead of indexing an empty list.
    print("\nNo words were analysed; average plot not written.")

Total number of words: 111051
Dimension of each word vector: 300
Creating FAISS index...

Analyzing word: Set

Number of clusters: 2
Total fractal dimension: 16.3781

Number of clusters: 3
Total fractal dimension: 10.2891

Number of clusters: 4
Total fractal dimension: 8.1302

Number of clusters: 5
Total fractal dimension: 6.9851

Number of clusters: 6
Total fractal dimension: 6.2603

Number of clusters: 7
Total fractal dimension: 5.7535

Number of clusters: 8
Total fractal dimension: 5.3763

Number of clusters: 9
Total fractal dimension: 5.0875

Number of clusters: 10
Total fractal dimension: 4.8706

Number of clusters: 11
Total fractal dimension: 4.6627

Number of clusters: 12
Total fractal dimension: 4.4849

Number of clusters: 13
Total fractal dimension: 4.3306

Number of clusters: 14
Total fractal dimension: 4.2286

Number of clusters: 15
Total fractal dimension: 4.0972

Number of clusters: 16
Total fractal dimension: 4.0128

Number of clusters: 17
Total fractal dimension: 3.9209


In [None]:
# Summarized fractal analysis: a single fractal-dimension value is calculated for each word in the list.

import numpy as np
import faiss
import matplotlib.pyplot as plt
import requests
import io

# Report what the loading cell left in `word_embeddings` / `vector_size`.
print(f"Total number of words: {len(word_embeddings)}")
print(f"Dimension of each word vector: {vector_size}")

# Create FAISS index
print("Creating FAISS index...")
# Rebuilds `index` from scratch: a flat L2 index over every vector, with rows
# in dictionary insertion order.
index = faiss.IndexFlatL2(vector_size)
index.add(np.stack(list(word_embeddings.values())))

def correlation_dimension_faiss(data, max_radius, num_radii=20, k=100):
    """Approximate the correlation sum C(r) of a point cloud from k-NN distances.

    C(r) is the fraction of ordered point pairs (i, j), i != j, whose distance
    is below r. Pairs are collected from the k nearest neighbours of every
    point *within ``data`` itself*: a temporary flat L2 index is built over
    ``data``, so points outside the cloud never contribute.

    Because only k neighbours per point are inspected, C(r) cannot exceed
    roughly (k - 1) / (n_samples - 1). Once r is larger than the typical
    k-th neighbour distance the curve flattens, and that plateau says nothing
    about the geometry of the data.

    Args:
        data: 2-D array of shape (n_samples, n_features), n_samples >= 2.
        max_radius: Largest radius evaluated.
        num_radii: Number of radii, log-spaced from 0.1 to ``max_radius``.
        k: Neighbours retrieved per point (capped at n_samples).

    Returns:
        ``(radii, correlation_sum)``, two 1-D arrays of length ``num_radii``.
        ``correlation_sum[i]`` is exactly 0 when no pair lies within
        ``radii[i]``; no artificial floor is applied, so callers that take
        logarithms must drop the zero entries.

    Raises:
        ValueError: If ``data`` holds fewer than two samples.
    """
    n_samples, n_features = data.shape
    if n_samples < 2:
        raise ValueError("correlation sum needs at least two samples")
    data = np.ascontiguousarray(data, dtype=np.float32)

    radii = np.logspace(-1, np.log10(max_radius), num_radii)

    # Index the cloud itself so neighbours come from `data`, not from whatever
    # the notebook-level `index` happens to contain.
    local_index = faiss.IndexFlatL2(n_features)
    local_index.add(data)
    distances, neighbor_ids = local_index.search(data, min(k, n_samples))

    # Keep genuine pairs only: drop each point's match with itself (by id, so
    # duplicates of a point still count) and any unfilled result slot (id -1).
    own_ids = np.arange(n_samples).reshape(-1, 1)
    is_pair = (neighbor_ids != own_ids) & (neighbor_ids >= 0)
    pair_sq_dists = np.sort(distances[is_pair].astype(np.float64))

    # The index reports squared L2 distances, so compare against r**2.
    # side='left' counts the entries strictly below each threshold.
    pair_counts = np.searchsorted(pair_sq_dists, radii ** 2, side='left')
    correlation_sum = pair_counts / (n_samples * (n_samples - 1))

    return radii, correlation_sum

def estimate_fractal_dimension(radii, correlation_sum):
    """Fit a line to log C(r) versus log r and return its slope.

    Only radii whose correlation sum is strictly positive take part in the
    fit. With fewer than two such points no line can be fitted and NaN is
    returned.

    Args:
        radii: 1-D array of radii.
        correlation_sum: 1-D array of C(r) values, aligned with ``radii``.

    Returns:
        The least-squares slope (degree-1 ``np.polyfit``), or ``np.nan``.
    """
    positive = correlation_sum > 0
    log_r = np.log(radii[positive])

    if log_r.size < 2:
        return np.nan

    log_c = np.log(correlation_sum[positive])
    slope, _intercept = np.polyfit(log_r, log_c, deg=1)
    return slope

# List of words to analyze
words_to_analyze = [
    "Set", "Run", "Go", "Take", "Get", "Make", "Put", "Stand", "Hold", "Turn",
    "Break", "Fall", "Cut", "Pass", "Bring", "Do", "Catch", "Draw", "Leave", "Keep",
    "Pay", "Build", "Lead", "Raise", "Move", "Close", "Meet", "Reach", "Send", "Drive",
    "Throw", "Read", "Walk"
]

# Size of the neighbourhood gathered around each word, capped by the number of
# vectors in the index so the search never has to return more ids than exist.
n_neighbors = min(10000, index.ntotal)

# Materialise the vocabulary once. Row i is the i-th dictionary value, the same
# order in which the vectors were added to `index`, so FAISS ids can be used
# directly as row numbers. (Rebuilding list(word_embeddings.values()) for every
# neighbour id made the gather quadratic.)
embedding_matrix = np.array(list(word_embeddings.values()), dtype=np.float32)

# Upper end of the radius sweep. sqrt(vector_size) is the diagonal of a unit
# hypercube in that many dimensions.
# NOTE(review): nothing here shows the embeddings are confined to such a cube,
# so treat this as a heuristic scale rather than a true maximum distance.
max_radius = np.sqrt(vector_size)

fractal_dimensions = {}

for word in words_to_analyze:
    key = word.lower()
    if key not in word_embeddings:
        print(f"\nWord '{word}' not found in the embeddings.")
        continue

    print(f"\nAnalyzing word: {word}")

    # Find the closest words
    query_vector = word_embeddings[key].reshape(1, -1)
    _, indices = index.search(query_vector, n_neighbors)
    closest_words_embeddings = embedding_matrix[indices[0]]

    radii, correlation_sum = correlation_dimension_faiss(closest_words_embeddings, max_radius)

    fractal_dim = estimate_fractal_dimension(radii, correlation_sum)
    fractal_dimensions[word] = fractal_dim

    print(f"Estimated fractal dimension for '{word}': {fractal_dim:.4f}")

    # Plot results
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.loglog(radii, correlation_sum, 'bo-')
    ax.set_xlabel('Radius (r)')
    ax.set_ylabel('Correlation Sum C(r)')
    ax.set_title(f'Correlation Dimension Analysis for "{word}"')
    ax.grid(True)
    fig.savefig(f'{word}_correlation_dimension.png')
    plt.close(fig)

# Summarise only the estimates that are actual numbers: a NaN (too few points
# for a fit) would turn the mean into NaN and make the histogram fail.
finite_dims = [dim for dim in fractal_dimensions.values() if np.isfinite(dim)]

if finite_dims:
    # Calculate and print average fractal dimension
    avg_fractal_dim = np.mean(finite_dims)
    print(f"\nAverage Fractal Dimension: {avg_fractal_dim:.4f}")

    # Plot histogram of fractal dimensions
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(finite_dims, bins=10, edgecolor='black')
    ax.set_xlabel('Fractal Dimension')
    ax.set_ylabel('Frequency')
    ax.set_title('Distribution of Fractal Dimensions')
    ax.axvline(avg_fractal_dim, color='r', linestyle='dashed', linewidth=2, label=f'Mean: {avg_fractal_dim:.4f}')
    ax.legend()
    fig.savefig('fractal_dimension_distribution.png')
    plt.close(fig)
else:
    print("\nNo fractal dimension could be estimated; summary plot not written.")

Total number of words: 111051
Dimension of each word vector: 300
Creating FAISS index...

Analyzing word: Set
Estimated fractal dimension for 'Set': 4.8151

Analyzing word: Run
Estimated fractal dimension for 'Run': 4.8151

Analyzing word: Go
Estimated fractal dimension for 'Go': 4.8180

Analyzing word: Take
Estimated fractal dimension for 'Take': 4.8174

Analyzing word: Get
Estimated fractal dimension for 'Get': 4.8164

Analyzing word: Make
Estimated fractal dimension for 'Make': 4.8176

Analyzing word: Put
Estimated fractal dimension for 'Put': 4.8163

Analyzing word: Stand
Estimated fractal dimension for 'Stand': 4.8183

Analyzing word: Hold
Estimated fractal dimension for 'Hold': 4.8164

Analyzing word: Turn
Estimated fractal dimension for 'Turn': 4.8175

Analyzing word: Break
Estimated fractal dimension for 'Break': 4.8190

Analyzing word: Fall
Estimated fractal dimension for 'Fall': 4.8194

Analyzing word: Cut
Estimated fractal dimension for 'Cut': 4.8157

Analyzing word: Pass
Es

In [None]:
# List of words to analyze
words_to_analyze = ["mole", "curtsey", "burrow", "convect"
]

# Size of the neighbourhood gathered around each word, capped by the number of
# vectors in the index so the search never has to return more ids than exist.
n_neighbors = min(10000, index.ntotal)

# Materialise the vocabulary once. Row i is the i-th dictionary value, the same
# order in which the vectors were added to `index`, so FAISS ids can be used
# directly as row numbers. (Rebuilding list(word_embeddings.values()) for every
# neighbour id made the gather quadratic.)
embedding_matrix = np.array(list(word_embeddings.values()), dtype=np.float32)

# Upper end of the radius sweep. sqrt(vector_size) is the diagonal of a unit
# hypercube in that many dimensions.
# NOTE(review): nothing here shows the embeddings are confined to such a cube,
# so treat this as a heuristic scale rather than a true maximum distance.
max_radius = np.sqrt(vector_size)

fractal_dimensions = {}

for word in words_to_analyze:
    key = word.lower()
    if key not in word_embeddings:
        print(f"\nWord '{word}' not found in the embeddings.")
        continue

    print(f"\nAnalyzing word: {word}")

    # Find the closest words
    query_vector = word_embeddings[key].reshape(1, -1)
    _, indices = index.search(query_vector, n_neighbors)
    closest_words_embeddings = embedding_matrix[indices[0]]

    radii, correlation_sum = correlation_dimension_faiss(closest_words_embeddings, max_radius)

    fractal_dim = estimate_fractal_dimension(radii, correlation_sum)
    fractal_dimensions[word] = fractal_dim

    print(f"Estimated fractal dimension for '{word}': {fractal_dim:.4f}")

    # Plot results
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.loglog(radii, correlation_sum, 'bo-')
    ax.set_xlabel('Radius (r)')
    ax.set_ylabel('Correlation Sum C(r)')
    ax.set_title(f'Correlation Dimension Analysis for "{word}"')
    ax.grid(True)
    fig.savefig(f'{word}_correlation_dimension.png')
    plt.close(fig)

# Summarise only the estimates that are actual numbers: a NaN (too few points
# for a fit) would turn the mean into NaN and make the histogram fail.
finite_dims = [dim for dim in fractal_dimensions.values() if np.isfinite(dim)]

if finite_dims:
    # Calculate and print average fractal dimension
    avg_fractal_dim = np.mean(finite_dims)
    print(f"\nAverage Fractal Dimension: {avg_fractal_dim:.4f}")

    # Plot histogram of fractal dimensions
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(finite_dims, bins=10, edgecolor='black')
    ax.set_xlabel('Fractal Dimension')
    ax.set_ylabel('Frequency')
    ax.set_title('Distribution of Fractal Dimensions')
    ax.axvline(avg_fractal_dim, color='r', linestyle='dashed', linewidth=2, label=f'Mean: {avg_fractal_dim:.4f}')
    ax.legend()
    fig.savefig('fractal_dimension_distribution2.png')
    plt.close(fig)
else:
    print("\nNo fractal dimension could be estimated; summary plot not written.")


Analyzing word: mole
Estimated fractal dimension for 'mole': 4.8150

Analyzing word: curtsey
Estimated fractal dimension for 'curtsey': 4.8187

Analyzing word: burrow
Estimated fractal dimension for 'burrow': 4.8099

Word 'convect' not found in the embeddings.

Average Fractal Dimension: 4.8146
time: 2min 13s (started: 2024-11-10 21:00:42 +00:00)


In [None]:
# List the current working directory; the PNG figures saved by the cells above
# are written here because every savefig call uses a bare relative filename.
%ls

average_fractal_dimension.png        Make_correlation_dimension.png
Break_correlation_dimension.png      Make_fractal_dimension.png
Break_fractal_dimension.png          Meet_correlation_dimension.png
Bring_correlation_dimension.png      Meet_fractal_dimension.png
Bring_fractal_dimension.png          mole_correlation_dimension.png
Build_correlation_dimension.png      Move_correlation_dimension.png
Build_fractal_dimension.png          Move_fractal_dimension.png
burrow_correlation_dimension.png     Pass_correlation_dimension.png
Catch_correlation_dimension.png      Pass_fractal_dimension.png
Catch_fractal_dimension.png          Pay_correlation_dimension.png
Close_correlation_dimension.png      Pay_fractal_dimension.png
Close_fractal_dimension.png          Put_correlation_dimension.png
curtsey_correlation_dimension.png    Put_fractal_dimension.png
Cut_correlation_dimension.png        Raise_correlation_dimension.png
Cut_fractal_dimension.png            Raise_fractal_dimension.png
Do_correla