In [13]:
# CONVERT TXT EMBEDDINGS TO NUMPY ARRAYS

import numpy as np

def load_and_save_embeddings(file_path, output_file_name):
    embeddings = []
    
    with open(file_path, 'r') as f:
        for line in f:
            # Split the line into URL and embeddings
            split_line = line.strip().split(',')
            
            # The first element is the URL, and the rest are the embeddings
            url = split_line[0]
            embedding_values = split_line[1:]
            
            try:
                # Convert the rest of the line (after URL) into floats
                embedding = [float(value) for value in embedding_values]
                embeddings.append(embedding)
            except ValueError:
                print(f"Skipping invalid line: {line}")
                continue

    # Convert the list of embeddings into a NumPy array
    embeddings_np = np.array(embeddings)

    if embeddings_np.size == 0:
        print("No valid embeddings found.")
    else:
        print(f"Total embeddings processed: {embeddings_np.shape}")

    # Save the embeddings to a new .npy file
    np.save(output_file_name, embeddings_np)

    return embeddings_np

# Example usage
file_path = "genre_embeddings/one_useful_thing_substack_urls_embeddings.txt"  # Replace with your actual file path
output_file_name = "genre_embeddings/one_useful_thing_numpy_embeddings"  # Replace with your desired blog name

# Load embeddings, ignore URLs, and save as .npy
embeddings_np = load_and_save_embeddings(file_path, output_file_name)

# Verify by checking the shape of the saved embeddings
print(embeddings_np.shape)  # Should print the shape of the embeddings array


Total embeddings processed: (168, 1025)
(168, 1025)


In [14]:
# GET INFERRED TAGS THRU RELATED ESSAYS 

import numpy as np
from sklearn.neighbors import NearestNeighbors

# Load the embeddings
embeddings_np = np.load("genre_embeddings/one_useful_thing_numpy_embeddings.npy")  # Replace with your actual .npy file path

# Fit Nearest Neighbors model
num_neighbors = 5
neighbors_model = NearestNeighbors(n_neighbors=num_neighbors, metric='cosine')
neighbors_model.fit(embeddings_np)

# Propagate inferred tags based purely on neighbors (no pre-assigned tags)
def propagate_inferred_tags(embeddings_np):
    inferred_tags = {}
    
    for i in range(len(embeddings_np)):
        distances, indices = neighbors_model.kneighbors([embeddings_np[i]])
        
        # Infer tags based on neighbors' proximity (can be expanded with more logic later)
        similar_essays = indices[0][1:]  # Skip the essay itself (first neighbor is itself)
        
        # For now, let's assign a general "Related to Essays [X, Y, Z]" tag for similar essays
        inferred_tags[i] = f"Related to essays {list(similar_essays)}"
    
    return inferred_tags

# Get the inferred tags for all essays
inferred_tags = propagate_inferred_tags(embeddings_np)

# Output the results
for essay_idx, tag in inferred_tags.items():
    print(f"Essay {essay_idx}: Inferred Tag: {tag}")


Essay 0: Inferred Tag: Related to essays [1, 50, 32, 31]
Essay 1: Inferred Tag: Related to essays [32, 17, 50, 22]
Essay 2: Inferred Tag: Related to essays [20, 52, 140, 85]
Essay 3: Inferred Tag: Related to essays [83, 84, 107, 78]
Essay 4: Inferred Tag: Related to essays [98, 108, 143, 153]
Essay 5: Inferred Tag: Related to essays [46, 17, 45, 115]
Essay 6: Inferred Tag: Related to essays [85, 14, 23, 30]
Essay 7: Inferred Tag: Related to essays [30, 64, 46, 159]
Essay 8: Inferred Tag: Related to essays [9, 97, 37, 11]
Essay 9: Inferred Tag: Related to essays [11, 46, 38, 78]
Essay 10: Inferred Tag: Related to essays [11, 22, 46, 115]
Essay 11: Inferred Tag: Related to essays [9, 22, 38, 78]
Essay 12: Inferred Tag: Related to essays [13, 38, 9, 22]
Essay 13: Inferred Tag: Related to essays [9, 38, 74, 17]
Essay 14: Inferred Tag: Related to essays [15, 6, 1, 9]
Essay 15: Inferred Tag: Related to essays [62, 78, 64, 9]
Essay 16: Inferred Tag: Related to essays [79, 111, 17, 27]
Essay 1

In [15]:
# Example manual tags for 5 essays
manual_tags = {
    0: ["Claude 3.5 Sonnet", "inference compute", "AI scaling"],
    288: ["Biotech", "Innovation", "Healthcare"],
    289: ["Climate Change", "Policy", "Energy"],
    290: ["AI Fine-Tuning", "Tech Trends", "Innovation"],
    291: ["Space Exploration", "Astrophysics", "Engineering"]
}

# Print the manually assigned tags
for essay_idx, tags in manual_tags.items():
    print(f"Essay {essay_idx}: Tags - {', '.join(tags)}")

Essay 0: Tags - Claude 3.5 Sonnet, inference compute, AI scaling
Essay 288: Tags - Biotech, Innovation, Healthcare
Essay 289: Tags - Climate Change, Policy, Energy
Essay 290: Tags - AI Fine-Tuning, Tech Trends, Innovation
Essay 291: Tags - Space Exploration, Astrophysics, Engineering


In [16]:
import numpy as np

# Load the embeddings from the .npy file
embeddings_np = np.load("genre_embeddings/your_embeddings.npy")  # Replace with your actual .npy file path

# Function to load URLs from the .txt file (with both URLs and embeddings)
def load_urls(file_path):
    urls = []
    
    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('http'):  # Only capture the URL lines
                url = line.strip()
                urls.append(url)
    
    return urls

# Load the URLs from the .txt file
urls = load_urls("your_urls_and_embeddings.txt")  # Replace with your actual .txt file path

# Ensure the number of embeddings matches the number of URLs
if len(embeddings_np) != len(urls):
    print("Warning: The number of embeddings does not match the number of URLs!")
else:
    print(f"Loaded {len(urls)} URLs and {len(embeddings_np)} embeddings.")

# Now, each embedding in the NumPy array corresponds to a URL from the .txt file
for i, url in enumerate(urls):
    print(f"Embedding {i} corresponds to URL: {url}")


FileNotFoundError: [Errno 2] No such file or directory: 'genre_embeddings/your_embeddings.npy'

In [12]:
# CONVERT TXT EMBEDDINGS TO NUMPY ARRAYS AND MAP TO URLS

import numpy as np

# Function to load URLs and embeddings from the .txt file
def load_urls_and_embeddings(file_path):
    urls = []
    embeddings = []

    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('http'):  # URL line
                current_url = line.strip()  # Capture the current URL
                urls.append(current_url)
            else:
                # This is an embedding line, so process it
                embedding_str = line.strip().split(',')
                embedding = np.array([float(val) for val in embedding_str])
                embeddings.append(embedding)

    return urls, np.array(embeddings)

# Function to save the embeddings to a .npy file and URLs to a .txt file
def save_embeddings_and_urls(embeddings, urls, npy_file_path, url_file_path):
    # Save the embeddings to a .npy file
    np.save(npy_file_path, embeddings)
    print(f"Embeddings saved to {npy_file_path}")

    # Save the URLs to a .txt file
    with open(url_file_path, 'w') as f:
        for url in urls:
            f.write(url + '\n')
    print(f"URLs saved to {url_file_path}")

# Main function to transform .txt into .npy with URL mapping
def process_txt_to_npy(txt_file_path, npy_file_path, url_file_path):
    # Load the URLs and embeddings from the .txt file
    urls, embeddings = load_urls_and_embeddings(txt_file_path)

    # Ensure the number of URLs matches the number of embeddings
    if len(embeddings) != len(urls):
        print("Warning: The number of embeddings does not match the number of URLs!")
    else:
        print(f"Loaded {len(urls)} URLs and {len(embeddings)} embeddings.")

    # Save the embeddings and URLs
    save_embeddings_and_urls(embeddings, urls, npy_file_path, url_file_path)

# Example usage
txt_file_path = "genre_embeddings/predictive_text_substack_urls_embeddings.txt"  # Replace with your actual .txt file path
npy_file_path = "genre_embeddings/predictive_text_numpy_embeddings.npy"  # Output file for NumPy array
url_file_path = "genre_embeddings/predictive_text_urls.txt"        # Output file for the URLs

# Run the process
process_txt_to_npy(txt_file_path, npy_file_path, url_file_path)


Embeddings saved to genre_embeddings/predictive_text_numpy_embeddings.npy
URLs saved to genre_embeddings/predictive_text_urls.txt


In [None]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Load the embeddings from the .npy file
embeddings_np = np.load("genre_embeddings/predictive_text_numpy_embeddings.npy")  # Replace with your actual .npy file path

# Load the URLs from the .txt file
def load_urls(file_path):
    urls = []
    with open(file_path, 'r') as f:
        for line in f:
            urls.append(line.strip())  # Clean and add URL from the .txt file
    return urls

urls = load_urls("genre_embeddings/predictive_text_urls.txt")  # Replace with your actual .txt file path

# Fit the Nearest Neighbors model on the embeddings
num_neighbors = 5  # Adjust as needed
neighbors_model = NearestNeighbors(n_neighbors=num_neighbors, metric='cosine')
neighbors_model.fit(embeddings_np)

# Function to find similar essays for a given essay index and print the URLs
def find_similar_essays(essay_index, embeddings_np, urls):
    # Wrap the single embedding in double brackets to make it 2D
    distances, indices = neighbors_model.kneighbors([embeddings_np[essay_index]])  # Make 2D by adding []

    print(f"Top {num_neighbors} similar essays for essay at URL: {urls[essay_index]}")
    for i, idx in enumerate(indices[0]):
        if idx != essay_index:  # Skip the essay itself
            print(f"URL: {urls[idx]} with similarity score: {1 - distances[0][i]:.4f}")

# Example: Find similar essays for essay at index 0
find_similar_essays(0, embeddings_np, urls)

In [20]:
embeddings_np = np.load("genre_embeddings/predictive_text_numpy_embeddings.npy")
print(f"Embeddings shape: {embeddings_np.shape}")  # Should print the shape of the array, expecting (rows, columns)

Embeddings shape: (0,)


In [None]:
import numpy as np

# Function to load URLs and embeddings from the .txt file
def load_urls_and_embeddings(file_path):
    urls = []
    embeddings = []

    current_url = None  # Placeholder to track the current URL

    with open(file_path, 'r') as f:
        for line in f:
            # Debugging: print the line to see how it's structured
            print(f"Processing line: {line.strip()}")

            # Check if it's a URL line (starts with http or https)
            if line.startswith('http'):  # This will now capture both http and https
                current_url = line.strip()  # Capture the current URL
                print(f"Found URL: {current_url}")
            else:
                # This is an embedding line, process it only if a URL was captured before
                if current_url:
                    try:
                        embedding_str = line.strip().split(',')
                        embedding = np.array([float(val) for val in embedding_str])  # Convert each value to float
                        embeddings.append(embedding)
                        urls.append(current_url)  # Append the current URL for this chunk
                        print(f"Found embedding for URL {current_url}")
                    except ValueError:
                        print(f"Failed to parse embedding line: {line.strip()}")

    return urls, np.array(embeddings)

# Load the URLs and embeddings from the .txt file
txt_file_path = "genre_embeddings/predictive_text_substack_urls_embeddings.txt"  # Replace with your actual .txt file path
urls, embeddings = load_urls_and_embeddings(txt_file_path)

# Check if embeddings are properly loaded
print(f"Loaded {len(embeddings)} embeddings with shape {embeddings.shape}")
print(f"Loaded {len(urls)} URLs")

# Save the embeddings to a .npy file
npy_file_path = "genre_embeddings/predictive_text_numpy_embeddings.npy"  # Path to save NumPy array
np.save(npy_file_path, embeddings)
print(f"Embeddings saved to {npy_file_path}")

# Save the URLs to a separate .txt file (with repeated URLs if there are multiple embeddings per URL)
url_file_path = "genre_embeddings/predictive_text_numpy_urls.txt"  # Path to save URLs
with open(url_file_path, 'w') as f:
    for url in urls:
        f.write(url + '\n')
print(f"URLs saved to {url_file_path}")

In [None]:
import numpy as np

# Function to load URLs and embeddings from the .txt file
def load_urls_and_embeddings(file_path):
    urls = []
    embeddings = []
    current_url = None  # Placeholder to track the current URL

    with open(file_path, 'r') as f:
        for line in f:
            # Debugging: print the line to see how it's structured
            print(f"Processing line: {line.strip()}")

            # Check if it's a URL line (starts with http or https)
            if line.startswith('http'):  # This will now capture both http and https
                current_url = line.strip()  # Capture the current URL
                print(f"Found URL: {current_url}")
            else:
                # Attempt to process this line as an embedding line
                print(f"Processing potential embedding line: {line.strip()}")
                if current_url:
                    try:
                        embedding_str = line.strip().split(',')
                        embedding = np.array([float(val) for val in embedding_str])  # Convert each value to float
                        embeddings.append(embedding)
                        urls.append(current_url)  # Append the current URL for this chunk
                        print(f"Found embedding for URL {current_url}")
                    except ValueError:
                        print(f"Failed to parse embedding line: {line.strip()}")

    return urls, np.array(embeddings)

# Load the URLs and embeddings from the .txt file
txt_file_path = "genre_embeddings/predictive_text_substack_urls_embeddings.txt"  # Replace with your actual .txt file path
urls, embeddings = load_urls_and_embeddings(txt_file_path)

# Check if embeddings are properly loaded
print(f"Loaded {len(embeddings)} embeddings with shape {embeddings.shape}")
print(f"Loaded {len(urls)} URLs")

# Save the embeddings to a .npy file
npy_file_path = "genre_embeddings/predictive_text_numpy_embeddings.npy"  # Path to save NumPy array
np.save(npy_file_path, embeddings)
print(f"Embeddings saved to {npy_file_path}")

# Save the URLs to a separate .txt file (with repeated URLs if there are multiple embeddings per URL)
url_file_path = "genre_embeddings/predictive_text_numpy_urls.txt"  # Path to save URLs
with open(url_file_path, 'w') as f:
    for url in urls:
        f.write(url + '\n')
print(f"URLs saved to {url_file_path}")
