In [2]:
import pandas as pd
import glob
import os
import json

def analyze_latest_debug_file():
    """Print a health report for the newest CSV dump in ``debug_data/``.

    The report covers three checks on the selected file:

    1. every ``vector_length`` value equals 768,
    2. the largest ``payload_size_approx`` stays below a 4 MiB limit,
    3. no ``full_vector_json`` string contains ``NaN`` or ``Infinity``.

    The function only prints; it returns ``None`` on every path. It returns
    early (after printing a message) when no CSV exists, the file cannot be
    read, a column needed by the checks is missing, or the file has no rows.
    """
    # 1. Find the latest CSV file in debug_data/
    list_of_files = glob.glob('debug_data/*.csv')
    if not list_of_files:
        print("‚ùå No CSV files found in 'debug_data/' directory.")
        return

    # NOTE: os.path.getctime is the creation time on Windows but the time of
    # the last metadata change on Unix, so "latest" is platform dependent.
    latest_file = max(list_of_files, key=os.path.getctime)
    print(f"üìÇ Analyzing file: {latest_file}")

    # 2. Load into Pandas
    try:
        df = pd.read_csv(latest_file)
    except Exception as e:
        print(f"‚ùå Failed to read CSV: {e}")
        return

    # Fail with a readable message instead of a KeyError further down.
    required_columns = ('vector_length', 'payload_size_approx', 'full_vector_json')
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"‚ùå Missing expected column(s): {missing_columns}")
        return

    # A header-only file would otherwise crash on unique_lengths[0] / idxmax().
    if df.empty:
        print("‚ö†Ô∏è  WARNING: File contains no rows; nothing to analyze.")
        return

    # 3. Analyze Vector Lengths (Should be exactly 768)
    print("\n--- 1. Vector Dimensions ---")
    unique_lengths = df['vector_length'].unique()
    print(f"Unique Vector Lengths: {unique_lengths}")
    if len(unique_lengths) > 1 or unique_lengths[0] != 768:
        print("‚ö†Ô∏è  WARNING: Found inconsistent or incorrect vector dimensions!")
    else:
        print("‚úÖ Dimensions look correct.")

    # 4. Analyze Payload Size (The most likely culprit)
    print("\n--- 2. Payload Size (Bytes) ---")
    max_size = df['payload_size_approx'].max()
    avg_size = df['payload_size_approx'].mean()

    print(f"Max Payload Size: {max_size:,.0f} bytes ({max_size/1024/1024:.2f} MB)")
    print(f"Avg Payload Size: {avg_size:,.0f} bytes")

    # gRPC default limit is often 4MB (4,194,304 bytes)
    GRPC_LIMIT = 4 * 1024 * 1024
    if max_size > GRPC_LIMIT:
        print(f"‚ùå CRITICAL: Max payload ({max_size/1024/1024:.2f} MB) exceeds standard gRPC limit (4MB)!")
        print("   -> SOLUTION: You must strip the 'content' field from the Qdrant payload.")

        # Show the biggest offender; 'link' is display-only, so tolerate its absence.
        largest_row = df.loc[df['payload_size_approx'].idxmax()]
        biggest_link = largest_row['link'] if 'link' in df.columns else '<no link column>'
        print(f"   -> Biggest Article: {biggest_link}")
    else:
        print("‚úÖ Payload sizes are within safe limits.")

    # 5. Check for NaNs in vectors
    print("\n--- 3. Vector Integrity ---")
    # Cheap textual check on the serialized vector; the JSON is not parsed.
    # astype(str) renders missing cells as "nan", so those rows are flagged too.
    has_bad_value = df['full_vector_json'].astype(str).str.contains("NaN|Infinity", case=False)
    nan_vectors = df[has_bad_value]

    if not nan_vectors.empty:
        print(f"‚ùå CRITICAL: Found {len(nan_vectors)} rows with NaN or Infinity in vectors!")
        # Identify offenders by whichever identifying columns the file provides.
        id_columns = [col for col in ('point_id', 'link') if col in df.columns]
        print(nan_vectors[id_columns] if id_columns else nan_vectors.index.tolist())
    else:
        print("‚úÖ No NaNs or Infinity values detected in vector strings.")

# Run the analysis when this cell/script is executed directly rather than imported.
if __name__ == "__main__":
    analyze_latest_debug_file()

üìÇ Analyzing file: debug_data/batch_20251222_233947_659ccc.csv

--- 1. Vector Dimensions ---
Unique Vector Lengths: [768]
‚úÖ Dimensions look correct.

--- 2. Payload Size (Bytes) ---
Max Payload Size: 78,612 bytes (0.07 MB)
Avg Payload Size: 11,158 bytes
‚úÖ Payload sizes are within safe limits.

--- 3. Vector Integrity ---
‚úÖ No NaNs or Infinity values detected in vector strings.


In [3]:
import pandas as pd
import glob
import os

# Quick look at the newest debug CSV: column summary plus the first rows.

# 1. Find the latest CSV file in 'debug_data/'
list_of_files = glob.glob('debug_data/*.csv')

if not list_of_files:
    print("‚ùå No CSV files found in 'debug_data/' directory.")
else:
    # Pick the file with the greatest ctime (creation time on Windows,
    # last metadata change on Unix).
    latest_file = max(list_of_files, key=os.path.getctime)
    print(f"üìÇ Reading file: {latest_file}\n")

    # 2. Load into DataFrame
    df = pd.read_csv(latest_file)

    # 3. Show Info (Column names & types)
    print("--- DataFrame Info ---")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() appended a stray "None" line to the output.
    df.info()
    print("\n" + "="*80 + "\n")

    # 4. Show Head (First 5 rows)
    print("--- First 5 Rows (Head) ---")
    pd.set_option('display.max_columns', None)  # Ensure all columns are visible
    pd.set_option('display.width', 1000)        # Prevent wrapping
    print(df.head())

üìÇ Reading file: debug_data/batch_20251222_233947_659ccc.csv

--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   point_id             300 non-null    object
 1   link                 300 non-null    object
 2   vector_length        300 non-null    int64 
 3   vector_sample_start  300 non-null    object
 4   payload_size_approx  300 non-null    int64 
 5   full_vector_json     300 non-null    object
dtypes: int64(2), object(4)
memory usage: 14.2+ KB
None


--- First 5 Rows (Head) ---
                               point_id                                               link  vector_length                                vector_sample_start  payload_size_approx                                   full_vector_json
0  ea194bf9-047b-588d-9093-8aa1317a95c1  https://www.isna.ir/news/1403090905889/€åÿßÿØ-ÿ®ÿπÿ∂...      

In [None]:
 IRNA            | completed | 121460
 ISNA            | completed | 225201
 Shargh          | completed | 109371