In [1]:
import pandas as pd
from itertools import combinations
from collections import Counter

print("Starting Jaccard Co-Occurrence Calculation (Full Dataset)...")

# --- 1. FILE PATHS & LOADING ---
file_metalog = "../data/metalog_bgcs_with_gcf_and_tax.tsv"
file_mgnify = "../data/mgnify_bgcs_with_gcf_and_tax.tsv"
# Only the sample identifier and the GCF assignment are read from each file.
cols_to_use = ['analysis_accession', 'gcf_id']

try:
    # The separator is the whitespace regex r'\s+', kept deliberately so the
    # parsing matches the earlier GLASSO run.
    # NOTE(review): r'\s+' splits on any run of whitespace, not only tabs, so a
    # value containing a space or an empty field would shift the columns of
    # that row. Presumably the files contain neither; confirm before switching
    # the separator to '\t'.
    print("Loading Metalog dataset...")
    df1 = pd.read_csv(file_metalog, sep=r'\s+', usecols=cols_to_use)
    print("Loading MGnify dataset...")
    df2 = pd.read_csv(file_mgnify, sep=r'\s+', usecols=cols_to_use)

    # Stack both sources into one long (sample, GCF) table.
    df = pd.concat([df1, df2], ignore_index=True)
    print(f"Datasets combined. Total rows: {len(df)}")

except FileNotFoundError:
    # Print a short hint, then re-raise so the notebook stops here instead of
    # continuing without data.
    print("ERROR: Files not found!")
    raise

# --- 2. STANDARDIZED CLEANING (Identical to GLASSO) ---

# Placeholder IDs that do not denote a real GCF.
noise_list = ["-1", "nan", "None", "", "unknown"]

# The three cleaning steps run as one chain so that every step yields a new
# frame. Assigning a column on the frame returned by dropna() (as done
# previously) is the pattern that triggers pandas' SettingWithCopyWarning.
df = (
    df
    # A. Remove rows without a GCF assignment.
    .dropna(subset=['gcf_id'])
    # B. Normalize GCF IDs: cast to str and strip a trailing ".0", which
    #    appears when numeric IDs were parsed as floats.
    .assign(
        gcf_id=lambda frame: frame['gcf_id'].astype(str).str.replace(r'\.0$', '', regex=True)
    )
    # C. Drop the placeholder IDs listed in noise_list.
    .loc[lambda frame: ~frame["gcf_id"].isin(noise_list)]
)

print(f"Cleaned data: {len(df)} rows remaining.")
print(f"Unique GCFs identified: {df['gcf_id'].nunique()}")  # Should be 73,520 now
# NOTE(review): an earlier comment expected 61,622 samples here, but the
# recorded run printed 62,378. Confirm which figure is correct.
print(f"Total unique samples: {df['analysis_accession'].nunique()}")

# --- 3. CALCULATE FREQUENCIES ---
# For every GCF, count the number of distinct samples it appears in.
gcf_freq_dict = df.groupby("gcf_id")["analysis_accession"].nunique().to_dict()
print(f"Individual frequencies calculated for {len(gcf_freq_dict)} GCFs.")

Starting Jaccard Co-Occurrence Calculation (Full Dataset)...
Loading Metalog dataset...
Loading MGnify dataset...
Datasets combined. Total rows: 9741492
Cleaned data: 9741406 rows remaining.
Unique GCFs identified: 73520
Total unique samples: 62378
Individual frequencies calculated for 73520 GCFs.


In [2]:
# CALCULATE CO-OCCURRENCE
# Collapse the long table into one set of distinct GCF IDs per sample
# ('analysis_accession'). Using a set means a GCF listed several times for the
# same sample contributes to each of its pairs only once.
gcfs_per_sample = df.groupby("analysis_accession")["gcf_id"].apply(set)

print("Counting pairs (this might take a moment)...")
pair_counts = Counter()

# Sorting each sample's GCFs gives every unordered pair one canonical
# (smaller, larger) key, so (a, b) and (b, a) are never counted separately.
# Counter.update() tallies the pairs straight from the iterator.
for gcfs in gcfs_per_sample:
    pair_counts.update(combinations(sorted(gcfs), 2))

print(f"Finished! Found {len(pair_counts)} unique pairs.")

# --- 4. OPTIMIZED FILTER, MAP & SAVE ---

print(f"Total pairs found (Raw): {len(pair_counts)}")

# Minimum number of shared samples a pair needs to be kept.
MIN_COOCCURRENCE = 20
print(f"Filtering pairs with count >= {MIN_COOCCURRENCE} (Directly) and mapping frequencies...")

# Column layout of the edge table. Passing it explicitly to the DataFrame
# constructor keeps the schema intact even when no pair passes the filter.
coocc_columns = ["gcf1", "gcf2", "count", "freq_gcf1", "freq_gcf2"]

# Filter while building the records and attach each GCF's per-sample frequency
# from gcf_freq_dict (0 if a GCF is missing from that dictionary).
# NOTE(review): only the raw count and the two frequencies are stored; a
# Jaccard index can be derived later as
# count / (freq_gcf1 + freq_gcf2 - count).
filtered_data = [
    {
        "gcf1": g1,
        "gcf2": g2,
        "count": count,
        "freq_gcf1": gcf_freq_dict.get(g1, 0),
        "freq_gcf2": gcf_freq_dict.get(g2, 0),
    }
    for (g1, g2), count in pair_counts.items()
    if count >= MIN_COOCCURRENCE
]

print(f"Pairs remaining: {len(filtered_data)}")

# After filtering, the table is small compared with the raw pair counter.
df_coocc = pd.DataFrame(filtered_data, columns=coocc_columns)

output_file = "../data/coocc_gcfs_counts.csv"

if not df_coocc.empty:
    print("Sorting and saving...")
    # Order edges by (gcf1, gcf2) for a stable, readable file.
    df_coocc_sorted = df_coocc.sort_values(by=["gcf1", "gcf2"]).reset_index(drop=True)

    # The saved file contains the columns count, freq_gcf1 and freq_gcf2.
    df_coocc_sorted.to_csv(output_file, index=False)
    print(f"Filtered co-occurrence list saved to: {output_file}")

    display(df_coocc_sorted.head(10))
else:
    # Define the name in this branch too. Otherwise later cells either fail
    # with a NameError on a fresh kernel or silently reuse a stale frame left
    # over from an earlier run. Nothing is written to disk in this case.
    df_coocc_sorted = df_coocc
    print("WARNUNG: Keine Paare übrig geblieben! Ist der Filter zu streng?")

Counting pairs (this might take a moment)...
Finished! Found 68595539 unique pairs.
Total pairs found (Raw): 68595539
Filtering pairs with count >= 20 (Directly) and mapping frequencies...
Pairs remaining: 4660923
Sorting and saving...
Filtered co-occurrence list saved to: ../data/coocc_gcfs_counts.csv


Unnamed: 0,gcf1,gcf2,count,freq_gcf1,freq_gcf2
0,GCF_0001802a3e,GCF_000a3131e5,33,133,2453
1,GCF_0001802a3e,GCF_007735497f,87,133,680
2,GCF_0001802a3e,GCF_008315ab33,22,133,65
3,GCF_0001802a3e,GCF_00d9b2e84a,23,133,96
4,GCF_0001802a3e,GCF_01b54c94cf,30,133,78
5,GCF_0001802a3e,GCF_01d561c0d0,93,133,1270
6,GCF_0001802a3e,GCF_02745c1542,45,133,3310
7,GCF_0001802a3e,GCF_03c148dd83,26,133,2559
8,GCF_0001802a3e,GCF_03e5bf380f,32,133,179
9,GCF_0001802a3e,GCF_04091ae23b,26,133,175


In [3]:
# FILTER FOR VISUALIZATION (Top-N Nodes)

print("Filtering edges for visualization preview...")

# 1. Rank nodes by the number of retained edges they take part in.
# Both endpoint columns are flattened into one list (gcf1 entries first, then
# gcf2), so a GCF's tally equals the number of edges touching it.
top_n = 50
all_gcfs = [*df_coocc_sorted["gcf1"].tolist(), *df_coocc_sorted["gcf2"].tolist()]
gcf_counts = Counter(all_gcfs)

# Keep the IDs of the top_n GCFs with the highest edge tally.
top_gcfs = [gcf for gcf, _ in gcf_counts.most_common(top_n)]
print(f"Identified Top {top_n} most connected GCFs.")

# 2. Pre-filter: an edge survives if at least one endpoint is a top node.
gcf1_is_top = df_coocc_sorted["gcf1"].isin(top_gcfs)
gcf2_is_top = df_coocc_sorted["gcf2"].isin(top_gcfs)
df_top_nodes = df_coocc_sorted[gcf1_is_top | gcf2_is_top]

# 3. Cap the neighbourhood of every top node (to avoid hairballs).
top_k = 10
edges_list = []

for gcf in top_gcfs:
    # All edges in which this top node is either endpoint.
    touches_gcf = (df_top_nodes["gcf1"] == gcf) | (df_top_nodes["gcf2"] == gcf)
    gcf_edges = df_top_nodes[touches_gcf]
    # Of those, keep the top_k edges with the largest co-occurrence count.
    strongest_edges = gcf_edges.sort_values(by="count", ascending=False).head(top_k)
    edges_list.append(strongest_edges)

# An edge linking two top nodes can be selected once per endpoint, so stack
# the per-node selections and drop the repeated rows.
stacked_edges = pd.concat(edges_list)
df_top_edges_limited = stacked_edges.drop_duplicates().reset_index(drop=True)

# 4. Write the reduced edge list to disk.
output_limited = "../data/coocc_gcfs_top_edges_limited.csv"
df_top_edges_limited.to_csv(output_limited, index=False)

print(f"Limited edge list saved to: {output_limited}")
display(df_top_edges_limited.head())


Filtering edges for visualization preview...
Identified Top 50 most connected GCFs.
Limited edge list saved to: ../data/coocc_gcfs_top_edges_limited.csv


Unnamed: 0,gcf1,gcf2,count,freq_gcf1,freq_gcf2
0,GCF_3d3b9d6d75,GCF_af8dca7f7e,24901,35886,31830
1,GCF_af8dca7f7e,GCF_c007ec0dcc,21360,31830,28318
2,GCF_af8dca7f7e,GCF_b9481eeef2,19899,31830,26179
3,GCF_af8dca7f7e,GCF_c5715ad659,18105,31830,21341
4,GCF_af8dca7f7e,GCF_bcc44c660b,17148,31830,22910
