```shell
get_homologues.pl -d {} -G -t 0 -c -n 1
```

In [1]:
# Step up one directory level; the cells below use paths relative to the working
# directory (e.g. "data/gcfs/...", "metadata.tsv", "data/tables/...").
# NOTE(review): the move is relative, so running this cell again climbs another level —
# run it once per kernel session.
%cd ..

/home/anton/pancluster


---

# BGC table creation process

In [3]:
import pandas as pd

In [15]:
# Read the tab-separated clustering table: one row per BGC name with its family number.
# read_table uses a tab as its default delimiter.
clustering = pd.read_table(
    "data/gcfs/network_files/2024-11-09_23-31-54_auto/mix/mix_clustering_c0.30.tsv"
)
clustering

Unnamed: 0,#BGC Name,Family Number
0,BGC0002188,1999
1,f6_hybrid_scf7180000000072.region002,1999
2,f6_spades_c00019_NODE_19...region001,1999
3,BGC0002709,2470
4,f6_hybrid_scf7180000000083.region001,2470
...,...,...
11707,f6_hybrid_scf7180000000078.region001,14185
11708,f6_spades_c00015_NODE_15...region001,14185
11709,f6_spades_c00021_NODE_21...region001,14192
11710,f6_hybrid_scf7180000000069.region003,14193


In [16]:
# Keep only rows whose name does NOT start with "BGC" (e.g. BGC0002188 is removed);
# presumably those are reference entries rather than clusters from the analysed assemblies.
clustering = clustering.loc[
    lambda frame: ~frame["#BGC Name"].str.startswith("BGC")
].copy()

In [17]:
# The organism identifier is the first two underscore-separated tokens of the BGC name,
# e.g. "f6_hybrid_scf7180000000072.region002" -> "f6_hybrid".
bgc_names = clustering["#BGC Name"].str.split("_")
organism = bgc_names.str[0] + "_" + bgc_names.str[1]
clustering = clustering.assign(Organism=organism)

In [20]:
# The per-BGC name is no longer needed once the Organism column exists.
clustering = clustering.drop("#BGC Name", axis="columns")

In [21]:
# Display the table, now holding only "Family Number" and "Organism" per BGC row.
clustering

Unnamed: 0,Family Number,Organism
1,1999,f6_hybrid
2,1999,f6_spades
4,2470,f6_hybrid
5,2470,f6_spades
6,2502,GCA_000149955.2
...,...,...
11707,14185,f6_hybrid
11708,14185,f6_spades
11709,14192,f6_spades
11710,14193,f6_hybrid


In [23]:
# Count how often each (family, organism) pair occurs; reset_index turns the pair
# back into ordinary columns next to the resulting "count" column.
clustering = (
    clustering
    .value_counts(subset=["Family Number", "Organism"])
    .reset_index()
)

In [24]:
# Display the long-format counts: one row per (Family Number, Organism) pair.
clustering

Unnamed: 0,Family Number,Organism,count
0,2676,GCA_013347535.2,3
1,10210,GCA_001703255.1,2
2,5041,GCA_000222805.1,2
3,10210,GCA_001703345.1,2
4,10210,GCA_025216265.1,2
...,...,...,...
11646,6142,GCA_016166065.1,1
11647,6142,GCA_016166085.1,1
11648,6142,GCA_016166095.2,1
11649,6142,GCA_016166105.1,1


In [26]:
# Reshape the long counts into a wide matrix: one row per organism, one column per
# family number. Pairs that never occur come out of unstack as NaN and are set to 0
# (the values stay floating point, as before).
clustering = (
    clustering
    .set_index(["Organism", "Family Number"])["count"]
    .unstack("Family Number")
    .fillna(0)
)
clustering

Family Number,1999,2470,2502,2541,2542,2559,2587,2598,2676,2781,...,14155,14156,14162,14170,14179,14180,14184,14185,14192,14193
Organism,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCA_000149955.2,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_000222805.1,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_000259975.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_000260175.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_000260215.2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCA_038050555.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f11_hybrid,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f11_spades,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f6_hybrid,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0


In [40]:
# Load just the accession and organism-name columns from the tab-separated metadata file.
metadata = pd.read_table(
    "metadata.tsv",
    usecols=["Assembly Accession", "Organism Name"],
)
metadata

Unnamed: 0,Assembly Accession,Organism Name
0,GCA_001519035.1,Fusarium oxysporum f. sp. conglutinans
1,GCA_001519035.1,Fusarium oxysporum f. sp. conglutinans
2,GCA_001519035.1,Fusarium oxysporum f. sp. conglutinans
3,GCA_001519035.1,Fusarium oxysporum f. sp. conglutinans
4,GCA_001519035.1,Fusarium oxysporum f. sp. conglutinans
...,...,...
3653,GCA_038050555.1,Fusarium oxysporum f. sp. vasinfectum
3654,GCA_000260175.2,Fusarium oxysporum f. sp. vasinfectum 25433
3655,GCA_000260175.2,Fusarium oxysporum f. sp. vasinfectum 25433
3656,GCA_000260175.2,Fusarium oxysporum f. sp. vasinfectum 25433


In [41]:
# Reduce the full organism name to its forma specialis epithet, e.g.
# "Fusarium oxysporum f. sp. vasinfectum 25433" -> "vasinfectum".
# The epithet is located via the "f. sp." marker rather than by taking the fifth
# whitespace token, so a differently spaced marker ("f.sp.") still works and a name
# without any forma specialis becomes NaN instead of picking up an unrelated word
# (such as a strain identifier).
# NOTE: this overwrites "Organism Name" in place; re-run the metadata loading cell
# before running this cell a second time.
metadata["Organism Name"] = metadata["Organism Name"].str.extract(
    r"\bf\.\s*sp\.\s+(\S+)", expand=False
)
metadata

Unnamed: 0,Assembly Accession,Organism Name
0,GCA_001519035.1,conglutinans
1,GCA_001519035.1,conglutinans
2,GCA_001519035.1,conglutinans
3,GCA_001519035.1,conglutinans
4,GCA_001519035.1,conglutinans
...,...,...
3653,GCA_038050555.1,vasinfectum
3654,GCA_000260175.2,vasinfectum
3655,GCA_000260175.2,vasinfectum
3656,GCA_000260175.2,vasinfectum


In [44]:
# Collapse the repeated (accession, name) rows to one row each and index by accession.
# verify_integrity=True makes this cell raise immediately if an accession is still
# listed with more than one distinct name, instead of letting a duplicated index
# surface later when the table is aligned with the family-count matrix.
metadata = metadata.drop_duplicates().set_index(
    "Assembly Accession", verify_integrity=True
)
metadata

Unnamed: 0_level_0,Organism Name
Assembly Accession,Unnamed: 1_level_1
GCA_001519035.1,conglutinans
GCA_002711385.1,conglutinans
GCA_002711405.2,conglutinans
GCA_014154955.1,conglutinans
GCA_018894095.1,conglutinans
...,...
GCA_030719095.1,vasinfectum
GCA_032878545.1,vasinfectum
GCA_032991405.1,vasinfectum
GCA_038050555.1,vasinfectum


In [50]:
# Put the forma specialis label ("fsp") next to the family-count matrix, aligning rows
# on the index. Column-wise concat keeps the union of both indexes, so a row that exists
# on only one side gets NaN on the other (e.g. f6_hybrid has counts but no fsp).
gcf_table = pd.concat(
    [metadata.rename(columns={"Organism Name": "fsp"}), clustering],
    axis="columns",
)
gcf_table

Unnamed: 0,fsp,1999,2470,2502,2541,2542,2559,2587,2598,2676,...,14155,14156,14162,14170,14179,14180,14184,14185,14192,14193
GCA_001519035.1,conglutinans,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_002711385.1,conglutinans,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_002711405.2,conglutinans,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_014154955.1,conglutinans,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_018894095.1,conglutinans,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCA_000260175.2,vasinfectum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f11_hybrid,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f11_spades,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f6_hybrid,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0


In [51]:
# Persist the combined table; the row index is written as the first CSV column.
gcf_table.to_csv(path_or_buf="data/tables/gcf-table.csv")

---

# Model creation