In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Input text file holding the tree's sample names; it is parsed further down
# with pandas using a single space as the field separator.
txt_file = '/home/ubuntu/wdir/PopGenStats_Project/data/longum_tree_sample_names.txt'

In [3]:
# Load the space-separated sample table and remove every literal '>' from the
# 'sample' column (regex=False treats '>' as a plain substring, not a pattern).
names = (
    pd.read_csv(txt_file, sep=' ')
    .assign(sample=lambda frame: frame['sample'].str.replace('>', '', regex=False))
)
names  # bare last expression -> rich display of the cleaned table

Unnamed: 0,sample
0,GCF_000196555.1_ASM19655v1_genomic.fna.ref
1,GCA_958434155.1_SRR22541675_bin.8_MetaWRAP_v1....
2,GCF_017743175.1_ASM1774317v1_genomic.fna
3,GCF_003465635.1_ASM346563v1_genomic.fna
4,GCF_015670335.1_ASM1567033v1_genomic.fna
...,...
645,GCF_003437135.1_ASM343713v1_genomic.fna
646,GCF_003466075.1_ASM346607v1_genomic.fna
647,GCF_004332645.1_ASM433264v1_genomic.fna
648,SPMP29_bin.81.fa


In [4]:
def categorize_sample(sample_name):
    """Map a sample name to its source-group label based on its prefix.

    Parameters
    ----------
    sample_name : str
        Sample / genome file name, e.g. ``'GCF_000196555.1_..._genomic.fna'``.

    Returns
    -------
    str
        ``'gtdb'`` for names starting with ``GCF`` or ``GCA``; otherwise the
        matching prefix itself: ``'SPMP'``, ``'flye.barcode'`` or ``'flye.SQK'``.

    Raises
    ------
    ValueError
        If ``sample_name`` is not a string (for example a NaN cell) or matches
        none of the known prefixes. The message includes the offending value
        so a failure inside ``Series.apply`` can be traced back to its row.
    """
    # Ordered (prefixes, label) pairs; the first matching entry wins, which
    # mirrors the precedence of the original if/elif chain.
    prefix_groups = (
        (('GCF', 'GCA'), 'gtdb'),
        (('SPMP',), 'SPMP'),
        (('flye.barcode',), 'flye.barcode'),
        (('flye.SQK',), 'flye.SQK'),
    )

    # Non-string values fall through to the ValueError below instead of
    # failing with an unrelated AttributeError on `.startswith`.
    if isinstance(sample_name, str):
        for prefixes, label in prefix_groups:
            if sample_name.startswith(prefixes):
                return label

    raise ValueError(f"Sample does not belong to a category: {sample_name!r}")

# Classify every sample name element-wise and store the label in a new
# 'group' column; any name the categoriser rejects aborts the cell.
sample_groups = names['sample'].map(categorize_sample)
names['group'] = sample_groups
names  # display the table with the added column

Unnamed: 0,sample,group
0,GCF_000196555.1_ASM19655v1_genomic.fna.ref,gtdb
1,GCA_958434155.1_SRR22541675_bin.8_MetaWRAP_v1....,gtdb
2,GCF_017743175.1_ASM1774317v1_genomic.fna,gtdb
3,GCF_003465635.1_ASM346563v1_genomic.fna,gtdb
4,GCF_015670335.1_ASM1567033v1_genomic.fna,gtdb
...,...,...
645,GCF_003437135.1_ASM343713v1_genomic.fna,gtdb
646,GCF_003466075.1_ASM346607v1_genomic.fna,gtdb
647,GCF_004332645.1_ASM433264v1_genomic.fna,gtdb
648,SPMP29_bin.81.fa,SPMP


In [5]:
# Distinct group labels, one palette entry per label.
groups = names['group'].unique()
palette = sns.color_palette(None, len(groups))

# Pair each group with a palette colour, scaling the 0-1 channel values to
# 0-255 and rendering them as an 'rgba(r,g,b,1.0)' string (fully opaque).
group_color_map = dict(zip(
    groups,
    (f'rgba({color[0]*255:.2f},{color[1]*255:.2f},{color[2]*255:.2f},1.0)' for color in palette),
))

# Attach each row's colour string by looking up its group label.
names['colour'] = names['group'].map(group_color_map)
names  # display the table with the added column

Unnamed: 0,sample,group,colour
0,GCF_000196555.1_ASM19655v1_genomic.fna.ref,gtdb,"rgba(31.00,119.00,180.00,1.0)"
1,GCA_958434155.1_SRR22541675_bin.8_MetaWRAP_v1....,gtdb,"rgba(31.00,119.00,180.00,1.0)"
2,GCF_017743175.1_ASM1774317v1_genomic.fna,gtdb,"rgba(31.00,119.00,180.00,1.0)"
3,GCF_003465635.1_ASM346563v1_genomic.fna,gtdb,"rgba(31.00,119.00,180.00,1.0)"
4,GCF_015670335.1_ASM1567033v1_genomic.fna,gtdb,"rgba(31.00,119.00,180.00,1.0)"
...,...,...,...
645,GCF_003437135.1_ASM343713v1_genomic.fna,gtdb,"rgba(31.00,119.00,180.00,1.0)"
646,GCF_003466075.1_ASM346607v1_genomic.fna,gtdb,"rgba(31.00,119.00,180.00,1.0)"
647,GCF_004332645.1_ASM433264v1_genomic.fna,gtdb,"rgba(31.00,119.00,180.00,1.0)"
648,SPMP29_bin.81.fa,SPMP,"rgba(255.00,127.00,14.00,1.0)"


In [6]:
# Write a colour-strip annotation file (DATASET_COLORSTRIP layout, which looks
# like the iTOL dataset format -- confirm against the tool consuming it).
#
# Header keywords are written at column 0 with the dataset type on the very
# first line. The earlier version emitted a leading blank line and eight
# spaces before every keyword; with SEPARATOR SPACE those spaces risk being
# read as empty leading fields.
header_lines = [
    "DATASET_COLORSTRIP",
    "",
    "SEPARATOR SPACE",
    "DATASET_LABEL sample_labels",
    "COLOR #ff0000",
    "",
    "COLOR_BRANCHES 0",
    "",
    "DATA",
    "",
]

with open('/home/ubuntu/wdir/PopGenStats_Project/data/B_longum/tree/iqtree.parsnp.renamed.treefile.annot.sample_labels.txt', 'w') as f:
    f.write("\n".join(header_lines) + "\n")

    # One data row per sample: "<sample> <colour> <group>", space-separated to
    # match the SEPARATOR declared above. Zipping the columns avoids building
    # a Series per row as iterrows() does.
    for genome_id, colour, group in zip(names['sample'], names['colour'], names['group']):
        f.write(f"{genome_id} {colour} {group}\n")