In [1]:
import os
import sys
import time
import pandas as pd
import numpy as np

In [2]:
# Load the Ensembl species listing (tab-separated text file) and keep the
# EnsemblFungi rows.
# NOTE(review): the header labels look shifted by one column relative to the
# values: the division value 'EnsemblFungi' is matched in the column labelled
# 'species', and the renames below treat 'taxonomy_id' as the assembly name,
# 'assembly' as the accession and 'other_alignments' as the core db.
# Presumably every data row ends with a trailing tab, so pandas uses the first
# field as the index -- confirm against the raw file before changing the mapping.
df_txt = pd.read_csv("species_EnsemblFungi.txt", sep='\t')
df_txt = df_txt.loc[
    df_txt['species'] == 'EnsemblFungi',
    ['#name', 'assembly', 'taxonomy_id', 'other_alignments'],
].reset_index(drop=True)
df_txt = df_txt.rename(columns={
    '#name' : 'Species_str',
    'taxonomy_id' : 'assembly',
    'assembly' : 'assembly_accession',
    'other_alignments' : 'Core_db',
})

print("len(df_txt) = {}".format(len(df_txt)))

# Load the species table (csv file); 'sort_order' remembers the original row
# order so it can be restored after the join.
df_csv = pd.read_csv("species.csv", sep=',')[
    ['Name', 'Classification', 'Taxon ID', 'Assembly', 'Accession']
].assign(sort_order=lambda frame: np.arange(len(frame), dtype='int32'))

print("len(df_csv) = {}".format(len(df_csv)))

# Inner-join both tables on the assembly accession (this brings in the
# Species_str / assembly / Core_db columns), restore the csv row order and
# drop the helper column.
df = (
    df_csv
    .join(df_txt.set_index('assembly_accession'), on='Accession', how='inner')
    .sort_values(by='sort_order')
    .drop(columns=['sort_order'])
    .reset_index(drop=True)
)

print("len(df) = {}".format(len(df)))


len(df_txt) = 1505
len(df_csv) = 1502
len(df) = 1501


In [4]:
#1. Saccharomyces cerevisiae R64 (reference)
# Keeps the rows of `df` whose Name contains "Saccharomyces cerevisiae" and
# whose Assembly contains "R64", then writes the same table to
# species_r64.csv and species_r64_gtf.csv.

save_suffix = '_r64'

# regex=False: plain substring match.
# na=False: a row with a missing Name/Assembly counts as "no match" instead of
# putting NaN into the boolean mask.
is_cerevisiae = df['Name'].str.contains("Saccharomyces cerevisiae", regex=False, na=False)
is_r64 = df['Assembly'].str.contains("R64", regex=False, na=False)

df_sub = df.loc[is_cerevisiae & is_r64].copy().reset_index(drop=True)

df_sub.to_csv("species" + save_suffix + ".csv", sep=',')
df_sub.to_csv("species" + save_suffix + "_gtf.csv", sep=',')

print("len(df_sub) = " + str(len(df_sub)))


len(df_sub) = 1


In [5]:
#2. Saccharomyces cerevisiae (all strains)
# Keeps every row of `df` whose Name contains "Saccharomyces cerevisiae" and
# writes the table to species_strains.csv.

save_suffix = '_strains'

# regex=False: plain substring match.
# na=False: rows with a missing Name are treated as non-matches; without it a
# NaN in the mask makes the .loc selection below raise.
is_cerevisiae = df['Name'].str.contains("Saccharomyces cerevisiae", regex=False, na=False)

df_sub = df.loc[is_cerevisiae].copy().reset_index(drop=True)

df_sub.to_csv("species" + save_suffix + ".csv", sep=',')

print("len(df_sub) = " + str(len(df_sub)))


len(df_sub) = 117


In [4]:
#3. Saccharomyces cerevisiae (s288c-like strains only)
# Keeps S. cerevisiae rows whose Assembly is either the R64-1-1 reference or
# one of the S288c-like YJM strain assemblies, then writes the same table to
# species_strains_sm.csv and species_strains_sm_gtf.csv.

save_suffix = '_strains_sm'

#Get list of S288c-like YJM strains (Duke university)
strain_df = pd.read_csv('yeast_yjm_strains.csv', sep=',')
strain_df = strain_df.query("S288c_like == 'S288c-like'").copy().reset_index(drop=True)
# Turn each strain id into the 'Sc_<id>_v1' form that is compared against the
# Assembly column below.
strain_df['strain_id'] = 'Sc_' + strain_df['strain_id'] + '_v1'

# regex=False: plain substring match.
# na=False: rows with a missing Name count as non-matches rather than NaN.
is_cerevisiae = df['Name'].str.contains("Saccharomyces cerevisiae", regex=False, na=False)
# isin accepts the Series directly; no intermediate Python list is needed.
is_s288c_like = df['Assembly'].isin(strain_df['strain_id'])
is_reference = df['Assembly'] == 'R64-1-1'

df_sub = df.loc[is_cerevisiae & (is_s288c_like | is_reference)].copy().reset_index(drop=True)

df_sub.to_csv("species" + save_suffix + ".csv", sep=',')
df_sub.to_csv("species" + save_suffix + "_gtf.csv", sep=',')

print("len(df_sub) = " + str(len(df_sub)))


len(df_sub) = 80


In [6]:
#4. Saccharomycetales (entire classification; no Saccharomyces)
# Keeps rows classified as "Saccharomycetales" whose Name does not contain
# "Saccharomyces" (case-sensitive), plus the R64-1-1 assembly, and writes the
# table to species_saccharomycetales.csv.

save_suffix = '_saccharomycetales'

is_saccharomycetales = df['Classification'] == "Saccharomycetales"
# regex=False: plain substring match.
# na=False: a missing Name yields False here, so such a row is kept by the
# negation below; without it, `~` on a mask containing NaN raises a TypeError.
is_saccharomyces = df['Name'].str.contains("Saccharomyces", regex=False, na=False)
is_reference = df['Assembly'] == 'R64-1-1'

df_sub = df.loc[is_saccharomycetales & (~is_saccharomyces | is_reference)].copy().reset_index(drop=True)

df_sub.to_csv("species" + save_suffix + ".csv", sep=',')

print("len(df_sub) = " + str(len(df_sub)))


len(df_sub) = 167


In [7]:
#5. All fungal species (no Saccharomyces)
# Keeps every row of `df` whose Name does not contain "Saccharomyces"
# (case-sensitive), plus the R64-1-1 assembly, and writes the table to
# species_fungi_all.csv.

# This cell used to write species_fungi.csv, but a later cell in this notebook
# also uses the '_fungi' suffix and overwrote that file on "Run All", so this
# table was lost. A distinct suffix keeps it; the files produced under the
# '_fungi' suffix after a full run are unchanged.
save_suffix = '_fungi_all'

# regex=False: plain substring match.
# na=False: a missing Name yields False here, so such a row is kept by the
# negation below; without it, `~` on a mask containing NaN raises a TypeError.
is_saccharomyces = df['Name'].str.contains("Saccharomyces", regex=False, na=False)
is_reference = df['Assembly'] == 'R64-1-1'

df_sub = df.loc[~is_saccharomyces | is_reference].copy().reset_index(drop=True)

df_sub.to_csv("species" + save_suffix + ".csv", sep=',')

print("len(df_sub) = " + str(len(df_sub)))


len(df_sub) = 1380


In [8]:
#6. Small panel of high-quality genomes, selected by assembly accession
#   (the panel includes the Saccharomyces cerevisiae R64-1-1 entry)
# Writes the selected rows to species_fungi.csv and species_fungi_gtf.csv.

save_suffix = '_fungi'

# Accessions to keep; the trailing comment gives species and assembly name.
acc_list = [
    'GCA_000146045.2', #Saccharomyces cerevisiae, R64-1-1
    'GCA_000002945.2', #Schizosaccharomyces pombe, ASM294v2
    'GCA_000182965.3', #Candida albicans, GCA000182965v3
    'GCA_000002545.2', #Candida glabrata, GCA000002545v2
    'GCA_002759435.2', #Candida auris, GCA002759435v2
    'GCA_000004155.2', #Schizosaccharomyces cryophilus, SCY4
    'GCA_000091025.3', #Ashbya gossypii, ASM9102v1
    'GCA_000002525.1', #Yarrowia lipolytica, ASM252v1
    # Further candidates, currently not part of the panel:
    # 'GCA_000149925.1', #Puccinia graminis, ASM14992v1
    # 'GCA_000219625.1', #Zymoseptoria tritici, MG2
    # 'GCA_000011425.1', #Aspergillus nidulans, ASM1142v1
    # 'GCA_000002495.2', #Magnaporthe oryzae, MG8
    # 'GCA_000182925.2', #Neurospora crassa, NC12
    # 'GCA_000222805.1', #Fusarium oxysporum, FO2
]

# Reference rows for four of the accessions above; they appear to be copied
# from the Ensembl species listing (whitespace normalised here):
# Candida auris candida_auris EnsemblFungi 498019 GCA002759435v2 GCA_002759435.2 2017-11-15 N N N Y N Y candida_auris_core_59_112_2 1
# Schizosaccharomyces cryophilus schizosaccharomyces_cryophilus EnsemblFungi 653667 SCY4 GCA_000004155.2 2013-08-Broad N N N Y Y Y schizosaccharomyces_cryophilus_core_59_112_2 1
# Ashbya gossypii ashbya_gossypii EnsemblFungi 284811 ASM9102v1 GCA_000091025.3 2010-10-AGD N Y N Y Y Y ashbya_gossypii_core_59_112_1 1
# Yarrowia lipolytica yarrowia_lipolytica EnsemblFungi 284591 ASM252v1 GCA_000002525.1 2012-05-Genolevures N N N Y N Y yarrowia_lipolytica_core_59_112_1 1

# Select the rows of `df` whose Accession is in the panel and renumber them.
in_panel = df['Accession'].isin(acc_list)
df_sub = df[in_panel].reset_index(drop=True)

# The same table is written twice: once plain, once under the "_gtf" name.
for file_tail in (".csv", "_gtf.csv"):
    df_sub.to_csv("species" + save_suffix + file_tail, sep=',')

print("len(df_sub) = {}".format(len(df_sub)))


len(df_sub) = 10


In [20]:
# All fungi except non-reference Saccharomyces cerevisiae strains
# Drops every row whose Name contains "Saccharomyces cerevisiae"
# (case-insensitive) unless its Assembly is 'R64-1-1'; all other rows are kept.
# Writes the same table to species_fungi_1385.csv and species_fungi_1385_gtf.csv.

save_suffix = '_fungi_1385'

# na=False: rows with a missing Name count as "not S. cerevisiae" and are kept.
is_cerevisiae = df['Name'].str.contains('Saccharomyces cerevisiae', case=False, na=False)
is_non_reference = df['Assembly'] != 'R64-1-1'

# .copy().reset_index(drop=True) matches the other export cells: df_sub is an
# independent frame and the index written to the CSV runs 0..n-1 instead of
# carrying over the row labels of `df`.
df_sub = df.loc[~(is_cerevisiae & is_non_reference)].copy().reset_index(drop=True)

df_sub.to_csv("species" + save_suffix + ".csv", sep=',')
df_sub.to_csv("species" + save_suffix + "_gtf.csv", sep=',')

print("len(df_sub) = " + str(len(df_sub)))


len(df_sub) = 1385
