In [2]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import random

In [3]:
# Read the comma-separated metadata table (first row is the header), report
# its dimensions, and preview the first rows.
metadata_csv = "../../../metadata/sordariomycetes_metadata_v2_21032023.csv"
dmeta = pd.read_csv(metadata_csv, sep=",", header=0)
print(dmeta.shape)
dmeta.head()

(605, 83)


Unnamed: 0,species_name,set,order,order2,filename,assembly_filename,code,augustus_species,assemblyID,strain,...,SMC,hybrid,indole,nrps,other,pks,terpene,assembly_noreps_dec,frac_repeats_dec,effectors
0,Aciculosporium take,ncbi2,Hypocreales,Hypocreales,Aciculosporium_take,Aciculosporium_take.fna,ATAKE,fusarium,AciTa_1.0,MAFF-241224,...,15.0,0.0,0.0,2.0,5.0,2.5,3.0,25822098,0.482322,103.0
1,Acremonium chrysogenum,ncbi2,Hypocreales,Hypocreales,Acremonium_chrysogenum,Acremonium_chrysogenum.fna,ACHRY,fusarium,ASM76926v1,ATCC 11550,...,42.0,2.0,1.0,7.0,3.0,14.0,9.0,25078611,0.115887,182.0
2,Akanthomyces lecanii,ncbi2,Hypocreales,Hypocreales,Akanthomyces_lecanii,Akanthomyces_lecanii.fna,ALECA,fusarium,LEL 1.0,RCEF 1005,...,35.0,2.0,1.0,14.0,7.0,5.0,4.0,32137302,0.094336,386.0
3,Albophoma yamanashiensis,ncbi2,Hypocreales,Hypocreales,Albophoma_yamanashiensis,Albophoma_yamanashiensis.fna,AYAMA,fusarium,JCM_11844_assembly_v001,JCM 11844,...,44.0,4.0,1.0,17.0,5.0,11.0,2.0,28161037,0.102075,269.0
4,Ambrosiella cleistominuta,ncbi2,Microascales,Microascales,Ambrosiella_cleistominuta,Ambrosiella_cleistominuta.fna,ACLEI,fusarium,ASM1713954v1,CBS 141682,...,10.0,0.0,0.0,2.0,2.0,1.5,3.0,24377077,0.092006,


# Filtering

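Filtering out bad samples: keep rows with `buscoCS` >= 85 and `subsetOphio` == 1, exclude the sample coded `BGRAM`, then drop assemblies whose `assembly_length` is 70,000,000 or more.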

In [6]:
# Stage 1: the three row-level criteria are built as named masks and applied
# together.
passes_busco = dmeta['buscoCS'].ge(85.0)
not_bgram = dmeta['code'].ne('BGRAM')
in_subset = dmeta['subsetOphio'].eq(1)
fmeta = dmeta.loc[passes_busco & not_bgram & in_subset].reset_index(drop=True)
print(fmeta.shape)

# Stage 2: keep only rows whose `assembly_length` is strictly below 70,000,000.
below_length_cap = fmeta['assembly_length'].lt(70000000)
fmeta = fmeta.loc[below_length_cap].reset_index(drop=True)
print(fmeta.shape)

(563, 83)
(555, 83)


# Setting bins

In [16]:
# Bin edges applied to `assembly_length`, with one label per interval.
# pd.cut uses right-closed intervals by default, so a value exactly equal to an
# edge is assigned to the lower bin.
bins = [0, 10000000, 20000000, 30000000, 40000000, 50000000, 60000000,
        70000000, 80000000, 120000000]
labels = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-120']

# Adds a categorical `genome_bins` column to fmeta (later cells read it).
fmeta['genome_bins'] = pd.cut(fmeta['assembly_length'], bins=bins, labels=labels)

# Number of samples per bin. `genome_bins` is categorical, so observed=False is
# passed explicitly: empty bins are still listed with a count of 0, and the
# result does not depend on the pandas default for `observed` (relying on the
# implicit default emits a FutureWarning in recent pandas 2.x releases).
fmeta.groupby('genome_bins', observed=False)['code'].count()

genome_bins
0-10        0
10-20       0
20-30      69
30-40     240
40-50     166
50-60      56
60-70      24
70-80       0
80-120      0
Name: code, dtype: int64

# Setting outgroup aside (nonpatho = 7 & patho = 4)

In [17]:
# Rows flagged as outgroup in the filtered table.
is_outgroup = fmeta['outgroup'].eq('yes')

# Their codes are kept aside as a plain list; report how many there are.
foutgroup = fmeta.loc[is_outgroup, "code"].tolist()
print(len(foutgroup))

# Class balance (AnyPathogen) within the outgroup.
fmeta.loc[is_outgroup, "AnyPathogen"].value_counts()

11


AnyPathogen
0    7
1    4
Name: count, dtype: int64

In [18]:
# Working table without the outgroup: only rows whose `outgroup` value is 'no'.
not_outgroup = fmeta['outgroup'].eq('no')
ometa = fmeta.loc[not_outgroup].reset_index(drop=True)
print(ometa.shape)

(544, 84)


Counting number of pathogenic and non-pathogenic species

In [19]:
# Frequency of each AnyPathogen value among the non-outgroup samples.
any_pathogen = ometa['AnyPathogen']
print(any_pathogen.value_counts())

# Codes per class. The labels are compared as the strings "0" and "1"; rows
# with any other value are in neither list.
nonpathogens = ometa.loc[any_pathogen.eq("0"), "code"].tolist()
pathogens = ometa.loc[any_pathogen.eq("1"), "code"].tolist()

print(len(nonpathogens), len(pathogens))

AnyPathogen
1      348
0      192
NAN      4
Name: count, dtype: int64
192 348


# Subsampling

In each non-empty genome-size bin, keep every species from the less common class and randomly draw the same number of species from the more common class. The draw is repeated 10 times, giving 10 balanced subsets.

In [45]:
# A dedicated, seeded generator makes the draws reproducible: re-running this
# cell (or the whole notebook) yields exactly the same subsets.
SUBSAMPLE_SEED = 42
N_SUBSAMPLES = 10
rng = random.Random(SUBSAMPLE_SEED)

# N maps each bin label to a list of N_SUBSAMPLES lists of sample codes.
N = {}
gbins = ['20-30', '30-40', '40-50', '50-60', '60-70']
for gbin in gbins:
    print(gbin)

    # Codes of the pathogen and non-pathogen samples in this bin
    # (AnyPathogen is compared as the strings '1' and '0').
    df_ = ometa.loc[ometa['genome_bins'] == gbin, ['code', 'AnyPathogen']].reset_index(drop=True)
    codes_patho = df_.loc[df_['AnyPathogen'] == '1', 'code'].values.tolist()
    codes_nonpatho = df_.loc[df_['AnyPathogen'] == '0', 'code'].values.tolist()
    lower_class_count = min(len(codes_patho), len(codes_nonpatho))
    print(lower_class_count)

    # Repeated balanced draws. Each draw samples without replacement, taking
    # `lower_class_count` codes from each class, so the smaller class is always
    # included in full.
    B = []
    for boot in range(N_SUBSAMPLES):
        pathogens_subset = rng.sample(codes_patho, lower_class_count)
        nonpathogens_subset = rng.sample(codes_nonpatho, lower_class_count)
        # The outgroup codes are appended to every per-bin subset.
        B.append(pathogens_subset + nonpathogens_subset + foutgroup)
    N[gbin] = B

20-30
32
30-40
91
40-50
54
50-60
11
60-70
2


# Saving list with samples

In [47]:
# One list file and one table per subsample. The number of subsamples is taken
# from N so this loop stays in step with the subsampling cell.
n_subsets = min(len(N[gbin]) for gbin in gbins)
for boot in range(n_subsets):
    # Concatenate the per-bin subsets that share this subsample index.
    B_SET = []
    for gbin in gbins:
        B_SET += N[gbin][boot]

    # Every per-bin subset carries the outgroup codes, so after concatenation
    # they appear once per bin. Drop the repeats, keeping first-seen order, so
    # each code is written to the list file exactly once.
    B_SET = list(dict.fromkeys(B_SET))

    # List: one code per line. The context manager closes the file even if a
    # write fails.
    with open('getSubset_'+str(boot)+'.txt', 'w') as wh:
        for ele in B_SET:
            wh.write(ele+'\n')

    # Table: the full metadata rows for the selected codes.
    tab = dmeta[dmeta['code'].isin(B_SET)].reset_index(drop=True)
    tab.to_csv("getSubset_"+str(boot)+".csv",sep=",",header=True,index=False,quoting=csv.QUOTE_NONNUMERIC,quotechar='"')

In [43]:
# Number of sample codes in B_SET, i.e. the list assembled for the last subset
# processed by the saving loop.
# NOTE(review): this cell's execution count (43) is lower than those of the
# cells that build N and B_SET (45, 47), so the recorded output may be stale;
# re-run after the saving cell to confirm.
len(B_SET)

380

In [50]:
# Sanity check on the last saved table: among the non-outgroup rows, count the
# codes in each AnyPathogen class.
ingroup_rows = tab.loc[tab['outgroup'].eq('no')]
ingroup_rows.groupby('AnyPathogen')['code'].count()

AnyPathogen
0    190
1    190
Name: code, dtype: int64