In [1]:
%matplotlib inline

# Notes:

# Imports

In [2]:
from Bio import SearchIO
import pandas as pd
import numpy as np
import random
from scipy import stats

from sklearn.metrics import average_precision_score, confusion_matrix,\
                            precision_recall_curve, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier

from matplotlib import pyplot as plt

import joblib

# Constants

In [3]:
benchmark_file = '../../Data/model_data/phage_data_nmicro2017/processed_benchmark_set.csv'
hmm_results_dir = '../../Data/model_data/phage_data_nmicro2017/hmmsearch_out/'

# Initial data processing

In [4]:
###Read in the dataset and double check that analysis is limited to empirically defined data
df = pd.read_csv(benchmark_file, index_col=0)
print('Starting shape:', df.shape)
df = df[df['Temperate (empirical)'] != 'Unspecified']
print('New shape (should be identical):', df.shape)

###Add in my identifier
df['Identifier_AJH'] = ''  
df.at[df[df['Database source'] == 'NCBI RefSeq'].index, 'Identifier_AJH'] =\
                    df[df['Database source'] == 'NCBI RefSeq']['RefSeq accession number']
df.at[df[df['Database source'] == 'Actinobacteriophage_785'].index, 'Identifier_AJH'] =\
                    df[df['Database source'] == 'Actinobacteriophage_785']['Virus identifier used for the analysis'].str.split('_').str[0]
print('New shape (+1):', df.shape)

Starting shape: (1057, 22)
New shape (should be identical): (1057, 22)
New shape (+1): (1057, 23)


In [5]:
###Read through all of the hmmsearch files to accumulate a growing presence/absence dataframe
file_ending = '.hmmsearch.out'
growing_df = pd.DataFrame()
for index in df.index:
    if df.loc[index]['Database source'] == 'NCBI RefSeq':
        file_name = df.loc[index]['RefSeq accession number'] + file_ending
    elif df.loc[index]['Database source'] == 'Actinobacteriophage_785':
        file_name = df.loc[index]['Virus identifier used for the analysis'].split('_')[0].lower() + file_ending
#     print(file_name)
    try:
        with open(hmm_results_dir + file_name, 'r') as infile:
            results = list(SearchIO.parse(infile, 'hmmer3-text'))
            simple_res = []
            for i in results:
                if len(i.hits) > 0:
                    simple_res.append((i.id, 1))
                else:
                    simple_res.append((i.id, 0))
        single_df = pd.DataFrame(dict(simple_res), index=[index])
        growing_df = pd.concat([growing_df, single_df])
    except FileNotFoundError:
        pass
print(growing_df.shape)

###Add that to the main dataframe
full_df = df.join(growing_df)

(1057, 371)


In [6]:
###Split into training and testing sets
train_df, test_df = train_test_split(full_df, train_size=0.6,\
                                     random_state=42)
print('Shape of training and testing dataframes:', train_df.shape, test_df.shape)

###Set up the machine-learning training and test sets
ml_df_train = train_df[train_df.columns[23:]]
ml_df_test = test_df[test_df.columns[23:]]

###And labels
training_labels = pd.DataFrame(index=train_df.index)
training_labels['binary'] = 0
training_labels.at[train_df[train_df['Temperate (empirical)']=='yes'].index, 'binary'] = 1

testing_labels = pd.DataFrame(index=test_df.index)
testing_labels['binary'] = 0
testing_labels.at[test_df[test_df['Temperate (empirical)']=='yes'].index, 'binary'] = 1

print('Shape of training and testing labels:', training_labels.shape, testing_labels.shape)

Shape of training and testing dataframes: (634, 394) (423, 394)
Shape of training and testing labels: (634, 1) (423, 1)


# Drop features that are likely to be noise

My goal in semi-rationally selecting these protein domains was to *predict temperate phages*, ergo I don't even want a predictor of lytic phages in my dataset and given the choice of protein domains that I am including believe that these would likely be noise.

**Note that this step is reasonable if I *only* consider the training set in making this decision, as I am doing**

In [7]:
uninformative_cols = []
###Use training set to remove certain cases with too few hits (likely unreliable/not useful)
too_few_count = 2
transpose_df = ml_df_train.transpose()
uninformative_cols.extend(list(transpose_df[transpose_df.sum(axis=1)<=too_few_count].index))
print('Running count of uninformative columns:', len(uninformative_cols))
###Cases where lytic features are higher than or equal to temperate
lysog_df = ml_df_train[train_df['Temperate (empirical)']=='yes'].transpose()
lytic_df = ml_df_train[train_df['Temperate (empirical)']=='no'].transpose()
uninformative_cols.extend(list(transpose_df[lysog_df.sum(axis=1) <= (lytic_df.sum(axis=1))].index))
uninformative_cols = list(set(uninformative_cols))
print('Running count of uninformative columns:', len(uninformative_cols))

Running count of uninformative columns: 55
Running count of uninformative columns: 165


**Drop these columns from train, test sets**

In [8]:
ml_df_train = ml_df_train.drop(columns=uninformative_cols)
print('Training set shape', ml_df_train.shape)

ml_df_test = ml_df_test.drop(columns=uninformative_cols)
print('Testing set shape', ml_df_test.shape)

Training set shape (634, 206)
Testing set shape (423, 206)


**Write training and testing dataframes to a file**

In [9]:
train_df.drop(columns=uninformative_cols).to_csv('../../Data/train_df.csv')
test_df.drop(columns=uninformative_cols).to_csv('../../Data/test_df.csv')

# Machine learning of a model

1) Random forest 

2) Hyper-parameter optimization via training-validation set cross-validation
    1. normal k-fold cross-validation
        - where best is the highest mean amongst cross-val scores (standard approach)
    2. my more intense bootstrap sampling version
        - where best is the highest minimum amongst cross-val scores (my interpretation)

## Traditional cross validation

In [10]:
###Really simple random forest hyper-parameter sweep
scoring_fxn = 'f1'
n_fold_cv = 5
#
rf = RandomForestClassifier()
params_rf = {'bootstrap': [True, False],\
             'class_weight':['balanced', 'balanced_subsample'],\
             'min_samples_leaf': [1, 2],\
             'n_estimators': list(range(10, 105, 5)),\
             'max_depth': list(range(10, 42, 2))}

rf_gs = GridSearchCV(rf, params_rf, scoring=scoring_fxn, cv=n_fold_cv)

#Fit the model
rf_gs.fit(ml_df_train, training_labels['binary'])

#Select the best model (this selects the parameter set with the best mean score across cv splits)
rf_best = rf_gs.best_estimator_
print('Parameters of best model:', rf_gs.best_params_)

print('Cross validation scores using that model:',\
      cross_val_score(rf_gs.best_estimator_, ml_df_train, training_labels['binary'],\
                cv=n_fold_cv, scoring=scoring_fxn))

joblib.dump(rf_best, '../../Data/rf_best.joblib') 

Parameters of best model: {'bootstrap': False, 'class_weight': 'balanced_subsample', 'max_depth': 22, 'min_samples_leaf': 1, 'n_estimators': 20}
Cross validation scores using that model: [0.95714286 0.99310345 0.97222222 0.98611111 0.95833333]


['../../Data/rf_best.joblib']

In [11]:
print('Training set results:')
print(training_labels['binary'].value_counts()) ### 1 is for temperate, 0 for lytic
assert list(training_labels.index) == list(ml_df_train.index)
print()
print(confusion_matrix(training_labels['binary'], rf_best.predict(ml_df_train)))
print(accuracy_score(training_labels['binary'], rf_best.predict(ml_df_train)))

Training set results:
1    364
0    270
Name: binary, dtype: int64

[[270   0]
 [  1 363]]
0.998422712933754


In [12]:
print('Testing set results:')
print(testing_labels['binary'].value_counts()) ### 1 is for temperate, 0 for lytic
assert list(testing_labels.index) == list(ml_df_test.index)
print()
print(confusion_matrix(testing_labels['binary'], rf_best.predict(ml_df_test)))
print(accuracy_score(testing_labels['binary'], rf_best.predict(ml_df_test)))

Testing set results:
1    240
0    183
Name: binary, dtype: int64

[[181   2]
 [  6 234]]
0.9810874704491725


In [None]:
previous run/s said 1 error and 1,7 errors

## Improving cross-validation with pre-defined arrays of indices and repeated cross-validation

In [13]:
###Pre-defining n separate train/test splits
n_repetitions = 20
train_frac = 0.5/0.6 ###This essentially defines the training and validation set sizes

###This works by selecting iloc's
listy = list(range(0, ml_df_train.shape[0]))
train_lists = []
val_lists = []
for i in range(n_repetitions):
    random.Random(42+i).shuffle(listy)
    train_lists.append(listy[:int(len(listy)*train_frac)]) ###Select from beginning of list up to cut-point
    val_lists.append(listy[int(len(listy)*train_frac):]) ###Select cut-point onwards
###Zip these two together (and make sure I did it correctly)
cv_splits = list(zip(train_lists, val_lists))
print(len(cv_splits), len(cv_splits[0]), len(cv_splits[0][0]), len(cv_splits[0][1]))

20 2 528 106


In [14]:
###Perform grid search using all the same parameters as previosly defined
rf_AJH = RandomForestClassifier()
rf_gs_AJH = GridSearchCV(rf_AJH, params_rf, scoring=scoring_fxn, cv=cv_splits) ###Provide cv with list

#Fit the model
rf_gs_AJH.fit(ml_df_train.values, training_labels['binary'].values) ###Run on the values

###Find the model with the highest minimum accuracy across all n_repetitions of cross-validation
listy = list(zip(*[rf_gs_AJH.cv_results_['split{}_test_score'.format(i)] for i in range(n_repetitions)]))
print(len(listy), len(listy[0]))
listy = list(zip(*[rf_gs_AJH.cv_results_['params'], listy]))
print(len(listy), len(listy[0]))
listy = sorted(listy, key=lambda x: min(x[1]))
print(len(listy), len(listy[0]))
print(listy[0])
print(listy[-1])
best_params = listy[-1][0]
rf_min_AJH = RandomForestClassifier(**best_params)
rf_min_AJH.fit(ml_df_train, training_labels['binary'])

joblib.dump(rf_min_AJH, '../../Data/rf_highMinAJH.joblib'); 

2432 20
2432 2
2432 2
({'bootstrap': False, 'class_weight': 'balanced', 'max_depth': 14, 'min_samples_leaf': 2, 'n_estimators': 10}, (0.9256198347107438, 0.9606299212598426, 0.99009900990099, 0.9747899159663865, 0.9739130434782608, 0.975609756097561, 0.9843749999999999, 0.9821428571428572, 1.0, 0.9743589743589743, 0.9557522123893805, 0.9615384615384615, 0.9636363636363636, 0.9655172413793104, 0.9767441860465117, 0.970873786407767, 0.9908256880733944, 0.9836065573770492, 0.9752066115702479, 0.9649122807017544))
({'bootstrap': False, 'class_weight': 'balanced_subsample', 'max_depth': 40, 'min_samples_leaf': 1, 'n_estimators': 95}, (0.959349593495935, 0.9606299212598426, 0.99009900990099, 0.9747899159663865, 0.9739130434782608, 0.975609756097561, 0.9921259842519685, 0.9911504424778761, 1.0, 0.983050847457627, 0.9734513274336283, 0.9714285714285714, 0.972972972972973, 0.9743589743589743, 0.9846153846153847, 0.9807692307692307, 0.9908256880733944, 0.991869918699187, 0.9752066115702479, 0.96

In [17]:
print(training_labels['binary'].value_counts()) ### 1 is for temperate, 0 for lytic
assert list(training_labels.index) == list(ml_df_train.index)
print(confusion_matrix(training_labels['binary'].values, rf_min_AJH.predict(ml_df_train.values)))
print(accuracy_score(training_labels['binary'].values, rf_min_AJH.predict(ml_df_train.values)))

1    364
0    270
Name: binary, dtype: int64
[[270   0]
 [  1 363]]
0.998422712933754


In [18]:
print(testing_labels['binary'].value_counts()) ### 1 is for temperate, 0 for lytic
assert list(testing_labels.index) == list(ml_df_test.index)
print(confusion_matrix(testing_labels['binary'], rf_min_AJH.predict(ml_df_test)))
print(accuracy_score(testing_labels['binary'], rf_min_AJH.predict(ml_df_test)))

1    240
0    183
Name: binary, dtype: int64
[[182   1]
 [  7 233]]
0.9810874704491725


In [None]:
previous run said 1 wrong and 1/7 wrong

# Compare prediction errors on the training set to previous benchmark

In [None]:
rf_model = rf_best

predict_array = rf_model.predict(ml_df_train.values)
train_df['my_predictions'] = predict_array
train_df.at[train_df[train_df['my_predictions']==1].index, 'my_predictions_str']= 'yes'
train_df.at[train_df[train_df['my_predictions']==0].index, 'my_predictions_str']= 'no'

In [None]:
print('Previous wrong predictions:', train_df[train_df['Temperate (empirical)'] !=\
               train_df['Temperate (bioinformatically predicted)']].shape[0])


print(confusion_matrix(train_df['Temperate (empirical)'],\
                       train_df['Temperate (bioinformatically predicted)']))

print('Wrong predictions from my model:', train_df[train_df['Temperate (empirical)'] !=\
                                                   train_df['my_predictions_str']].shape[0])

print(confusion_matrix(train_df['Temperate (empirical)'],\
                 train_df['my_predictions_str']))

**Look at the error/s**

In [None]:
train_df[train_df['Temperate (empirical)'] !=train_df['my_predictions_str']]

**And now that we're fully finished model fitting, look at the test set predictions**

In [None]:
predict_array = rf_model.predict(ml_df_test.values)
test_df['my_predictions'] = predict_array
test_df.at[test_df[test_df['my_predictions']==1].index, 'my_predictions_str']= 'yes'
test_df.at[test_df[test_df['my_predictions']==0].index, 'my_predictions_str']= 'no'

print('Previous wrong predictions:', test_df[test_df['Temperate (empirical)'] !=\
               test_df['Temperate (bioinformatically predicted)']].shape[0])

print(confusion_matrix(test_df['Temperate (empirical)'],\
                 test_df['Temperate (bioinformatically predicted)']))

print('Wrong predictions from my model:', test_df[test_df['Temperate (empirical)'] !=\
                                                   test_df['my_predictions_str']].shape[0])

print(confusion_matrix(test_df['Temperate (empirical)'],\
                 test_df['my_predictions_str']))

**Look at the errors**

In [None]:
test_df[test_df['Temperate (empirical)'] !=test_df['my_predictions_str']]

# Investigate the source of errors

Are any lytic and temperate vectors actually identical?

In [None]:
all_lyt_vecs = []
train_lyt = ml_df_train.loc[training_labels[training_labels['binary']==0].index].values
test_lyt = ml_df_test.loc[testing_labels[testing_labels['binary']==0].index].values
lyt_vecs = np.concatenate((train_lyt, test_lyt))

train_temp = ml_df_train.loc[training_labels[training_labels['binary']==1].index].values
test_temp = ml_df_test.loc[testing_labels[testing_labels['binary']==1].index].values
temp_vecs = np.concatenate((train_temp, test_temp))
print('Number of total entries (lytic, temperate):', lyt_vecs.shape[0], temp_vecs.shape[0])

temp_set = set([tuple(i) for i in temp_vecs])
lyt_set = set([tuple(i) for i in lyt_vecs])
print('Number of unique vectors (lytic, temperate):', len(lyt_set), len(temp_set))

print('Set intersection:', len(temp_set.intersection(lyt_set)))

In [None]:
growing_df['pfam00665'].value_counts()

In [None]:
growing_df

# Feature analysis

In [None]:
fig, ax = plt.subplots()
ax.hist(rf_best.feature_importances_, 20, cumulative=True)

In [None]:
sum(rf_best.feature_importances_)

In [None]:
len([i for i in rf_best.feature_importances_ if i == 0])

In [None]:
zippy = list(zip(rf_best.feature_importances_, ml_df_train.columns))

In [None]:
for i,j in zippy:
    if i == 0:
        uninformative_cols.append(j)
uninformative_cols = list(set(uninformative_cols))
print(len(uninformative_cols))

In [None]:
domain_df = pd.read_csv('../Data/cdd/cddid_extension_2020_4_4.tsv', sep='\t', index_col=0)
domain_df

In [None]:
len(domain_df['1']), len(set(domain_df['1']))

In [None]:
bad_domain_df = domain_df[domain_df['1'].isin(uninformative_cols)]
good_domain_df = domain_df[domain_df['1'].isin(uninformative_cols)==False]
print(bad_domain_df.shape, good_domain_df.shape)

In [None]:
bad_domain_df.columns

In [None]:
test_col = 'temperate'
bad_domain_df[test_col].sum(), good_domain_df[test_col].sum()

In [None]:
test_col = 'integrase'
bad_domain_df[bad_domain_df[test_col]==1]['3']
# good_domain_df[good_domain_df[test_col]==1]['3']

In [None]:
best_cols = []
running_sum = 0
for i,j in zippy:
    if i > 0.0085:
        best_cols.append(j)
        running_sum += i
print(len(best_cols))
print(running_sum)

best_domain_df = domain_df[domain_df['1'].isin(best_cols)]
best_domain_df.shape

In [None]:
best_domain_df[(best_domain_df['search_hits']==1)]

In [None]:
cols_to_use = list(best_domain_df['1'])

In [None]:
best_feature_train_df = ml_df_train[cols_to_use]
best_feature_test_df = ml_df_test[cols_to_use]
best_feature_challenge_df = ml_df_challenge[cols_to_use]
print(best_feature_train_df.shape, best_feature_test_df.shape, best_feature_challenge_df.shape)

In [None]:
###Really simple random forest hyper-parameter sweep
# scoring_fxn = 'accuracy'
scoring_fxn = 'f1'
# scoring_fxn = 'average_precision'

n_fold_cv = 5

rf_truncate = RandomForestClassifier()
params_rf = {'bootstrap': [True, False],\
             'class_weight':['balanced', 'balanced_subsample'],\
             'n_estimators': list(range(2, 22, 2)),\
             'max_depth': list(range(6, 22, 2))}
rf_gs_truncate = GridSearchCV(rf_truncate, params_rf, scoring=scoring_fxn, cv=n_fold_cv)

#Fit the model
rf_gs_truncate.fit(best_feature_train_df, training_labels['binary'])

#Select the best model (this selects the best mean cv score and I'm skeptical that's the right choice)
rf_best_truncate = rf_gs_truncate.best_estimator_
print(rf_gs_truncate.best_params_)

print(cross_val_score(rf_gs_truncate.best_estimator_, best_feature_train_df, training_labels['binary'],\
                cv=n_fold_cv, scoring=scoring_fxn))

In [None]:
print(training_labels['binary'].value_counts()) ### 1 is for temperate, 0 for lytic
assert list(training_labels.index) == list(best_feature_train_df.index)
print(confusion_matrix(training_labels['binary'], rf_best_truncate.predict(best_feature_train_df)))

In [None]:
(334+258)/(334+258+6+21)

In [None]:
print(testing_labels['binary'].value_counts()) ### 1 is for temperate, 0 for lytic
assert list(testing_labels.index) == list(best_feature_test_df.index)
print(confusion_matrix(testing_labels['binary'], rf_best_truncate.predict(best_feature_test_df)))

In [None]:
print(challenge_labels['binary'].value_counts()) ### 1 is for temperate, 0 for lytic
assert list(challenge_labels.index) == list(best_feature_challenge_df.index)
print(confusion_matrix(challenge_labels['binary'], rf_best_truncate.predict(best_feature_challenge_df)))

predicted_useless_features 

In [None]:
file_to_test = '../Data/cdd/2020_4_1_useless_cdd.tsv'
feature_df = pd.read_csv(file_to_test, sep='\t', header=None, index_col=0)
feature_df

In [None]:
counts = 0
listy = list(feature_df[1])
for i in uninformative_cols:
    if i in listy:
        counts += 1
print(counts)
# len(set(uninformative_cols) - set(feature_df[1]))

In [None]:
listy = list(feature_df[feature_df[3].str.contains('lysogen', case=False)][1])
counts = 0
for i in listy:
    if i in uninformative_cols:
        counts += 1
print(counts/len(listy))

In [None]:
too_few_df[too_few_df.columns[18:22]]

In [None]:
chi, p, dof, ex = stats.chi2_contingency([[1, 630-1], [5, 421-5]])
print(p)

In [None]:
from statsmodels.stats import contingency_tables
print(contingency_tables.StratifiedTable([[[177, 1], [4, 239]], [[275, 0], [1, 354]]]).test_equal_odds().pvalue)

In [33]:
import json
with open('../../Data/model_data/clusters.json', 'r') as infile:
    clusters = json.load(infile)

In [34]:
full_df['name'] = ''  
full_df.at[full_df[full_df['Database source'] == 'NCBI RefSeq'].index, 'name'] =\
                    full_df[full_df['Database source'] == 'NCBI RefSeq']['RefSeq accession number']
full_df.at[full_df[full_df['Database source'] == 'Actinobacteriophage_785'].index, 'name'] =\
                    full_df[full_df['Database source'] == 'Actinobacteriophage_785']['Virus identifier used for the analysis'].str.split('_').str[0]

(634, 394)


In [82]:
temp_df = full_df.loc[train_df.index]
print(temp_df.shape)
temp_df_names = list(temp_df['name'])

independent_set = []
for cluster in clusters:
    hits = []
    for member in cluster:
        if member in temp_df_names:
            hits.append(member)
    if len(hits) == 0:
        independent_set.extend(cluster)

(634, 394)


In [83]:
temp_df = full_df.loc[test_df.index]
temp_df = temp_df[temp_df['name'].isin(independent_set)]
print(temp_df.shape)
indices = temp_df.index

(172, 394)


In [84]:
print(confusion_matrix(testing_labels.loc[indices]['binary'], rf_min_AJH.predict(ml_df_test.loc[indices])))
print(accuracy_score(testing_labels.loc[indices]['binary'], rf_min_AJH.predict(ml_df_test.loc[indices])))

[[72  1]
 [ 6 93]]
0.9593023255813954


In [85]:
temp_df = full_df.loc[indices]
temp_df[temp_df['Temperate (empirical)'] != temp_df['Temperate (bioinformatically predicted)']]

Unnamed: 0,Virus identifier used for the analysis,Database source,RefSeq header source description,RefSeq accession number,Genome type,Order,Family,Host domain,Host phylum,Host class,...,pfam18763,pfam18802,pfam18803,pfam18804,pfam18866,smart00470,smart00597,smart00614,smart00674,name
890,kbnp1711__nc_023593,NCBI RefSeq,Escherichia phage KBNP1711,NC_023593,dsDNA,Caudovirales,Podoviridae,Bacteria,Proteobacteria,Gammaproteobacteria,...,0,0,0,0,0,0,0,0,0,NC_023593
615,flagstaff__actino785,Actinobacteriophage_785,not applicable,not applicable,dsDNA,Caudovirales,Siphoviridae,Bacteria,Actinobacteria,Actinobacteria,...,0,1,1,0,0,0,0,0,0,flagstaff
109,a-4l__nc_024358,NCBI RefSeq,Anabaena phage A-4L,NC_024358,dsDNA,Caudovirales,Podoviridae,Bacteria,Cyanobacteria,Unspecified,...,0,0,1,0,0,0,0,0,0,NC_024358
418,cp-t1__nc_019457,NCBI RefSeq,Vibrio phage CP-T1 (vB_VchM-CP-T1),NC_019457,dsDNA,Caudovirales,Myoviridae,Bacteria,Proteobacteria,Gammaproteobacteria,...,0,0,0,0,0,0,0,0,0,NC_019457
1999,sumu__nc_019455,NCBI RefSeq,Haemophilus phage SuMu,NC_019455,dsDNA,Caudovirales,Myoviridae,Bacteria,Proteobacteria,Gammaproteobacteria,...,0,0,0,0,0,0,0,0,0,NC_019455
90,7201__nc_002185,NCBI RefSeq,Streptococcus phage 7201,NC_002185,dsDNA,Caudovirales,Siphoviridae,Bacteria,Firmicutes,Bacilli,...,0,0,0,0,0,0,0,0,0,NC_002185
1463,phieco32__nc_010324,NCBI RefSeq,Enterobacteria phage Phieco32,NC_010324,dsDNA,Caudovirales,Podoviridae,Bacteria,Proteobacteria,Gammaproteobacteria,...,0,0,0,0,0,0,0,0,0,NC_010324
1144,nj01__nc_018835,NCBI RefSeq,Enterobacteria phage NJ01,NC_018835,dsDNA,Caudovirales,Podoviridae,Bacteria,Proteobacteria,Gammaproteobacteria,...,0,0,0,0,0,0,0,0,0,NC_018835
1872,sfi19__nc_000871,NCBI RefSeq,Streptococcus phage Sfi19,NC_000871,dsDNA,Caudovirales,Siphoviridae,Bacteria,Firmicutes,Bacilli,...,0,0,0,0,0,0,0,0,0,NC_000871
1495,phikz__nc_004629,NCBI RefSeq,Pseudomonas phage phiKZ,NC_004629,dsDNA,Caudovirales,Myoviridae,Bacteria,Proteobacteria,Gammaproteobacteria,...,0,0,0,0,0,0,0,0,0,NC_004629


In [86]:
1-12/len(indices)

0.9302325581395349