In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Imports

In [2]:
import glob
import json

import joblib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Plotting parameters

In [3]:
import matplotlib

# Global figure styling, applied in a single update so the whole style is
# visible as one mapping. Values are validated by matplotlib on assignment.
matplotlib.rcParams.update({
    ###Default for jupyter display
    'figure.dpi': 150,

    # Small fonts throughout (figure is sized for a manuscript panel).
    'xtick.labelsize': 7,
    'ytick.labelsize': 7,
    'axes.labelsize': 7,
    'axes.titlesize': 7,

    # Light grid.
    'axes.grid': True,
    'grid.color': '0.8',
    'grid.linewidth': '0.5',

    # Axis / tick colours (grayscale strings).
    'axes.edgecolor': '0.25',
    'xtick.color': '0',
    'ytick.color': '0',

    # Tick geometry and a full box of spines.
    'xtick.major.width': 1,
    'ytick.major.width': 1,
    'ytick.major.size': 5,
    'xtick.major.size': 5,
    'axes.spines.right': True,
    'axes.spines.left': True,
    'axes.spines.top': True,
    'axes.spines.bottom': True,

    # Fonts; LaTeX rendering ('text.usetex') is intentionally left disabled.
    'font.family': 'sans-serif',
    'font.sans-serif': 'Helvetica',
    'font.weight': 'normal',
    'axes.axisbelow': True,

    # Compact legend.
    'legend.fontsize': 7,
    'legend.handlelength': 1,
    'legend.handleheight': 1,
    'legend.handletextpad': 0.4,
})

# The active colour cycle and its colours as a plain list.
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

# Read in / process data

In [None]:
###Load classifier model
# NOTE(review): joblib.load unpickles the file, so only load model files from a
# trusted source. An alternative model (rf_best.joblib) sits in the same directory.
model_path = '../Data/classifier_data/rf_highMinAJH.joblib'
clf = joblib.load(model_path)

###Load datasets
# The first CSV column is used as the row index of each frame.
train_csv = '../Data/classifier_data/train_df.csv'
test_csv = '../Data/classifier_data/test_df.csv'

train_df = pd.read_csv(train_csv, index_col=0)
print(train_df.shape)
test_df = pd.read_csv(test_csv, index_col=0)
print(test_df.shape)

**Integrate my predictions into the training/testing dataframes**

In [None]:
# Classifier inputs are the columns from position 23 onwards (the earlier
# columns are presumably metadata -- TODO confirm against the CSV layout).
# Columns that this notebook appends to the frames are excluded by name, so
# re-running this cell never feeds earlier predictions back into the model.
appended_cols = ['my_predictions', 'Temperate_PHACTS']
train_features = train_df.columns[23:].drop(appended_cols, errors='ignore')
test_features = test_df.columns[23:].drop(appended_cols, errors='ignore')

train_df['my_predictions'] = clf.predict(train_df[train_features])
test_df['my_predictions'] = clf.predict(test_df[test_features])

In [None]:
# Translate the classifier's numeric classes into the 'yes'/'no' labels used by
# the other prediction columns, so they can be compared directly.
label_names = {1: 'yes', 0: 'no'}
for frame in (train_df, test_df):
    frame['my_predictions'] = frame['my_predictions'].replace(label_names)

**Integrate PHACTS predictions into dataframe**

In [None]:
def _phacts_call(identifier):
    """Return the PHACTS lifestyle call for one genome as 'yes', 'no' or '-'.

    Reads ``<identifier>_prodigal.out`` from the PHACTS results directory as a
    headerless, tab-separated table, skipping the first three lines.
    Column 0 of the first row is compared against 'Temperate'; column 1 of the
    first two rows is compared to detect ties (presumably the per-class
    confidence, sorted in descending order -- TODO confirm against the PHACTS
    output format).

    Returns
    -------
    str
        'yes' if the top row is labelled 'Temperate', 'no' if it carries any
        other label, and '-' when the top two rows have equal values in
        column 1 (no decisive call).

    Raises
    ------
    AssertionError
        If the identifier does not survive a round trip through the file name
        (e.g. it contains a '/'), or if the first row's value in column 1 is
        smaller than the second row's.
    """
    phacts_out = '../Data/phage_data_nmicro2017/PHACTS_results/{}_prodigal.out'.format(identifier)
    # Sanity check: the identifier must map cleanly onto a single file name.
    assert phacts_out.split('/')[-1].split('_prodigal.out')[0] == identifier
    phacts_df = pd.read_csv(phacts_out, sep='\t', skiprows=3, header=None)
    top_score = phacts_df.iloc[0, 1]
    runner_up_score = phacts_df.iloc[1, 1]
    assert top_score >= runner_up_score
    if top_score == runner_up_score:
        return '-'
    return 'yes' if phacts_df.iloc[0, 0] == 'Temperate' else 'no'


# Iterate the identifier column directly rather than looking rows up by index
# label: it avoids building a full row per genome and stays correct even if
# the index contains duplicate labels.
for df in [train_df, test_df]:
    df['Temperate_PHACTS'] = [_phacts_call(identifier) for identifier in df['Identifier_AJH']]

# Assess accuracies

In [None]:
###Training set
# Every method is scored against the empirical label column.
n_train = train_df.shape[0]
train_truth = train_df['Temperate (empirical)']

# Number of rows where each method disagrees with the empirical label.
my_train_count = int((train_truth != train_df['my_predictions']).sum())
other_train_count = int((train_truth != train_df['Temperate (bioinformatically predicted)']).sum())
phacts_train_count = int((train_truth != train_df['Temperate_PHACTS']).sum())

print('Out of a total {}'.format(n_train))
print('My method had {}'.format(my_train_count))
print('Other method had {}'.format(other_train_count))
print('PHACTS method had {}'.format(phacts_train_count))

# Error rates as a percentage of the training set (used by the final plot).
my_train_errors = my_train_count / n_train * 100
other_train_errors = other_train_count / n_train * 100
phacts_train_errors = phacts_train_count / n_train * 100

In [None]:
###Test set
# Every method is scored against the empirical label column.
n_test = test_df.shape[0]
test_truth = test_df['Temperate (empirical)']

# Number of rows where each method disagrees with the empirical label.
my_test_count = int((test_truth != test_df['my_predictions']).sum())
other_test_count = int((test_truth != test_df['Temperate (bioinformatically predicted)']).sum())
phacts_test_count = int((test_truth != test_df['Temperate_PHACTS']).sum())

print('Out of a total {}'.format(n_test))
print('My method had {}'.format(my_test_count))
print('Other method had {}'.format(other_test_count))
print('PHACTS method had {}'.format(phacts_test_count))

# Error rates as a percentage of the test set (used by the final plot).
my_test_errors = my_test_count / n_test * 100
other_test_errors = other_test_count / n_test * 100
phacts_test_errors = phacts_test_count / n_test * 100

In [None]:
# Scratch arithmetic on hard-coded literals: one minus 19/423.
# NOTE(review): presumably an accuracy (1 - errors/total) typed in from a
# printed result above -- confirm; the literals go stale if the data or the
# model change.
1-(19/423)
# 1-(1/634)

# Further split the test set into easy/difficult sets

**Using pre-calculated sequence clusters, this section splits the test set according to whether a related sequence (a member of the same cluster) was included in the training set. The goal is to assess accuracy on a test set that is as unpolluted by training-set relatives as possible.**

See `cluster_seqs.ipynb` for details

In [None]:
# Load the pre-computed sequence clusters. JSON is UTF-8 by specification, so
# the encoding is stated explicitly instead of relying on the platform default.
with open('../Data/fastANI_output/clusters.json', 'r', encoding='utf-8') as infile:
    clusters = json.load(infile)

In [None]:
# Identifiers of all training sequences, held in a set so each membership
# test below is a constant-time lookup instead of a scan of the whole list.
train_identifiers = set(train_df['Identifier_AJH'])

independent_set = []
related_set = []
for cluster in clusters:
    # A cluster counts as "related" as soon as any member was used for training;
    # otherwise all of its members are independent of the training set.
    if any(member in train_identifiers for member in cluster):
        related_set.extend(cluster)
    else:
        independent_set.extend(cluster)

###Get the challenging/easy sets
independent_df = test_df[test_df['Identifier_AJH'].isin(independent_set)]
print(independent_df.shape)
related_df = test_df[test_df['Identifier_AJH'].isin(related_set)]
print(related_df.shape)

In [None]:
def _percent_mismatch(frame, predicted_col):
    """Return the percentage of rows in ``frame`` whose ``predicted_col`` value
    differs from the 'Temperate (empirical)' column."""
    mismatched = frame[frame['Temperate (empirical)'] != frame[predicted_col]]
    return mismatched.shape[0] / frame.shape[0] * 100.


# Test sequences with no clustered relative in the training set.
my_independent_errors = _percent_mismatch(independent_df, 'my_predictions')
other_independent_errors = _percent_mismatch(independent_df, 'Temperate (bioinformatically predicted)')
phacts_independent_errors = _percent_mismatch(independent_df, 'Temperate_PHACTS')

# Test sequences that share a cluster with at least one training sequence.
my_related_errors = _percent_mismatch(related_df, 'my_predictions')
other_related_errors = _percent_mismatch(related_df, 'Temperate (bioinformatically predicted)')
phacts_related_errors = _percent_mismatch(related_df, 'Temperate_PHACTS')

In [None]:
# Percentage of the independent test subset on which the PHACTS call matches
# the empirical label (100 minus the error percentage computed above).
100-phacts_independent_errors

# Make a final plot of the error rates

In [None]:
# Four groups of horizontal bars. Groups are listed in axis order (position 0
# is drawn at the bottom), so the training set ends up at the top of the figure.
N = 4
my_errs = [my_independent_errors, my_related_errors, my_test_errors, my_train_errors]
other_errs = [other_independent_errors, other_related_errors, other_test_errors, other_train_errors]
phacts_errs = [phacts_independent_errors, phacts_related_errors, phacts_test_errors, phacts_train_errors]
group_labels = [
    'Testing set\n(independent, n={})'.format(independent_df.shape[0]),
    'Testing set\n(related, n={})'.format(related_df.shape[0]),
    'Testing set\n(n={})'.format(test_df.shape[0]),
    'Training set\n(n={})'.format(train_df.shape[0]),
]

fig, ax = plt.subplots(figsize=(2.6, 2.4))
ind = np.arange(N)    # base y location of each group
width = 0.25          # thickness of each bar
p1 = ax.barh(ind, phacts_errs, width)
p2 = ax.barh(ind + width, other_errs, width)
p3 = ax.barh(ind + 2 * width, my_errs, width)

# One tick per group, centred on the middle bar of the three.
ax.set_yticks(ind + width)
ax.set_yticklabels(group_labels, ha='center')
# Shift the centred, two-line labels left so they clear the axis.
tick_labels = ax.yaxis.get_majorticklabels()
for position in range(N):
    tick_labels[position].set_x(-0.2)

# Legend in a single row stretched across the top of the axes.
ax.legend((p3[0], p2[0], p1[0]), ('BACPHLIP', 'Mavrich', 'PHACTS'),
          bbox_to_anchor=(0., 1.02, 1., .102), loc=3, mode='expand', ncol=3, borderaxespad=0.)

ax.set_xlabel('% Incorrect predictions')
fig.savefig('../Manuscript/figure.png', dpi=300, bbox_inches='tight')

# Look at the feature importances

In [None]:
# Distribution of the classifier's per-feature importances (20 bins).
fig, ax = plt.subplots(figsize=(4, 3))
ax.hist(clf.feature_importances_, bins=20)
# Label the figure so it can be read on its own; the trailing semicolon keeps
# the cell from echoing a return value under the plot.
ax.set_xlabel('Feature importance')
ax.set_ylabel('Number of features')
ax.set_title('Distribution of classifier feature importances');

In [None]:
# Sanity checks on the importances: their total, and how many are exactly zero.
importances = clf.feature_importances_
n_zero_importance = sum(1 for value in importances if value == 0)

print('This should be 1:', sum(importances))
print('How many features were useless:', n_zero_importance)

In [None]:
# Tab-separated table of the selected protein domains; the first column
# becomes the row index.
domain_tsv = '../Data/protein_domain_data/cddid_selected_2020_4_27.tsv'
domain_df = pd.read_table(domain_tsv, index_col=0)
print(domain_df.shape)
domain_df.head()

In [None]:
# Feature names in classifier input order: every column from position 23 on,
# minus the prediction columns this notebook appended. Selecting by name
# avoids depending on exactly two columns having been added at the end.
feature_names = train_df.columns[23:].drop(['my_predictions', 'Temperate_PHACTS'], errors='ignore')
# zip() silently truncates on a length mismatch, which would pair importances
# with the wrong names -- fail loudly instead.
assert len(feature_names) == len(clf.feature_importances_), \
    'feature columns ({}) do not line up with feature importances ({})'.format(
        len(feature_names), len(clf.feature_importances_))

# (importance, feature name) pairs, then the names with non-zero importance.
zippy = list(zip(clf.feature_importances_, feature_names))
print(zippy[:5])
non_zero_cols = [feature for importance, feature in zippy if importance != 0.0]
print(len(non_zero_cols))

In [None]:
# Split the domain table by whether the value in its '1' column is one of the
# features with non-zero importance.
has_importance = domain_df['1'].isin(non_zero_cols)
good_domain_df = domain_df[has_importance]
bad_domain_df = domain_df[~has_importance]
print(bad_domain_df.shape, good_domain_df.shape)

In [None]:
# For every domain-table column from position 4 on, print its total among the
# zero-importance ("bad") and the non-zero-importance ("good") domains.
for test_col in domain_df.columns[4:]:
    bad_total = bad_domain_df[test_col].sum()
    good_total = good_domain_df[test_col].sum()
    print(test_col, bad_total, good_total)

**Look at most important features**

In [None]:
# Feature names in classifier input order, selected by name rather than by
# assuming exactly two appended columns at the end of the frame.
feature_names = train_df.columns[23:].drop(['my_predictions', 'Temperate_PHACTS'], errors='ignore')
# zip() would silently truncate on a mismatch and mis-pair names with importances.
assert len(feature_names) == len(clf.feature_importances_), \
    'feature columns ({}) do not line up with feature importances ({})'.format(
        len(feature_names), len(clf.feature_importances_))

# (importance, feature name) pairs, most important first; ties keep column order.
zippy = sorted(zip(clf.feature_importances_, feature_names),
               key=lambda pair: pair[0], reverse=True)
best_fams = [feature for _, feature in zippy[:20]]

# Domain-table rows whose '1' column matches one of the 20 top features.
best_domain_df = domain_df[domain_df['1'].isin(best_fams)]
print(best_domain_df.shape)
best_domain_df


In [None]:
# Summed importance of the 20 highest-ranked features.
top20_importance = np.sum([importance for importance, _ in zippy[:20]])
print('Those best 20 domains account for {} of the overall feature importance'.format(top20_importance))

# Scratch

In [None]:
# Scratch: display the test frame, including the prediction columns added above.
test_df

In [None]:
# Scratch: per-class probabilities for the test set. Feature columns are
# selected by name (everything from position 23 on, minus the appended
# prediction columns) rather than by assuming two trailing extras.
test_features = test_df.columns[23:].drop(['my_predictions', 'Temperate_PHACTS'], errors='ignore')
clf.predict_proba(test_df[test_features])