In [1]:
%matplotlib inline

In [2]:
import joblib
import bacphlip
import glob
import pandas as pd
import numpy as np

# Purpose

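`scikit-learn` has an (astonishing) lack of object persistence between versions: a model serialized under one version is not guaranteed to load, or to give identical results, under another. Thus, to keep `bacphlip` up-to-date this code will need to be re-run periodically. It will: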

1. import the classifier into the new version of `scikit-learn` (which I'll need to keep updated)
2. classify previously computed dataframes 
3. compare those results to previously computed results

In [3]:
# Load the existing random-forest classifier (presumably serialized under an
# earlier scikit-learn release -- confirm). joblib.load unpickles the file, so
# only point this at a trusted path.
classifier_source = '../Data/classifier_data/rf_version_updating.joblib'
clf = joblib.load(classifier_source)

In [6]:
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# that any later slice of this list selects the same files on every run.
prev_input = sorted(glob.glob('../Data/bacphlip_test/*.fasta.hmmsearch.tsv'))
# Derive the path of the stored result for each input by swapping the file suffix.
prev_output = [path.replace('.fasta.hmmsearch.tsv', '.fasta.bacphlip') for path in prev_input]

In [7]:
# Guard against a vacuous check: if the glob matched nothing, the loop below
# would not execute and the classifier would be re-saved without validation.
assert len(prev_input) > 0, 'No previous hmmsearch inputs found; nothing to validate against'

# Re-classify (at most) the first 100 previously processed inputs and compare
# the fresh probabilities with the results stored next to them.
for infile, outfile in list(zip(prev_input, prev_output))[:100]:
    single_input = pd.read_csv(infile, sep='\t', index_col=0)
    class_probs = clf.predict_proba(single_input)

    single_output = pd.read_csv(outfile, sep='\t')
    # Exactly two stored values must match the fresh prediction within
    # np.isclose's default tolerances (presumably one row holding two class
    # probabilities -- TODO confirm against the .bacphlip file layout).
    n_matching = np.isclose(class_probs, single_output.values).sum()
    assert n_matching == 2, (
        'Prediction mismatch for {}: {} matching values (expected 2)'.format(infile, n_matching))

# Only reached when every compared file matched: write the classifier back out
# from the scikit-learn version running in this kernel. Kept as the final
# expression so the cell still displays the list of written files.
joblib.dump(clf, '../Data/classifier_data/rf_highMinAJH_newsklearn.joblib')

['../Data/classifier_data/rf_highMinAJH_newsklearn.joblib']

**Finally, be sure to copy the newly saved classifier (`rf_highMinAJH_newsklearn.joblib`) over to the `bacphlip` directory**

In [3]:
import sklearn

In [4]:
# Display the scikit-learn version active in this kernel, so the notebook
# records which release the classifier was re-saved under.
sklearn.__version__

'0.23.1'

In [5]:
# Display the hyperparameters of the loaded classifier as reported by the
# installed scikit-learn, for the record.
clf.get_params()

{'bootstrap': False,
 'ccp_alpha': None,
 'class_weight': 'balanced_subsample',
 'criterion': 'gini',
 'max_depth': 40,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 80,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}