In [None]:
import datetime
import getpass
import os

import pandas as pd

from virus.XGBoost import XGBoost
from virus.cross_validation.undersampling import leave_one_out_train_test_generator_random

# --- Run configuration and training ------------------------------------------
# Resolve the account name used to build the project path. $USER is honoured
# first so existing setups resolve exactly as before; when it is missing or
# empty, fall back to getpass.getuser() instead of failing with a KeyError.
user = os.environ.get('USER') or getpass.getuser()
xgb = XGBoost(f'/home/{user}/workspace/wirusy')

# Both patterns are rendered from the same `now`, so the file stem and the
# bare timestamp always refer to the same instant.
output = 'xgboost-random-undersampling-no-mismatch-score-jaccard-%Y-%m-%d-T-%H-%M-%S'
timestamp_format = '%Y-%m-%d-T-%H-%M-%S'
now = datetime.datetime.now()
output_name = now.strftime(output)
timestamp = now.strftime(timestamp_format)

# Every artefact of this run is written below <project>/outputs/xgboost.
output_path = f'{xgb.PROJECT_DIR}/outputs/xgboost'
os.makedirs(output_path, exist_ok=True)

# Train with the random leave-one-out train/test generator, excluding the two
# 2-mismatch score features from the feature set.
train_results = xgb.train(
    xgb.df,
    xgb.virus_groups,
    leave_one_out_train_test_generator_random,
    exclude_features=['crisprdetect_2mismatch_score', 'piler_2mismatch_score'],
    n_estimators=20,
    use_label_encoder=False,
    n_jobs=8,
)


In [None]:
# Persist the training results (CSV + pickle), then read the CSV back for
# evaluation. Both artefacts share the run's output stem.
results_csv = f'{output_path}/{output_name}.csv'
results_pickle = f'{output_path}/{output_name}.pickle'

xgb.save_results_to_csv(
    train_results,
    xgb.virus_groups,
    xgb.df,
    results_csv,
    exclude_features=['crisprdetect_2mismatch_score', 'piler_2mismatch_score'],
)
xgb.save_data_to_pickle(train_results, results_pickle)

# Reload what was just written: labels/predictions for the curves, and the
# full table indexed by its first two columns.
y_data, y_predictions, pairs_proba = xgb.get_y_true_and_y_pred(results_csv)
predictions_df = pd.read_csv(results_csv, header=0, index_col=[0, 1])

# Second element of every training result entry (presumably the fitted
# model for that fold -- confirm against XGBoost.train).
classifiers = [fold_result[1] for fold_result in train_results]

In [None]:
import plotnine as p9

# Build the five evaluation figures. The feature-importance chart uses a wide
# canvas; the remaining four share one size.
figure_size = (12, 10)

plot1 = xgb.plot_feature_importances(
    classifiers,
    size=(22, 7),
    title='Feature Importance - Random Undersampling - No Mismatch score + Jaccard',
    exclude_features=['crisprdetect_2mismatch_score', 'piler_2mismatch_score'],
)
plot2 = xgb.plot_host_taxonomy_probability(predictions_df, size=figure_size)
plot3 = xgb.plot_precision_recall_curve(y_data, y_predictions, size=figure_size)
plot4 = xgb.plot_roc_curve(y_data, y_predictions, figure_size)
plot5 = xgb.plot_boxplot_proba(pairs_proba, figure_size)

# Bundle all figures, in order, into a single PDF next to the CSV/pickle output.
p9.save_as_pdf_pages(
    [plot1, plot2, plot3, plot4, plot5],
    f'{output_path}/{output_name}.pdf',
)