In [128]:
import pandas as pd
import numpy as np
import os
import warnings
import csv
import collections

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Path to the tab-separated tool-usage export, relative to the notebook's working directory.
tool_usage_file = "data/tool-popularity-19-09.tsv"
# header=None: the first line of the file is read as data, so the columns get the
# integer labels 0, 1, 2 (shown in the preview as tool id, date, usage count).
df_usage = pd.read_csv(tool_usage_file, sep="\t", header=None)

In [129]:
# Preview the loaded frame; the rendered output elides the middle rows.
df_usage

Unnamed: 0,0,1,2
0,upload1,2019-09-01,14291
1,toolshed.g2.bx.psu.edu/repos/devteam/bwa/bwa_m...,2019-09-01,2107
2,toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fa...,2019-09-01,1654
3,toolshed.g2.bx.psu.edu/repos/peterjc/blast_rbh...,2019-09-01,1429
4,CONVERTER_gz_to_uncompressed,2019-09-01,1054
...,...,...,...
23045,toolshed.g2.bx.psu.edu/repos/devteam/samtools_...,2017-09-01,1
23046,toolshed.g2.bx.psu.edu/repos/iuc/ngsutils_bam_...,2017-09-01,1
23047,toolshed.g2.bx.psu.edu/repos/devteam/cuffnorm/...,2017-09-01,1
23048,toolshed.g2.bx.psu.edu/repos/iuc/unicycler/uni...,2017-09-01,1


In [130]:
# Accumulators declared up front; only `tool_ids_usage` is filled in this cell.
tool_ids, tool_versions, tool_update_date, tool_usage = [], [], [], []
tool_ids_usage = {}
cutoff_date = '2017-12-01'

# Group the per-date usage counts by tool, keeping only rows strictly newer than
# the cutoff. The dates shown in the preview are "YYYY-MM-DD" strings, for which
# plain string comparison matches chronological order.
for row in df_usage.values.tolist():
    tool_key, usage_date, usage_count = row[0], row[1], row[2]
    if str(usage_date) <= cutoff_date:
        continue
    # One single-entry {date: count} dict per row, appended in file order.
    tool_ids_usage.setdefault(tool_key, []).append({usage_date: usage_count})
print(tool_ids_usage)

{'upload1': [{'2019-09-01': 14291}, {'2019-08-01': 69933}, {'2019-07-01': 31034}, {'2019-06-01': 45585}, {'2019-05-01': 35398}, {'2019-04-01': 29069}, {'2019-03-01': 19383}, {'2019-02-01': 17276}, {'2019-01-01': 15615}, {'2018-12-01': 11547}, {'2018-11-01': 10137}, {'2018-10-01': 16161}, {'2018-09-01': 8503}, {'2018-08-01': 19262}, {'2018-07-01': 6346}, {'2018-06-01': 4940}, {'2018-05-01': 7272}, {'2018-04-01': 5266}, {'2018-03-01': 3233}, {'2018-02-01': 4689}, {'2018-01-01': 2045}], 'toolshed.g2.bx.psu.edu/repos/devteam/bwa/bwa_mem/0.7.17.1': [{'2019-09-01': 2107}, {'2019-08-01': 1068}, {'2019-07-01': 894}, {'2019-06-01': 294}, {'2019-05-01': 2344}, {'2019-04-01': 453}, {'2019-03-01': 1176}, {'2019-02-01': 1871}, {'2019-01-01': 1193}, {'2018-12-01': 649}, {'2018-11-01': 1088}, {'2018-10-01': 1291}, {'2018-09-01': 111}, {'2018-08-01': 116}, {'2018-07-01': 315}, {'2018-06-01': 187}, {'2018-05-01': 410}, {'2018-04-01': 549}, {'2018-03-01': 129}, {'2018-02-01': 78}, {'2018-01-01': 50}], '

In [131]:
def learn_tool_popularity(x_reshaped, y_reshaped):
    """
    Fit a curve for the tool usage over time to predict future tool usage.

    A support-vector regressor is tuned with a cross-validated grid search over
    the kernel type and polynomial degree. The best model is then evaluated at
    the time step that follows the last row of ``x_reshaped``.

    Parameters
    ----------
    x_reshaped : array-like, indexed as ``x_reshaped[-1][0]``
        Time positions, one row per observation. The prediction point is the
        first element of the last row plus one.
    y_reshaped : numpy.ndarray
        Observed usage values; flattened with ``ravel()`` before fitting.

    Returns
    -------
    float
        The predicted usage, never lower than 0.0. If the fit cannot be
        performed, 0.0 is returned and a ``RuntimeWarning`` describes why.
    """
    epsilon = 0.0
    max_folds = 5
    s_typ = 'neg_mean_absolute_error'
    n_jobs = 4
    try:
        n_samples = len(x_reshaped)
        # Cross-validation needs at least two observations.
        if n_samples < 2:
            return epsilon
        pipe = Pipeline(steps=[('regressor', SVR(gamma='scale'))])
        param_grid = {
            'regressor__kernel': ['rbf', 'poly', 'linear'],
            'regressor__degree': [2, 3]
        }
        search = GridSearchCV(
            pipe,
            param_grid,
            # The `iid` argument is intentionally not passed: it was removed from
            # GridSearchCV in scikit-learn 0.24 and passing it raises TypeError.
            # Never ask for more folds than there are samples.
            cv=min(max_folds, n_samples),
            scoring=s_typ,
            n_jobs=n_jobs,
            # Scores of this metric are <= 0, so a numeric error score of 1 would
            # rank a *failed* fit above every successful one. NaN keeps failed
            # parameter combinations out of the ranking.
            error_score=np.nan,
            return_train_score=False,
        )
        search.fit(x_reshaped, y_reshaped.ravel())
        model = search.best_estimator_
        # set the next time point to get prediction for
        prediction_point = np.reshape([x_reshaped[-1][0] + 1], (1, 1))
        prediction = model.predict(prediction_point)
        # Usage cannot be negative: clamp at epsilon.
        return max(float(prediction[0]), epsilon)
    except Exception as exc:
        # Best effort: keep the fallback value, but do not hide the reason, so a
        # systematic failure is not mistaken for "predicted usage is zero".
        warnings.warn(
            "learn_tool_popularity failed, falling back to %s: %r" % (epsilon, exc),
            RuntimeWarning,
            stacklevel=2,
        )
        return epsilon
    

def get_pupularity_prediction(tools_usage):
    """
    Get the popularity prediction for each tool.

    The prediction is the median of all usage counts recorded for the tool,
    rounded to a whole number. The median does not depend on the order of the
    records, so no sorting or reversing is needed.

    Parameters
    ----------
    tools_usage : dict
        Maps a tool name to a list of dicts; every value of every dict in the
        list is taken as one usage count.

    Returns
    -------
    dict
        Maps each tool name to its rounded median usage (a numpy float).
    """
    usage_prediction = dict()
    for tool_name, usage in tools_usage.items():
        # Flatten the list of {key: count} dicts into a flat list of counts.
        counts = [count for record in usage for count in record.values()]
        usage_prediction[tool_name] = np.round(np.median(counts), 0)
    return usage_prediction

# One rounded-median usage value per tool, keyed by the same tool identifiers
# as `tool_ids_usage`.
usage_prediction = get_pupularity_prediction(tool_ids_usage)
print(usage_prediction)

{'upload1': 14291.0, 'toolshed.g2.bx.psu.edu/repos/devteam/bwa/bwa_mem/0.7.17.1': 549.0, 'toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.72+galaxy1': 3360.0, 'toolshed.g2.bx.psu.edu/repos/peterjc/blast_rbh/blast_reciprocal_best_hits/0.1.11': 11.0, 'CONVERTER_gz_to_uncompressed': 1155.0, 'toolshed.g2.bx.psu.edu/repos/iuc/minimap2/minimap2/2.17': 455.0, 'toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/2.3.4.3+galaxy0': 759.0, '__SET_METADATA__': 984.0, 'toolshed.g2.bx.psu.edu/repos/iuc/unicycler/unicycler/0.4.7.0': 288.0, 'toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_blastp_wrapper/0.3.3': 616.0, 'toolshed.g2.bx.psu.edu/repos/bgruening/autodock_vina/docking/0.2.1': 318.0, 'toolshed.g2.bx.psu.edu/repos/devteam/fasta_to_tabular/fasta2tab/1.1.0': 61.0, 'toolshed.g2.bx.psu.edu/repos/bgruening/trim_galore/trim_galore/0.4.3.1': 456.0, 'toolshed.g2.bx.psu.edu/repos/devteam/fasta_filter_by_length/fasta_filter_by_length/1.1': 67.0, 'toolshed.g2.bx.psu.edu/repos/devteam/fas

In [132]:
# Display the prediction dict as the cell's output value (one entry per line).
usage_prediction

{'upload1': 14291.0,
 'toolshed.g2.bx.psu.edu/repos/devteam/bwa/bwa_mem/0.7.17.1': 549.0,
 'toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.72+galaxy1': 3360.0,
 'toolshed.g2.bx.psu.edu/repos/peterjc/blast_rbh/blast_reciprocal_best_hits/0.1.11': 11.0,
 'CONVERTER_gz_to_uncompressed': 1155.0,
 'toolshed.g2.bx.psu.edu/repos/iuc/minimap2/minimap2/2.17': 455.0,
 'toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/2.3.4.3+galaxy0': 759.0,
 '__SET_METADATA__': 984.0,
 'toolshed.g2.bx.psu.edu/repos/iuc/unicycler/unicycler/0.4.7.0': 288.0,
 'toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_blastp_wrapper/0.3.3': 616.0,
 'toolshed.g2.bx.psu.edu/repos/bgruening/autodock_vina/docking/0.2.1': 318.0,
 'toolshed.g2.bx.psu.edu/repos/devteam/fasta_to_tabular/fasta2tab/1.1.0': 61.0,
 'toolshed.g2.bx.psu.edu/repos/bgruening/trim_galore/trim_galore/0.4.3.1': 456.0,
 'toolshed.g2.bx.psu.edu/repos/devteam/fasta_filter_by_length/fasta_filter_by_length/1.1': 67.0,
 'toolshed.g2.bx.psu.edu/rep

In [133]:
from datetime import datetime, timedelta
from dateutil.relativedelta import *

# For every tool, compute the date one calendar month after its most recent
# usage record, formatted as "YYYY-MM-DD".
tool_ids_future_dates = dict()
for tool, usage in tool_ids_usage.items():
    # Take the maximum date instead of trusting usage[0] to be the newest entry:
    # "YYYY-MM-DD" strings sort chronologically, so this is correct whatever the
    # row order of the input file.
    latest_date = max(date for record in usage for date in record)
    future_date = datetime.strptime(latest_date, '%Y-%m-%d') + relativedelta(months=+1)
    tool_ids_future_dates[tool] = str(future_date.date())
print(tool_ids_future_dates)

{'upload1': '2019-10-01', 'toolshed.g2.bx.psu.edu/repos/devteam/bwa/bwa_mem/0.7.17.1': '2019-10-01', 'toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.72+galaxy1': '2019-10-01', 'toolshed.g2.bx.psu.edu/repos/peterjc/blast_rbh/blast_reciprocal_best_hits/0.1.11': '2019-10-01', 'CONVERTER_gz_to_uncompressed': '2019-10-01', 'toolshed.g2.bx.psu.edu/repos/iuc/minimap2/minimap2/2.17': '2019-10-01', 'toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/2.3.4.3+galaxy0': '2019-10-01', '__SET_METADATA__': '2019-10-01', 'toolshed.g2.bx.psu.edu/repos/iuc/unicycler/unicycler/0.4.7.0': '2019-10-01', 'toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_blastp_wrapper/0.3.3': '2019-10-01', 'toolshed.g2.bx.psu.edu/repos/bgruening/autodock_vina/docking/0.2.1': '2019-10-01', 'toolshed.g2.bx.psu.edu/repos/devteam/fasta_to_tabular/fasta2tab/1.1.0': '2019-10-01', 'toolshed.g2.bx.psu.edu/repos/bgruening/trim_galore/trim_galore/0.4.3.1': '2019-10-01', 'toolshed.g2.bx.psu.edu/repos/devteam/fasta_filt

In [134]:
tools_usage_prediction = []

# Identifiers without a "/" (e.g. "upload1") are skipped. For the others, the
# last two path segments are taken as the tool id and the tool version.
# NOTE(review): different repositories can expose the same tool id and version
# (the printed output lists "get_pdb 0.1.0" under two repositories), so the
# resulting rows are not unique per (tool_id, tool_version) -- confirm that the
# consumer of this table can cope with that.
for tool in usage_prediction:
    parts = tool.split("/")
    if len(parts) <= 1:
        continue
    tool_id, tool_version = parts[-2], parts[-1]
    predicted_usage = usage_prediction[tool]
    print(tool, tool_id, tool_version, predicted_usage)
    tools_usage_prediction.append(
        [tool_id, tool_version, tool_ids_future_dates[tool], predicted_usage]
    )

toolshed.g2.bx.psu.edu/repos/devteam/bwa/bwa_mem/0.7.17.1 bwa_mem 0.7.17.1 549.0
toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.72+galaxy1 fastqc 0.72+galaxy1 3360.0
toolshed.g2.bx.psu.edu/repos/peterjc/blast_rbh/blast_reciprocal_best_hits/0.1.11 blast_reciprocal_best_hits 0.1.11 11.0
toolshed.g2.bx.psu.edu/repos/iuc/minimap2/minimap2/2.17 minimap2 2.17 455.0
toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/2.3.4.3+galaxy0 bowtie2 2.3.4.3+galaxy0 759.0
toolshed.g2.bx.psu.edu/repos/iuc/unicycler/unicycler/0.4.7.0 unicycler 0.4.7.0 288.0
toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_blastp_wrapper/0.3.3 ncbi_blastp_wrapper 0.3.3 616.0
toolshed.g2.bx.psu.edu/repos/bgruening/autodock_vina/docking/0.2.1 docking 0.2.1 318.0
toolshed.g2.bx.psu.edu/repos/devteam/fasta_to_tabular/fasta2tab/1.1.0 fasta2tab 1.1.0 61.0
toolshed.g2.bx.psu.edu/repos/bgruening/trim_galore/trim_galore/0.4.3.1 trim_galore 0.4.3.1 456.0
toolshed.g2.bx.psu.edu/repos/devteam/fasta_filter_by_length/fa

toolshed.g2.bx.psu.edu/repos/rnateam/mea/mea/0.6.4.1 mea 0.6.4.1 2.0
toolshed.g2.bx.psu.edu/repos/miller-lab/genome_diversity/gd_calc_freq/1.2.0 gd_calc_freq 1.2.0 2.0
toolshed.g2.bx.psu.edu/repos/devteam/vcftools_merge/vcftools_merge/0.1.1 vcftools_merge 0.1.1 4.0
toolshed.g2.bx.psu.edu/repos/bgruening/get_online_data/get_pdb/0.1.0 get_pdb 0.1.0 4.0
toolshed.g2.bx.psu.edu/repos/bgruening/get_pdb/get_pdb/0.1.0 get_pdb 0.1.0 4.0
toolshed.g2.bx.psu.edu/repos/miller-lab/genome_diversity/gd_multiple_to_gd_genotype/1.0.0 gd_multiple_to_gd_genotype 1.0.0 4.0
toolshed.g2.bx.psu.edu/repos/miller-lab/genome_diversity/gd_pathway_image/1.1.0 gd_pathway_image 1.1.0 2.0
toolshed.g2.bx.psu.edu/repos/miller-lab/genome_diversity/gd_phylogenetic_tree/1.1.0 gd_phylogenetic_tree 1.1.0 1.0
toolshed.g2.bx.psu.edu/repos/iuc/pathview/pathview/1.24.0+galaxy0 pathview 1.24.0+galaxy0 4.0
toolshed.g2.bx.psu.edu/repos/iuc/snpsift_genesets/snpSift_geneSets/4.3.0 snpSift_geneSets 4.3.0 1.0
toolshed.g2.bx.psu.edu/re

toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_makeblastdb/0.3.0 ncbi_makeblastdb 0.3.0 82.0
toolshed.g2.bx.psu.edu/repos/devteam/gi2taxonomy/Fetch Taxonomic Ranks/1.1.0 Fetch Taxonomic Ranks 1.1.0 6.0
toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_blastn_wrapper/0.3.1 ncbi_blastn_wrapper 0.3.1 90.0
toolshed.g2.bx.psu.edu/repos/iuc/ggplot2_pca/ggplot2_pca/2.2.1 ggplot2_pca 2.2.1 6.0
toolshed.g2.bx.psu.edu/repos/nilesh/rseqc/rseqc_geneBody_coverage/2.6.4.3 rseqc_geneBody_coverage 2.6.4.3 57.0
toolshed.g2.bx.psu.edu/repos/iuc/bedtools/bedtools_subtractbed/2.27.1 bedtools_subtractbed 2.27.1 14.0
toolshed.g2.bx.psu.edu/repos/iuc/gemini_annotate/gemini_annotate/0.20.1+galaxy1 gemini_annotate 0.20.1+galaxy1 32.0
toolshed.g2.bx.psu.edu/repos/iuc/snpeff/snpEff_download/4.3+T.galaxy2 snpEff_download 4.3+T.galaxy2 49.0
toolshed.g2.bx.psu.edu/repos/iuc/jcvi_gff_stats/jcvi_gff_stats/0.8.4 jcvi_gff_stats 0.8.4 7.0
toolshed.g2.bx.psu.edu/repos/galaxyp/cardinal_filtering/cardina

toolshed.g2.bx.psu.edu/repos/bgruening/sklearn_clf_metrics/sklearn_clf_metrics/1.0.0.4 sklearn_clf_metrics 1.0.0.4 32.0
toolshed.g2.bx.psu.edu/repos/devteam/count_covariates/gatk_count_covariates/0.0.5 gatk_count_covariates 0.0.5 2.0
toolshed.g2.bx.psu.edu/repos/imgteam/2d_simple_filter/ip_filter_standard/0.0.2 ip_filter_standard 0.0.2 2.0
toolshed.g2.bx.psu.edu/repos/iuc/gatk2/gatk2_variant_combine/2.8.0 gatk2_variant_combine 2.8.0 2.0
toolshed.g2.bx.psu.edu/repos/iuc/kraken2/kraken2/2.0.7_beta+galaxy0 kraken2 2.0.7_beta+galaxy0 10.0
toolshed.g2.bx.psu.edu/repos/iuc/meme_fimo/meme_fimo/4.12.0.0 meme_fimo 4.12.0.0 18.0
toolshed.g2.bx.psu.edu/repos/iuc/stacks_rxstacks/stacks_rxstacks/1.46.0 stacks_rxstacks 1.46.0 2.0
toolshed.g2.bx.psu.edu/repos/rnateam/mlocarna/mlocarna/1.8.12.0 mlocarna 1.8.12.0 2.0
toolshed.g2.bx.psu.edu/repos/bgruening/text_processing/tp_replace_in_column/1.1.0 tp_replace_in_column 1.1.0 9.0
toolshed.g2.bx.psu.edu/repos/bgruening/sklearn_numeric_clustering/sklearn_n

toolshed.g2.bx.psu.edu/repos/devteam/complement/gops_complement_1/1.0.0 gops_complement_1 1.0.0 3.0
toolshed.g2.bx.psu.edu/repos/iuc/bedtools/bedtools_coveragebed/2.27.0.2 bedtools_coveragebed 2.27.0.2 137.0
toolshed.g2.bx.psu.edu/repos/bgruening/sklearn_generalized_linear/sklearn_generalized_linear/1.0.0.4 sklearn_generalized_linear 1.0.0.4 54.0
toolshed.g2.bx.psu.edu/repos/bgruening/numeric_clustering/numeric_clustering/0.9 numeric_clustering 0.9 3.0
toolshed.g2.bx.psu.edu/repos/iuc/bcftools_cnv/bcftools_cnv/1.9+galaxy1 bcftools_cnv 1.9+galaxy1 50.0
toolshed.g2.bx.psu.edu/repos/iuc/bedtools/bedtools_map/2.27.1.2 bedtools_map 2.27.1.2 9.0
toolshed.g2.bx.psu.edu/repos/iuc/annotatemyids/annotatemyids/3.7.0 annotatemyids 3.7.0 98.0
toolshed.g2.bx.psu.edu/repos/iuc/bedtools/bedtools_makewindowsbed/2.27.1 bedtools_makewindowsbed 2.27.1 21.0
toolshed.g2.bx.psu.edu/repos/iuc/scanpy_plot/scanpy_plot/1.4+galaxy0 scanpy_plot 1.4+galaxy0 17.0
toolshed.g2.bx.psu.edu/repos/iuc/bedtools/bedtools_gr

toolshed.g2.bx.psu.edu/repos/iuc/jbrowse/jbrowse/1.16.4+galaxy2 jbrowse 1.16.4+galaxy2 7.0
toolshed.g2.bx.psu.edu/repos/bgruening/deeptools_compute_matrix/deeptools_compute_matrix/2.2.3.0 deeptools_compute_matrix 2.2.3.0 5.0
toolshed.g2.bx.psu.edu/repos/bgruening/sklearn_discriminant_classifier/sklearn_discriminant_classifier/1.0 sklearn_discriminant_classifier 1.0 7.0
toolshed.g2.bx.psu.edu/repos/iuc/bedtools/bedtools_coveragebed/2.27.0.1 bedtools_coveragebed 2.27.0.1 7.0
toolshed.g2.bx.psu.edu/repos/iuc/bedtools/bedtools_groupbybed/2.27.0.0 bedtools_groupbybed 2.27.0.0 1.0
toolshed.g2.bx.psu.edu/repos/bgruening/graphicsmagick_image_convert/graphicsmagick_image_convert/1.3.26 graphicsmagick_image_convert 1.3.26 32.0
toolshed.g2.bx.psu.edu/repos/bgruening/openbabel_compound_convert/openbabel_compound_convert/2.4.1.0 openbabel_compound_convert 2.4.1.0 2.0
toolshed.g2.bx.psu.edu/repos/peterjc/venn_list/venn_list/0.0.9 venn_list 0.0.9 6.0
toolshed.g2.bx.psu.edu/repos/bgruening/sklearn_nn_

toolshed.g2.bx.psu.edu/repos/galaxyp/msi_classification/mass_spectrometry_imaging_classification/1.10.0.0 mass_spectrometry_imaging_classification 1.10.0.0 4.0
toolshed.g2.bx.psu.edu/repos/galaxyp/openms_demeanderize/DeMeanderize/2.3.0 DeMeanderize 2.3.0 1.0
toolshed.g2.bx.psu.edu/repos/galaxyp/openms_inspectadapter/InspectAdapter/2.3.0 InspectAdapter 2.3.0 1.0
toolshed.g2.bx.psu.edu/repos/galaxyp/openms_masstraceextractor/MassTraceExtractor/2.2.0 MassTraceExtractor 2.2.0 1.0
toolshed.g2.bx.psu.edu/repos/galaxyp/openms_metaprosip/MetaProSIP/2.3.0 MetaProSIP 2.3.0 1.0
toolshed.g2.bx.psu.edu/repos/galaxyp/openms_openswathmzmlfilecacher/OpenSwathMzMLFileCacher/2.3.0 OpenSwathMzMLFileCacher 2.3.0 1.0
toolshed.g2.bx.psu.edu/repos/galaxyp/openms_precursormasscorrector/PrecursorMassCorrector/2.3.0 PrecursorMassCorrector 2.3.0 1.0
toolshed.g2.bx.psu.edu/repos/galaxyp/openms_ticcalculator/TICCalculator/2.3.0 TICCalculator 2.3.0 2.0
toolshed.g2.bx.psu.edu/repos/galaxyp/quantp/quantp/1.0.0 quantp

In [135]:
# Tabulate the collected rows and write them out as a comma-separated file.
# The column names are passed at construction time: assigning `.columns`
# afterwards raises a length-mismatch error when the row list is empty, whereas
# this form still produces a well-formed, header-only table.
tools_usage_prediction = pd.DataFrame(
    tools_usage_prediction,
    columns=["tool_id", "tool_v", "future_date", "usage"],
)
tools_usage_prediction.to_csv("data/tools_usage_prediction_median.csv", sep=",", index=False)