# Index Differences Between Fields
This notebook looks at how papers have different pageranks between fields

In [None]:
import glob
import pickle as pkl

import numpy as np
import pandas as pd
import plotly.express as px

In [None]:
def calculate_percentiles(true_vals, doi_to_shuffled_metrics):
    """Locate each paper's true PageRank within its shuffled-network distribution.

    Parameters
    ----------
    true_vals : dict
        Maps DOI -> true PageRank. Entries whose value is None are skipped.
    doi_to_shuffled_metrics : dict
        Maps DOI -> sequence of PageRank values obtained from shuffled
        networks. The sequences do not need to be pre-sorted.

    Returns
    -------
    pandas.DataFrame
        One row per DOI with a non-None PageRank, with columns ``doi``,
        ``pagerank`` and ``percentile``. ``percentile`` is the fraction (0-1)
        of shuffled values strictly below the true value. It is missing when
        the DOI has no shuffled values or when they are all identical.
    """
    dois, pageranks = [], []
    for doi, pagerank in true_vals.items():
        if pagerank is not None:
            dois.append(doi)
            pageranks.append(pagerank)

    percentiles = []
    for doi, true_val in zip(dois, pageranks):
        shuffled_metrics = doi_to_shuffled_metrics.get(doi)
        # If the node is unshuffleable (or was never shuffled at all), its
        # percentile isn't meaningful
        if shuffled_metrics is None or len(set(shuffled_metrics)) <= 1:
            percentiles.append(None)
            continue

        # searchsorted only gives a valid rank on ascending input, so do not
        # rely on the caller having sorted the list beforehand.
        sorted_metrics = np.sort(shuffled_metrics)
        # Normalise by the actual number of shuffles instead of assuming 100.
        percentile = np.searchsorted(sorted_metrics, true_val) / len(sorted_metrics)
        percentiles.append(percentile)

    result_df = pd.DataFrame({'doi': dois, 'pagerank': pageranks, 'percentile': percentiles})
    return result_df

## Load AI results

In [None]:
# Paths of the pickled shuffle results for the AI field.
ai_shuffled = glob.glob('../output/shuffle_results/artificial_intelligence_*-pagerank.pkl')

# DOI -> list of values, one appended per result file that contains the DOI.
doi_to_shuffled_metrics = {}

for path in ai_shuffled:
    with open(path, 'rb') as in_file:
        result = pkl.load(in_file)
    # The file is already closed here; only the unpickled mapping is needed.
    for doi, value in result.items():
        doi_to_shuffled_metrics.setdefault(doi, []).append(value)
                

In [None]:
# Put each DOI's shuffled values in ascending order, which np.searchsorted expects.
for doi, vals in doi_to_shuffled_metrics.items():
    vals.sort()

In [None]:
# Unpickle the PageRank results for the AI field; they are passed to
# calculate_percentiles as true_vals (iterated there as DOI -> PageRank).
# NOTE: pickle can execute arbitrary code on load, so only open trusted files.
with open('../output/artificial_intelligence-pagerank.pkl', 'rb') as in_file:
    true_vals = pkl.load(in_file)

In [None]:
# One row per DOI with its PageRank and its percentile among the shuffled values.
ai_df = calculate_percentiles(true_vals, doi_to_shuffled_metrics)
ai_df

In [None]:
# Distribution of the AI percentiles (missing percentiles are not plotted).
ai_df['percentile'].hist()

In [None]:
# Share of all rows at each percentile value. value_counts() drops missing
# percentiles while len(ai_df) counts every row, so the shares sum to less
# than 1 whenever some percentiles are missing.
ai_df['percentile'].value_counts() / len(ai_df)

## Load comp bio results

In [None]:
# Paths of the pickled shuffle results for the computational biology field.
comp_bio_shuffled = glob.glob('../output/shuffle_results/computational_biology_*-pagerank.pkl')

# DOI -> list of values, one appended per result file that contains the DOI.
doi_to_shuffled_metrics = {}

for path in comp_bio_shuffled:
    with open(path, 'rb') as in_file:
        result = pkl.load(in_file)
    # The file is already closed here; only the unpickled mapping is needed.
    for doi, value in result.items():
        doi_to_shuffled_metrics.setdefault(doi, []).append(value)
                

In [None]:
# Put each DOI's shuffled values in ascending order, which np.searchsorted expects.
for doi, vals in doi_to_shuffled_metrics.items():
    vals.sort()

In [None]:
# Unpickle the PageRank results for the computational biology field and place
# each paper within its shuffled distribution.
# NOTE: pickle can execute arbitrary code on load, so only open trusted files.
with open('../output/computational_biology-pagerank.pkl', 'rb') as in_file:
    true_vals = pkl.load(in_file)
    
comp_bio_df = calculate_percentiles(true_vals, doi_to_shuffled_metrics)
comp_bio_df


In [None]:
# Distribution of the comp bio percentiles (missing percentiles are not plotted).
comp_bio_df['percentile'].hist()

## Load Genetics results

In [None]:
# Paths of the pickled shuffle results for the genetics field.
genetics_shuffled = glob.glob('../output/shuffle_results/genetics_*-pagerank.pkl')

# DOI -> list of values, one appended per result file that contains the DOI.
doi_to_shuffled_metrics = {}

for path in genetics_shuffled:
    with open(path, 'rb') as in_file:
        result = pkl.load(in_file)
    # The file is already closed here; only the unpickled mapping is needed.
    for doi, value in result.items():
        doi_to_shuffled_metrics.setdefault(doi, []).append(value)
                

In [None]:
# Put each DOI's shuffled values in ascending order, which np.searchsorted expects.
for doi, vals in doi_to_shuffled_metrics.items():
    vals.sort()

In [None]:
# Unpickle the PageRank results for the genetics field and place each paper
# within its shuffled distribution.
# NOTE: pickle can execute arbitrary code on load, so only open trusted files.
with open('../output/genetics-pagerank.pkl', 'rb') as in_file:
    true_vals = pkl.load(in_file)
    
genetics_df = calculate_percentiles(true_vals, doi_to_shuffled_metrics)
genetics_df


In [None]:
# Distribution of the genetics percentiles (missing percentiles are not plotted).
genetics_df['percentile'].hist()

## Merge ai and comp bio and plot results

In [None]:
# Inner-join on DOI so only papers present in both frames remain, then swap
# pandas' default _x/_y suffixes (left = comp bio, right = AI) for field names.
merged_df = (
    comp_bio_df
    .merge(ai_df, on='doi')
    .rename(columns={
        'pagerank_x': 'comp_bio_pagerank',
        'pagerank_y': 'ai_pagerank',
        'percentile_x': 'comp_bio_percentile',
        'percentile_y': 'ai_percentile',
    })
)
# Positive values mean the paper ranks higher in comp bio than in AI.
merged_df['cb-ai'] = merged_df['comp_bio_percentile'].sub(merged_df['ai_percentile'])

merged_df

In [None]:
# Log-log scatter of the two PageRanks, coloured by the comp bio percentile.
fig = px.scatter(
    merged_df,
    x='comp_bio_pagerank',
    y='ai_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.3,
    color='comp_bio_percentile',
    title='Comp bio percentiles',
)
fig.show()

In [None]:
# Log-log scatter of the two PageRanks, coloured by the AI percentile.
fig = px.scatter(
    merged_df,
    x='comp_bio_pagerank',
    y='ai_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.3,
    color='ai_percentile',
    title='ai percentiles',
)
fig.show()

In [None]:
# Log-log scatter coloured by the percentile difference on a diverging scale;
# the DOI is added to the hover text so individual papers can be looked up.
fig = px.scatter(
    merged_df,
    x='comp_bio_pagerank',
    y='ai_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.5,
    color='cb-ai',
    color_continuous_scale='RdBu',
    hover_data=['doi'],
    title='Relative importance of papers in comp bio and ai',
)
fig.show()

In [None]:
# Get papers that did surprisingly well in comp bio but surprisingly poorly in ai
# (the five largest comp bio minus AI percentile differences)
merged_df.sort_values(by='cb-ai', ascending=False).head(5)

In [None]:
# The five smallest comp bio minus AI differences: papers that did
# surprisingly well in ai but surprisingly poorly in comp bio
merged_df.sort_values(by='cb-ai', ascending=True).head(5)

Papers that did very well in CB and very poorly in ai include:
- "Symtosis: A liver ultrasound tissue characterization and risk stratification in optimized deep learning paradigm"
- "Automated detection of COVID-19 cases using deep neural networks with X-ray images"
- "COVID-19 detection using deep learning models to exploit Social Mimic Optimization and structured chest X-ray images using fuzzy color and stacking approaches"
- "Application of deep learning technique to manage COVID-19 in routine clinical practice using CT images: Results of 10 convolutional neural networks"
- "PCA-based polling strategy in machine learning framework for coronary artery disease risk assessment in intravascular ultrasound: A link between carotid and coronary grayscale plaque morphology"

Papers that did well in AI but poorly in CB:
- "Activities at the Universal Protein Resource (UniProt)"
- "Lipidomic data analysis: Tutorial, practical guidelines and applications"
- "A Novel Method for the Simultaneous Enrichment, Identification, and Quantification of Phosphopeptides and Sialylated Glycopeptides Applied to a Temporal Profile of Mouse Brain Development*"
- "Peptide Fingerprinting of Alzheimer's Disease in Cerebrospinal Fluid: Identification and Prospective Evaluation of New Synaptic Biomarkers"
- "CliqueMS: a computational tool for annotating in-source metabolite ions from LC-MS untargeted metabolomics data based on a coelution similarity network "


## Merge genetics and comp bio and plot results

In [None]:
# Inner-join on DOI so only papers present in both frames remain, then swap
# pandas' default _x/_y suffixes (left = comp bio, right = genetics) for field names.
merged_df = (
    comp_bio_df
    .merge(genetics_df, on='doi')
    .rename(columns={
        'pagerank_x': 'comp_bio_pagerank',
        'pagerank_y': 'genetics_pagerank',
        'percentile_x': 'comp_bio_percentile',
        'percentile_y': 'genetics_percentile',
    })
)
# Positive values mean the paper ranks higher in comp bio than in genetics.
merged_df['cb-genetics'] = merged_df['comp_bio_percentile'].sub(merged_df['genetics_percentile'])

merged_df

In [None]:
# Log-log scatter of the two PageRanks, coloured by the comp bio percentile.
fig = px.scatter(
    merged_df,
    x='comp_bio_pagerank',
    y='genetics_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.3,
    color='comp_bio_percentile',
    title='Comp bio percentiles',
)
fig.show()

In [None]:
# Log-log scatter of the two PageRanks, coloured by the genetics percentile.
fig = px.scatter(
    merged_df,
    x='comp_bio_pagerank',
    y='genetics_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.3,
    color='genetics_percentile',
    title='Genetics percentiles',
)
fig.show()

In [None]:
# Log-log scatter coloured by the percentile difference on a diverging scale;
# the DOI is added to the hover text so individual papers can be looked up.
fig = px.scatter(
    merged_df,
    x='comp_bio_pagerank',
    y='genetics_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.5,
    color='cb-genetics',
    color_continuous_scale='RdBu',
    hover_data=['doi'],
    title='Relative importance of papers in comp bio and genetics',
)
fig.show()

In [None]:
# Get papers that did surprisingly well in comp bio but surprisingly poorly in genetics
# (the five largest comp bio minus genetics percentile differences)
merged_df.sort_values(by='cb-genetics', ascending=False).head(5)

In [None]:
# The five smallest comp bio minus genetics differences: papers that did
# surprisingly well in genetics but surprisingly poorly in comp bio
merged_df.sort_values(by='cb-genetics', ascending=True).head(5)

Papers that did very well in CB and very poorly in genetics include:
- "Analysis of genomic diversity in Mexican Mestizo populations to develop genomic medicine in Mexico" 
- "Deciphering next-generation pharmacogenomics: an information technology perspective"
- "A Serum Proteomic Pattern for the Detection of Colorectal Adenocarcinoma Using Surface Enhanced Laser Desorption and Ionization Mass Spectrometry"
- "Mechanisms of population differentiation in seabirds"
- "Quantitative phenotyping via deep barcode sequencing"

Papers that did well in genetics but not in CB:
- "Deep learning extends de novo protein modelling coverage of genomes using iteratively predicted structural constraints"
- "O-GlcNAcPRED-II: an integrated classification algorithm for identifying O-GlcNAcylation sites based on fuzzy undersampling and a K-means PCA oversampling technique"
- "DRIMM-Synteny: decomposing genomes into evolutionary conserved segments"
- "PseKRAAC: a flexible web server for generating pseudo K-tuple reduced amino acids composition"
- "GPSuc: Global Prediction of Generic and Species-specific Succinylation Sites by aggregating multiple sequence features"

## Merge ai and genetics and plot results

In [None]:
# Inner-join on DOI so only papers present in both frames remain, then swap
# pandas' default _x/_y suffixes (left = genetics, right = AI) for field names.
merged_df = (
    genetics_df
    .merge(ai_df, on='doi')
    .rename(columns={
        'pagerank_x': 'genetics_pagerank',
        'pagerank_y': 'ai_pagerank',
        'percentile_x': 'genetics_percentile',
        'percentile_y': 'ai_percentile',
    })
)
# Positive values mean the paper ranks higher in genetics than in AI.
merged_df['genetics-ai'] = merged_df['genetics_percentile'].sub(merged_df['ai_percentile'])

merged_df

In [None]:
# Log-log scatter of the two PageRanks, coloured by the genetics percentile.
# The title previously read 'Comp bio percentiles', which did not match the
# colour column used in this figure.
fig = px.scatter(merged_df, x='genetics_pagerank', y='ai_pagerank', log_x=True, log_y=True,
                 opacity=.3, color='genetics_percentile', title='Genetics percentiles')
fig.show()

In [None]:
# Log-log scatter of the two PageRanks, coloured by the AI percentile.
fig = px.scatter(
    merged_df,
    x='genetics_pagerank',
    y='ai_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.3,
    color='ai_percentile',
    title='ai percentiles',
)
fig.show()

In [None]:
# Log-log scatter coloured by the percentile difference on a diverging scale;
# the DOI is added to the hover text so individual papers can be looked up.
fig = px.scatter(
    merged_df,
    x='genetics_pagerank',
    y='ai_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.5,
    color='genetics-ai',
    color_continuous_scale='RdBu',
    hover_data=['doi'],
    title='Relative importance of papers in genetics and ai',
)
fig.show()

In [None]:
# Get papers that did surprisingly well in genetics but surprisingly poorly in ai
# (the five largest genetics minus AI percentile differences)
merged_df.sort_values(by='genetics-ai', ascending=False).head(5)

In [None]:
# The five smallest genetics minus AI differences: papers that did
# surprisingly well in ai but surprisingly poorly in genetics
merged_df.sort_values(by='genetics-ai', ascending=True).head(5)

Papers that did very well in Genetics and very poorly in ai include:
- "Active learning for computational chemogenomics"
- "O-GlcNAcPRED-II: an integrated classification algorithm for identifying O-GlcNAcylation sites based on fuzzy undersampling and a K-means PCA oversampling technique"
- "Identifying RNA 5-methylcytosine sites via pseudo nucleotide compositions"
- "AOD: the antioxidant protein database"
- "University of Turku in the BioNLP'11 Shared Task"

Papers that did well in AI but poorly in genetics:
- "The genomics of selection in dogs and the parallel evolution between dogs and humans"
- "Population genomic evidence for adaptive differentiation in Baltic Sea three-spined sticklebacks"
- "A Novel Method for the Simultaneous Enrichment, Identification, and Quantification of Phosphopeptides and Sialylated Glycopeptides Applied to a Temporal Profile of Mouse Brain Development*"
- "Use of resistance surfaces for landscape genetic studies: considerations for parameterization and analysis"
- "Adaptations to Climate-Mediated Selective Pressures in Sheep "


## Merged results
### Comp bio vs Genetics

In [None]:
# Paths of the pickled shuffle results whose names start with
# 'genetics-computational_biology'.
genetics_shuffled = glob.glob('../output/shuffle_results/genetics-computational_biology*-pagerank.pkl')

# DOI -> list of values, one appended per result file that contains the DOI.
doi_to_shuffled_metrics = {}

for path in genetics_shuffled:
    with open(path, 'rb') as in_file:
        result = pkl.load(in_file)
    # The file is already closed here; only the unpickled mapping is needed.
    for doi, value in result.items():
        doi_to_shuffled_metrics.setdefault(doi, []).append(value)
                

In [None]:
# Put each DOI's shuffled values in ascending order, which np.searchsorted expects.
for doi, vals in doi_to_shuffled_metrics.items():
    vals.sort()

In [None]:
# Unpickle the 'genetics-computational_biology' PageRank results; they are
# passed to calculate_percentiles as true_vals.
# NOTE: pickle can execute arbitrary code on load, so only open trusted files.
with open('../output/genetics-computational_biology-pagerank.pkl', 'rb') as in_file:
    true_vals = pkl.load(in_file)

In [None]:
# NOTE: this rebinds genetics_df, replacing the genetics-only frame built
# earlier in the notebook.
genetics_df = calculate_percentiles(true_vals, doi_to_shuffled_metrics)
genetics_df

In [None]:
# Distribution of the genetics percentiles (missing percentiles are not plotted).
genetics_df['percentile'].hist()

In [None]:
# Paths of the pickled shuffle results whose names start with
# 'computational_biology-genetics'.
comp_bio_shuffled = glob.glob('../output/shuffle_results/computational_biology-genetics*-pagerank.pkl')

# DOI -> list of values, one appended per result file that contains the DOI.
doi_to_shuffled_metrics = {}

for path in comp_bio_shuffled:
    with open(path, 'rb') as in_file:
        result = pkl.load(in_file)
    # The file is already closed here; only the unpickled mapping is needed.
    for doi, value in result.items():
        doi_to_shuffled_metrics.setdefault(doi, []).append(value)
                

In [None]:
# Put each DOI's shuffled values in ascending order, which np.searchsorted expects.
for doi, vals in doi_to_shuffled_metrics.items():
    vals.sort()

In [None]:
# Unpickle the 'computational_biology-genetics' PageRank results; they are
# passed to calculate_percentiles as true_vals.
# NOTE: pickle can execute arbitrary code on load, so only open trusted files.
with open('../output/computational_biology-genetics-pagerank.pkl', 'rb') as in_file:
    true_vals = pkl.load(in_file)

In [None]:
# NOTE: this rebinds comp_bio_df, replacing the comp-bio-only frame built
# earlier in the notebook.
comp_bio_df = calculate_percentiles(true_vals, doi_to_shuffled_metrics)
comp_bio_df

In [None]:
# Inner-join on DOI so only papers present in both frames remain, then swap
# pandas' default _x/_y suffixes (left = genetics, right = comp bio) for field names.
merged_df = (
    genetics_df
    .merge(comp_bio_df, on='doi')
    .rename(columns={
        'pagerank_x': 'genetics_pagerank',
        'pagerank_y': 'comp_bio_pagerank',
        'percentile_x': 'genetics_percentile',
        'percentile_y': 'comp_bio_percentile',
    })
)
# Positive values mean the paper ranks higher in comp bio than in genetics.
merged_df['cb-genetics'] = merged_df['comp_bio_percentile'].sub(merged_df['genetics_percentile'])

merged_df

In [None]:
# Log-log scatter of the two PageRanks, coloured by the comp bio percentile.
fig = px.scatter(
    merged_df,
    x='comp_bio_pagerank',
    y='genetics_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.3,
    color='comp_bio_percentile',
    title='Comp bio percentiles',
)
fig.show()

In [None]:
# Log-log scatter of the two PageRanks, coloured by the genetics percentile.
fig = px.scatter(
    merged_df,
    x='comp_bio_pagerank',
    y='genetics_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.3,
    color='genetics_percentile',
    title='genetics percentiles',
)
fig.show()

In [None]:
# Log-log scatter coloured by the percentile difference on a diverging scale;
# the DOI is added to the hover text so individual papers can be looked up.
fig = px.scatter(
    merged_df,
    x='comp_bio_pagerank',
    y='genetics_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.5,
    color='cb-genetics',
    color_continuous_scale='RdBu',
    hover_data=['doi'],
    title='Relative importance of papers in comp bio and genetics',
)
fig.show()

In [None]:
# Get papers that did surprisingly well in comp bio but surprisingly poorly in genetics
# (the five largest comp bio minus genetics percentile differences)
merged_df.sort_values(by='cb-genetics', ascending=False).head(5)

In [None]:
# The five smallest comp bio minus genetics differences: papers that did
# surprisingly well in genetics but surprisingly poorly in comp bio
merged_df.sort_values(by='cb-genetics', ascending=True).head(5)

### High CB-genetics:
- Nonlinear Data Alignment for UPLC−MS and HPLC−MS Based Metabolomics:  Quantitative Analysis of Endogenous and Exogenous Metabolites in Human Serum
- Plant metabolomics: from holistic hope, to hype, to hot topic
- Engineered gene circuits
- Genetics Meets Metabolomics: A Genome-Wide Association Study of Metabolite Profiles in Human Serum
- Metabonomics Techniques and Applications to Pharmaceutical Research & Development

### Low CB - genetics:
- PhylOTU: A High-Throughput Procedure Quantifies Microbial Community Diversity and Resolves Novel Taxa from Metagenomic Data
- Functional metagenomic profiling of nine biomes
- A human gut microbial gene catalogue established by metagenomic sequencing
- Phymm and PhymmBL: metagenomic phylogenetic classification with interpolated Markov models
- The Plant Ontology as a Tool for Comparative Plant Anatomy and Genomic Analyses 

### Comp bio vs AI

In [None]:
# Paths of the pickled shuffle results whose names start with
# 'artificial_intelligence-computational_biology'.
artificial_intelligence_shuffled = glob.glob('../output/shuffle_results/artificial_intelligence-computational_biology*-pagerank.pkl')

# DOI -> list of values, one appended per result file that contains the DOI.
doi_to_shuffled_metrics = {}

for path in artificial_intelligence_shuffled:
    with open(path, 'rb') as in_file:
        result = pkl.load(in_file)
    # The file is already closed here; only the unpickled mapping is needed.
    for doi, value in result.items():
        doi_to_shuffled_metrics.setdefault(doi, []).append(value)
                

In [None]:
# Put each DOI's shuffled values in ascending order, which np.searchsorted expects.
for doi, vals in doi_to_shuffled_metrics.items():
    vals.sort()

In [None]:
# Unpickle the 'artificial_intelligence-computational_biology' PageRank
# results; they are passed to calculate_percentiles as true_vals.
# NOTE: pickle can execute arbitrary code on load, so only open trusted files.
with open('../output/artificial_intelligence-computational_biology-pagerank.pkl', 'rb') as in_file:
    true_vals = pkl.load(in_file)

In [None]:
# One row per DOI with its PageRank and its percentile among the shuffled values.
artificial_intelligence_df = calculate_percentiles(true_vals, doi_to_shuffled_metrics)
artificial_intelligence_df

In [None]:
# Paths of the pickled shuffle results whose names start with
# 'computational_biology-artificial_intelligence'.
comp_bio_shuffled = glob.glob('../output/shuffle_results/computational_biology-artificial_intelligence*-pagerank.pkl')

# DOI -> list of values, one appended per result file that contains the DOI.
doi_to_shuffled_metrics = {}

for path in comp_bio_shuffled:
    with open(path, 'rb') as in_file:
        result = pkl.load(in_file)
    # The file is already closed here; only the unpickled mapping is needed.
    for doi, value in result.items():
        doi_to_shuffled_metrics.setdefault(doi, []).append(value)
                

In [None]:
# Put each DOI's shuffled values in ascending order, which np.searchsorted expects.
for doi, vals in doi_to_shuffled_metrics.items():
    vals.sort()

In [None]:
# Unpickle the 'computational_biology-artificial_intelligence' PageRank
# results; they are passed to calculate_percentiles as true_vals.
# NOTE: pickle can execute arbitrary code on load, so only open trusted files.
with open('../output/computational_biology-artificial_intelligence-pagerank.pkl', 'rb') as in_file:
    true_vals = pkl.load(in_file)

In [None]:
# NOTE: this rebinds comp_bio_df, replacing the frame built in the previous
# section of the notebook.
comp_bio_df = calculate_percentiles(true_vals, doi_to_shuffled_metrics)
comp_bio_df

In [None]:
# Inner-join on DOI so only papers present in both frames remain, then swap
# pandas' default _x/_y suffixes (left = AI, right = comp bio) for field names.
merged_df = (
    artificial_intelligence_df
    .merge(comp_bio_df, on='doi')
    .rename(columns={
        'pagerank_x': 'ai_pagerank',
        'pagerank_y': 'comp_bio_pagerank',
        'percentile_x': 'ai_percentile',
        'percentile_y': 'comp_bio_percentile',
    })
)
# Positive values mean the paper ranks higher in comp bio than in AI.
merged_df['cb-ai'] = merged_df['comp_bio_percentile'].sub(merged_df['ai_percentile'])

merged_df

In [None]:
# Log-log scatter of the two PageRanks, coloured by the comp bio percentile.
fig = px.scatter(
    merged_df,
    x='comp_bio_pagerank',
    y='ai_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.3,
    color='comp_bio_percentile',
    title='Comp bio percentiles',
)
fig.show()

In [None]:
# Log-log scatter of the two PageRanks, coloured by the AI percentile.
fig = px.scatter(
    merged_df,
    x='comp_bio_pagerank',
    y='ai_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.3,
    color='ai_percentile',
    title='ai percentiles',
)
fig.show()

In [None]:
# Log-log scatter coloured by the percentile difference on a diverging scale;
# the DOI is added to the hover text so individual papers can be looked up.
fig = px.scatter(
    merged_df,
    x='comp_bio_pagerank',
    y='ai_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.5,
    color='cb-ai',
    color_continuous_scale='RdBu',
    hover_data=['doi'],
    title='Relative importance of papers in comp bio and ai',
)
fig.show()

In [None]:
# Get papers that did surprisingly well in comp bio but surprisingly poorly in ai
# (the five largest comp bio minus AI percentile differences)
merged_df.sort_values(by='cb-ai', ascending=False).head(5)

In [None]:
# The five smallest comp bio minus AI differences: papers that did
# surprisingly well in ai but surprisingly poorly in comp bio
merged_df.sort_values(by='cb-ai', ascending=True).head(5)

### High CB-ai:
- Automated generation of heuristics for biological sequence comparison
- Improving accuracy of microarray classification by a simple multi-task feature selection filter
- Construction and analysis of arrayed cDNA libraries
- Better prediction of sub-cellular localization by combining evolutionary and structural information
- Brain-Wide Genome-Wide Association Study for Alzheimer's Disease via Joint Projection Learning and Sparse Regression Model

### Low CB - ai:
- A Simple Algorithm for Identifying Negated Findings and Diseases in Discharge Summaries
- Combining multi-species genomic data for microRNA identification using a Naïve Bayes classifier 
- Deep learning of the tissue-regulated splicing code 
- Activities at the Universal Protein Resource (UniProt) 
- Enhanced Regulatory Sequence Prediction Using Gapped k-mer Features

### AI vs genetics

In [None]:
# Paths of the pickled shuffle results whose names start with
# 'artificial_intelligence-genetics'.
artificial_intelligence_shuffled = glob.glob('../output/shuffle_results/artificial_intelligence-genetics*-pagerank.pkl')

# DOI -> list of values, one appended per result file that contains the DOI.
doi_to_shuffled_metrics = {}

for path in artificial_intelligence_shuffled:
    with open(path, 'rb') as in_file:
        result = pkl.load(in_file)
    # The file is already closed here; only the unpickled mapping is needed.
    for doi, value in result.items():
        doi_to_shuffled_metrics.setdefault(doi, []).append(value)
                

In [None]:
# Put each DOI's shuffled values in ascending order, which np.searchsorted expects.
for doi, vals in doi_to_shuffled_metrics.items():
    vals.sort()

In [None]:
# Unpickle the 'artificial_intelligence-genetics' PageRank results; they are
# passed to calculate_percentiles as true_vals.
# NOTE: pickle can execute arbitrary code on load, so only open trusted files.
with open('../output/artificial_intelligence-genetics-pagerank.pkl', 'rb') as in_file:
    true_vals = pkl.load(in_file)

In [None]:
# NOTE: this rebinds artificial_intelligence_df, replacing the frame built in
# the previous section of the notebook.
artificial_intelligence_df = calculate_percentiles(true_vals, doi_to_shuffled_metrics)
artificial_intelligence_df

In [None]:
# Paths of the pickled shuffle results whose names start with
# 'genetics-artificial_intelligence'.
genetics_shuffled = glob.glob('../output/shuffle_results/genetics-artificial_intelligence*-pagerank.pkl')

# DOI -> list of values, one appended per result file that contains the DOI.
doi_to_shuffled_metrics = {}

for path in genetics_shuffled:
    with open(path, 'rb') as in_file:
        result = pkl.load(in_file)
    # The file is already closed here; only the unpickled mapping is needed.
    for doi, value in result.items():
        doi_to_shuffled_metrics.setdefault(doi, []).append(value)
                

In [None]:
# Put each DOI's shuffled values in ascending order, which np.searchsorted expects.
for doi, vals in doi_to_shuffled_metrics.items():
    vals.sort()

In [None]:
# Unpickle the 'genetics-artificial_intelligence' PageRank results; they are
# passed to calculate_percentiles as true_vals.
# NOTE: pickle can execute arbitrary code on load, so only open trusted files.
with open('../output/genetics-artificial_intelligence-pagerank.pkl', 'rb') as in_file:
    true_vals = pkl.load(in_file)

In [None]:
# NOTE: this rebinds genetics_df, replacing the frame built in an earlier
# section of the notebook.
genetics_df = calculate_percentiles(true_vals, doi_to_shuffled_metrics)
genetics_df

In [None]:
# Inner-join on DOI so only papers present in both frames remain, then swap
# pandas' default _x/_y suffixes (left = AI, right = genetics) for field names.
merged_df = (
    artificial_intelligence_df
    .merge(genetics_df, on='doi')
    .rename(columns={
        'pagerank_x': 'ai_pagerank',
        'pagerank_y': 'genetics_pagerank',
        'percentile_x': 'ai_percentile',
        'percentile_y': 'genetics_percentile',
    })
)
# Positive values mean the paper ranks higher in genetics than in AI.
merged_df['genetics-ai'] = merged_df['genetics_percentile'].sub(merged_df['ai_percentile'])

merged_df

In [None]:
# Log-log scatter of the two PageRanks, coloured by the genetics percentile.
fig = px.scatter(
    merged_df,
    x='genetics_pagerank',
    y='ai_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.3,
    color='genetics_percentile',
    title='Genetics percentiles',
)
fig.show()

In [None]:
# Log-log scatter of the two PageRanks, coloured by the AI percentile.
fig = px.scatter(
    merged_df,
    x='genetics_pagerank',
    y='ai_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.3,
    color='ai_percentile',
    title='AI percentiles',
)
fig.show()

In [None]:
# Log-log scatter coloured by the percentile difference on a diverging scale;
# the DOI is added to the hover text so individual papers can be looked up.
fig = px.scatter(
    merged_df,
    x='genetics_pagerank',
    y='ai_pagerank',
    log_x=True,
    log_y=True,
    opacity=0.5,
    color='genetics-ai',
    color_continuous_scale='RdBu',
    hover_data=['doi'],
    title='Relative importance of papers in genetics and ai',
)
fig.show()

In [None]:
# Get papers that did surprisingly well in genetics but surprisingly poorly in ai
# (the five largest genetics minus AI percentile differences)
merged_df.sort_values(by='genetics-ai', ascending=False).head(5)

In [None]:
# The five smallest genetics minus AI differences: papers that did
# surprisingly well in ai but surprisingly poorly in genetics
merged_df.sort_values(by='genetics-ai', ascending=True).head(5)

### High genetics-ai:
- RAxML-III: a fast program for maximum likelihood-based inference of large phylogenetic trees 
- Construction and analysis of arrayed cDNA libraries
- Identification of novel genes associated with fracture healing in osteoporosis induced by Krm2 overexpression or Lrp5 deficiency
- Hierarchical Convolutional Neural Networks for Segmentation of Breast Tumors in MRI With Application to Radiogenomics
- Dragon TIS Spotter: an Arabidopsis-derived predictor of translation initiation sites in plants 

### Low genetics - ai:
- kmer-SVM: a web server for identifying predictive regulatory sequence features in genomic data sets 
- Hairpins in a Haystack: recognizing microRNA precursors in comparative genomics data 
- pDeep: Predicting MS/MS Spectra of Peptides with Deep Learning
- Hierarchical boosting: a machine-learning framework to detect and classify hard selective sweeps in human populations 
- Deep learning of the tissue-regulated splicing code 
