In [None]:
import os
import shutil
import sys
# Make the project sources in `../src` importable.
# Only register the directory once, so that re-running this cell does not
# keep appending duplicate entries to `sys.path`.
src_dir = os.path.normpath('../src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as ss
import seaborn as sns

import config, falcon

In [None]:
# Plot styling.
# matplotlib 3.6 renamed its bundled seaborn styles from 'seaborn-*' to
# 'seaborn-v0_8-*' and later releases dropped the old names, so pick whichever
# spelling the installed matplotlib actually provides.
plt.style.use([
    f'seaborn-v0_8-{style}'
    if f'seaborn-v0_8-{style}' in plt.style.available
    else f'seaborn-{style}'
    for style in ('white', 'paper')])
plt.rc('font', family='serif')
sns.set_palette(['#6da7de', '#9e0059', '#dee000', '#d82222', '#5ea15d',
                 '#943fa6', '#63c5b5', '#ff38ba', '#eb861e', '#ee266d'])
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

In [None]:
# Directory that collects one distance matrix and one metadata file per
# scaling method.
out_dir = 'nn_dist_scaling'
os.makedirs(out_dir, exist_ok=True)

config.charges = (2,)
config.overwrite = False
config.export_representatives = False
config.pxd = 'PXD000561'
config.peak_dir = os.path.abspath('../data/external/PXD000561')
config.work_dir = os.path.abspath('./tmp')
config.filenames = [os.path.join(config.peak_dir, filename)
                    for filename in os.listdir(config.peak_dir)
                    if filename.endswith('.mzML')]
# Generate pairwise distance matrices with different scaling methods.
# Results that already exist in `out_dir` are reused instead of recomputed.
nn_dir = os.path.join(config.work_dir, 'nn')
for scaling in ('rank', 'root', 'log', None):
    dist_filename = os.path.join(out_dir, f'dist_2_{scaling}.npz')
    if not os.path.isfile(dist_filename):
        config.scaling = scaling
        falcon.main()
        # Move the results out of the working directory so the next scaling
        # method does not find (or overwrite) them.
        os.rename(os.path.join(nn_dir, 'dist_2.npz'), dist_filename)
        os.rename(os.path.join(nn_dir, 'metadata_2.parquet'),
                  os.path.join(out_dir, f'metadata_2_{scaling}.parquet'))
# Clean up the temporary working directory. After a run it still contains at
# least the `nn` subdirectory, and `os.rmdir` can only delete empty
# directories, so remove the whole tree instead.
if os.path.isdir(config.work_dir):
    shutil.rmtree(config.work_dir)

In [None]:
# Load the identifications and rewrite every 'L' in the sequences as 'I', so
# that the two letters compare as equal further on.
ids = (pd.read_parquet('kim2014_ids.parquet')
       .assign(sequence=lambda frame: frame['sequence'].str.replace('L', 'I')))

In [None]:
# Load the stored sparse distance matrix for each scaling method, keyed by the
# scaling method (insertion order: rank, root, log, None).
dists = dict(
    (method, ss.load_npz(f'./nn_dist_scaling/dist_2_{method}.npz'))
    for method in ('rank', 'root', 'log', None))

In [None]:
# One panel per scaling method, showing the distribution of the stored
# pairwise distances for spectrum pairs with the same versus a different
# sequence.
width = 7
height = width / 1.618
fig, axes = plt.subplots(2, 2, figsize=(width * 2, height * 2))

# Keep at most one sequence per identifier so the left merge below can never
# duplicate metadata rows.
spectrum_sequences = (ids[['identifier', 'sequence']]
                      .drop_duplicates('identifier'))

for ax, (scaling, pairwise_distances) in zip(axes.ravel(), dists.items()):
    # The row/column indices of the distance matrix are used as row positions
    # in the metadata, so the merge has to keep every metadata row in its
    # original order. An inner merge would drop rows without an identification
    # and shift all subsequent positions.
    metadata = pd.merge(
        pd.read_parquet(f'./nn_dist_scaling/metadata_2_{scaling}.parquet'),
        spectrum_sequences, how='left', on='identifier')
    sequences = metadata['sequence'].to_numpy()
    identified = metadata['sequence'].notna().to_numpy()

    # `ss.find` only returns the explicitly stored, non-zero distances.
    rows, columns, dist = ss.find(pairwise_distances)
    # Only pairs in which both spectra have a sequence can be labeled; a
    # missing sequence says nothing about whether the pair matches.
    labeled = identified[rows] & identified[columns]
    rows, columns, dist = rows[labeled], columns[labeled], dist[labeled]
    same_label = sequences[rows] == sequences[columns]

    sns.kdeplot(dist[same_label], fill=True, label='Same sequence', ax=ax)
    sns.kdeplot(dist[~same_label], fill=True, label='Different sequence',
                ax=ax)

    ax.set_xlabel('Cosine distance')
    ax.set_title(f'scaling = {scaling}')
    ax.legend()
    sns.despine(ax=ax)

fig.tight_layout()

fig.savefig('nn_dist_scaling.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)