In [None]:
import re

import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tqdm.notebook as tqdm
from matplotlib.backends.backend_pdf import PdfPages

In [None]:
# Global plot styling for all figures in this notebook.
# Matplotlib 3.6 renamed its bundled seaborn style sheets from 'seaborn-*' to
# 'seaborn-v0_8-*' and the old names were later removed, so pick whichever
# name the installed Matplotlib actually provides.
plt.style.use([
    style if style in plt.style.available
    else style.replace('seaborn-', 'seaborn-v0_8-', 1)
    for style in ('seaborn-white', 'seaborn-paper')
])
plt.rc('font', family='serif')
sns.set_palette(['#9e0059', '#6da7de', '#ee266d', '#dee000', '#eb861e'])
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

In [None]:
# Read all annotations.
# There is one xz-compressed, tab-separated annotation file per task ID; all
# of them are loaded and stacked into a single table.
task_ids = ['308b3393', '18cf4e52', 'c0249eb6', 'debd3bbb',
            '8cdb4d7d', 'a9e7e4b1', '334ed0d9', 'b55aef34']
# Path relative to the notebook (like the other data paths in this notebook)
# rather than an absolute path into one user's home directory, so the
# notebook also runs from other checkouts of the project.
filename = ('../data/processed/'
            'MOLECULAR-LIBRARYSEARCH-V2-{}-view_all_annotations_DB-main.tsv.xz')
# reset_index() (without drop=True) renumbers the stacked rows and keeps the
# previous per-file row labels as an extra column.
annotations = (pd.concat([pd.read_csv(filename.format(task_id), sep='\t')
                          for task_id in task_ids])
               .reset_index())
# Prefix the file path with 'f.'; presumably this matches the format of the
# ReDU `filename` column it is later merged on -- TODO confirm.
annotations['full_CCMS_path'] = 'f.' + annotations['full_CCMS_path']

In [None]:
# Combine annotations with ReDU metadata.
# The ReDU sample information is downloaded as a tab-separated table and
# joined to the annotations on the file path; only annotations whose file is
# present in ReDU are retained (inner join).
redu_sample_info = pd.read_csv('http://redu.ucsd.edu/dump', sep='\t')
annotations_redu = annotations.merge(
    redu_sample_info,
    how='inner',
    left_on='full_CCMS_path',
    right_on='filename',
)

In [None]:
# Keep only annotations from human samples that have location information.
# Placeholder values used when no location was recorded.
missing_location_labels = [
    'not specified', 'not applicable', 'not applicable|not applicable'
]
has_location = ~annotations_redu['LatitudeandLongitude'].isin(
    missing_location_labels
)
is_human = annotations_redu['NCBITaxonomy'] == '9606|Homo sapiens'
annotations_redu = annotations_redu[has_location & is_human]

In [None]:
# Filter by curated list of drug names.
# The column is selected explicitly instead of using `read_csv(squeeze=True)`,
# which was deprecated in pandas 1.4 and removed in pandas 2.0.
drug_names = pd.read_csv(
    '../data/external/broad_institute_drug_list.csv',
    usecols=['pert_iname']
)['pert_iname']
# Build one regular expression that matches any of the (lower-cased) drug
# names as a literal substring. Missing names are dropped because they cannot
# be escaped, and duplicates are dropped to keep the pattern small.
drug_names_re = '|'.join(
    re.escape(drug) for drug in drug_names.str.lower().dropna().unique()
)

# na=False: annotations without a compound name never match; without it the
# mask would contain NaN and the boolean indexing would raise.
is_drug = (annotations_redu['Compound_Name'].str.lower()
           .str.contains(drug_names_re, na=False))
annotations_redu_drug = annotations_redu[is_drug].reset_index(drop=True)


# Clean drug names.
# Mapping of old name -> new name read from the curated mapping file.
drug_names_mapping = pd.read_csv(
    "../data/external/drug_mapping.csv",
    index_col="name_old",
    usecols=["name_old", "name_new"],
)["name_new"].to_dict()
# NOTE(review): this replaces exact matches of the old names in every column,
# not only in 'Compound_Name' -- confirm that is intended.
annotations_redu_drug = annotations_redu_drug.replace(drug_names_mapping)

In [None]:
# Store the drug annotations as a tab-separated `.tsv.xz` file (the row index
# is written as the first column).
annotations_redu_drug.to_csv(
    path_or_buf="../data/interim/annotations_redu_drug.tsv.xz",
    sep="\t",
)

In [None]:
# Draw one world map per drug showing every distinct location at which it was
# annotated, and collect all maps as pages of a single PDF file.
min_locations = 5    # Skip drugs found at fewer distinct locations than this.

with PdfPages("pharmaceuticals_exposure.pdf") as pdf:
    for compound_name in tqdm.tqdm(
            sorted(annotations_redu_drug["Compound_Name"].dropna().unique())
        ):
        # Distinct "latitude|longitude" strings reported for this compound.
        lats, lons = [], []
        compound_lat_lon = annotations_redu_drug.loc[
            annotations_redu_drug["Compound_Name"] == compound_name,
            "LatitudeandLongitude"
        ].dropna().unique()
        for lat_lon in compound_lat_lon:
            if "|" not in lat_lon:
                continue
            try:
                lat, lon = (float(coord) for coord in lat_lon.split("|"))
            except ValueError:
                # Non-numeric latitude/longitude, or more than two fields.
                continue
            lats.append(lat)
            lons.append(lon)

        if len(lats) < min_locations:
            continue

        # Create the map axes directly with the desired projection. Calling
        # plt.subplots() and then plt.subplot(projection=...) would put a
        # second axes on top of an ordinary one, whose frame and ticks can
        # end up in the saved page.
        fig, ax = plt.subplots(
            figsize=(10, 10),
            subplot_kw={"projection": ccrs.PlateCarree()},
        )
        ax.coastlines()
        ax.scatter(lons, lats, marker=".", transform=ccrs.PlateCarree())
        ax.set_title(compound_name)
        ax.set_global()

        # Save and close this specific figure so figures do not accumulate
        # in memory while looping over all compounds.
        pdf.savefig(fig, bbox_inches="tight")
        plt.close(fig)