In [1]:
import pandas as pd
import json
from bokeh.palettes import Spectral11
from bokeh.io import output_notebook, output_file, save
from bokeh.plotting import figure, show

In [2]:
# Configure Bokeh so that subsequent show() calls render inside the notebook.
output_notebook()

In [3]:

def create_pandas_df(path):
    """Load a whitespace-delimited table whose header line starts with ``#``.

    The file is assumed to look like ``# col1 col2 ...`` followed by numeric
    rows. Because the ``#`` is parsed as a column label of its own, every
    label ends up one position to the left of its data and the last parsed
    column carries no real data. This function drops that last column and
    shifts the labels back into place.

    Parameters
    ----------
    path : str, path object or file-like object
        Forwarded to ``pandas.read_csv`` as ``filepath_or_buffer``.

    Returns
    -------
    pandas.DataFrame
        An independent copy (safe to add columns to) with the ``#`` label
        removed and the remaining labels aligned with their data.
    """
    raw = pd.read_csv(
        filepath_or_buffer=path,
        index_col=False,  # never promote the first data column to the index
        sep=r'\s+',  # raw string: split on any run of whitespace
    )

    # Drop the trailing column created by the extra '#' header token.
    # .copy() detaches the result from `raw` so that callers can assign new
    # columns without triggering SettingWithCopyWarning.
    df_mod = raw[raw.columns[:-1]].copy()

    # Shift the labels one step left, discarding the leading '#'.
    df_mod.columns = raw.columns[1:]

    return df_mod

def read_maximme_rdf(json_metadata_path, char_types,
                     value_vars=('RDF_Al-Ob', 'RDF_Al-Oh')):
    """Build one long-format dataframe from the data files listed in a metadata JSON.

    The metadata is walked as ``studies -> assays -> characteristicCategories``.
    For every category whose ``characteristicType.termSource`` is contained in
    ``char_types``, each entry of the assay's ``dataFiles`` is loaded with
    ``create_pandas_df`` and tagged with that category's ``annotationValue``
    in a ``dimer`` column. All loaded frames are concatenated and melted.

    Parameters
    ----------
    json_metadata_path : str or path-like
        Location of the metadata JSON file.
    char_types : container of str
        Accepted ``termSource`` values (tested with ``in``).
    value_vars : sequence of str, optional
        Data-file columns to unpivot. Defaults to the two RDF columns;
        pass e.g. ``['RCN_Al-Ob', 'RCN_Al-Oh']`` to melt other columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``r``, ``dimer``, ``Atom Pair`` (the source column name) and
        ``inter atom distance`` (the value taken from that column).

    Raises
    ------
    ValueError
        If no characteristic matches ``char_types`` (or matching assays list
        no data files), so there is nothing to concatenate.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(json_metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    frames = []
    for study in metadata['studies']:
        for assay in study['assays']:
            for category in assay['characteristicCategories']:
                char_type = category['characteristicType']
                if char_type['termSource'] not in char_types:
                    continue
                # Matching termSource: its annotationValue labels the data.
                dimer_type = char_type['annotationValue']
                for data_file in assay['dataFiles']:
                    # NOTE(review): data file names are resolved relative to
                    # the current working directory, not to the JSON file.
                    frame = create_pandas_df(data_file['name'])
                    # assign() returns a new frame, so this is safe whether or
                    # not the loader handed back a view of another frame.
                    frames.append(frame.assign(dimer=dimer_type))

    if not frames:
        raise ValueError(
            "No data files found for characteristic types {!r} in {!r}".format(
                char_types, json_metadata_path
            )
        )

    combined = pd.concat(frames, ignore_index=True)

    return combined.melt(
        id_vars=['r', 'dimer'],
        value_vars=list(value_vars),
        value_name="inter atom distance",
        var_name="Atom Pair",
    )

In [4]:
# Load every data file belonging to an assay that has a characteristic whose
# termSource is 'Aluminate Species', combined into one long-format frame.
df = read_maximme_rdf('data/nmr_metadata.json', char_types=['Aluminate Species'])

In [8]:
# Preview only the first rows instead of dumping the whole frame.
df.head(10)

Unnamed: 0,r,dimer,Atom Pair,inter atom distance
0,0.00,(OH)3Al-O-Al(OH)32-+ 180 H2O + 2 Na+,RDF_Al-Ob,0.0000
1,0.02,(OH)3Al-O-Al(OH)32-+ 180 H2O + 2 Na+,RDF_Al-Ob,0.0000
2,0.04,(OH)3Al-O-Al(OH)32-+ 180 H2O + 2 Na+,RDF_Al-Ob,0.0000
3,0.06,(OH)3Al-O-Al(OH)32-+ 180 H2O + 2 Na+,RDF_Al-Ob,0.0000
4,0.08,(OH)3Al-O-Al(OH)32-+ 180 H2O + 2 Na+,RDF_Al-Ob,0.0000
5,0.10,(OH)3Al-O-Al(OH)32-+ 180 H2O + 2 Na+,RDF_Al-Ob,0.0000
6,0.12,(OH)3Al-O-Al(OH)32-+ 180 H2O + 2 Na+,RDF_Al-Ob,0.0000
7,0.14,(OH)3Al-O-Al(OH)32-+ 180 H2O + 2 Na+,RDF_Al-Ob,0.0000
8,0.16,(OH)3Al-O-Al(OH)32-+ 180 H2O + 2 Na+,RDF_Al-Ob,0.0000
9,0.18,(OH)3Al-O-Al(OH)32-+ 180 H2O + 2 Na+,RDF_Al-Ob,0.0000


In [7]:
# Sanity check: list the source columns that were melted into 'Atom Pair'.
df['Atom Pair'].unique()

array(['RDF_Al-Ob', 'RDF_Al-Oh'], dtype=object)

In [27]:
# Map each (Atom Pair, dimer) combination to the row labels that belong to it.
# NOTE: despite its name, df2 is the mapping returned by .groups, not a DataFrame.
df2 = df.groupby(['Atom Pair', 'dimer']).groups

In [29]:
# Inspect the (Atom Pair, dimer) -> row-label mapping.
# NOTE(review): the saved output lists 'RCN_*' pairs, which the unique() check
# on the current frame does not show -- re-run this cell to refresh it.
df2

{('RCN_Al-Ob',
  ' (OH)3Al-(OH)-Al(OH)3- + 179 H2O + HO- + 2 Na+'): Int64Index([1804, 1805, 1806, 1807, 1808, 1809, 1810, 1811, 1812, 1813,
             ...
             1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005],
            dtype='int64', length=202),
 ('RCN_Al-Ob',
  '(OH)2Al-O2-Al(OH)2- + 181 H2O + 2 Na+'): Int64Index([2206, 2207, 2208, 2209, 2210, 2211, 2212, 2213, 2214, 2215,
             ...
             2396, 2397, 2398, 2399, 2400, 2401, 2402, 2403, 2404, 2405],
            dtype='int64', length=200),
 ('RCN_Al-Ob',
  '(OH)3Al-(OH)2 -Al(OH)3- + 179 H2O + 2 Na+'): Int64Index([2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
             ...
             2196, 2197, 2198, 2199, 2200, 2201, 2202, 2203, 2204, 2205],
            dtype='int64', length=200),
 ('RCN_Al-Ob',
  '(OH)3Al-O-Al(OH)32-+ 180 H2O + 2 Na+'): Int64Index([1604, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613,
             ...
             1794, 1795, 1796, 1797, 1798, 1799, 1800,

In [18]:
# `df` is in long format, so draw one curve per (Atom Pair, dimer) group:
# x is that group's 'r' values, y is its 'inter atom distance' values.
# (Iterating over the frame's columns would also pick up the string columns
# 'dimer' and 'Atom Pair' and give xs/ys of different lengths.)
curve_groups = df.groupby(['Atom Pair', 'dimer'])
xs = [group['r'].tolist() for _, group in curve_groups]
ys = [group['inter atom distance'].tolist() for _, group in curve_groups]

# One colour per curve; cycle through the palette so that more than
# len(Spectral11) curves still get a colour each.
num_lines = len(xs)
my_palette = [Spectral11[i % len(Spectral11)] for i in range(num_lines)]

fig = figure(
    title='RDF curves per atom pair and dimer',
    x_axis_label='r',
    y_axis_label='inter atom distance',
)
fig.multi_line(
    xs=xs,
    ys=ys,
    line_color=my_palette
)
show(fig)

