# Import

CSV -> raw `DataFrames`

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 10)

In [None]:
# Load the raw tables from CSV files in the working directory.
# Sample metadata: later cells read its 'Sample', 'Biological replicate' and
# 'Time after arrest and release (minutes)' columns.
sample_information_df = pd.read_csv('sample-information.csv')
# Measurement tables. Later cells use 'ensembl_gene_id' (mRNA, protein),
# 'uniprotswissprot_unique' (phosphorylated protein) and
# 'metabolite' / 'super_pathway' (metabolites) from these frames.
# NOTE(review): the per-sample measurement columns are presumably the ones
# whose names start with 'S' (that is how df_wrangle picks them) -- confirm.
mRNA_raw = pd.read_csv('mRNA.csv')
protein_raw = pd.read_csv('protein.csv')
phosphorylated_protein_raw = pd.read_csv('phosphorylated-protein.csv')
metabolites_raw = pd.read_csv('metabolites.csv')

In [None]:
sample_information_df

In [None]:
mRNA_raw

In [None]:
protein_raw

In [None]:
metabolites_raw

# Subsetting

## Genes/proteins: flavoproteins only

In [None]:
# Load the flavoprotein reference table
flavoproteins = pd.read_csv('flavoproteins-gudipati2014-ho2018.csv')

# Name-translation helpers. Both look names up in `flavoproteins`, so they
# only work for genes listed in that table.
def sgd_to_ensembl(name):
    '''Return the Ensembl gene ID of the first row whose sgd_name equals `name`.

    Raises IndexError if no row of `flavoproteins` matches.
    '''
    matching_ids = flavoproteins.loc[flavoproteins['sgd_name'] == name, 'ensembl_gene_id']
    return matching_ids.tolist()[0]

def ensembl_to_sgd(name):
    '''Return the sgd_name of the first row whose ensembl_gene_id equals `name`.

    Raises IndexError if no row of `flavoproteins` matches.
    '''
    matching_names = flavoproteins.loc[flavoproteins['ensembl_gene_id'] == name, 'sgd_name']
    return matching_names.tolist()[0]

flavoproteins

In [None]:
# Bar chart of flavoprotein abundances
%matplotlib qt

top = 18

plt.barh(list(reversed(flavoproteins[0:top].sgd_name.to_list())),
         list(reversed(flavoproteins[0:top].mean_abundance.to_list())))
plt.xlabel('Mean protein abundance (molecules/cell)')
plt.ylabel('Protein')
plt.yticks(fontsize = 8)
plt.show()

In [None]:
# Keep only the mRNA rows whose Ensembl gene ID is listed in `flavoproteins`.
# `isin` does the membership test in a single vectorised call, rather than
# rebuilding the ID list and scanning it once per row.
mRNA_flavo_bool = mRNA_raw.ensembl_gene_id.isin(flavoproteins.ensembl_gene_id)
mRNA_flavo = mRNA_raw[mRNA_flavo_bool]
mRNA_flavo

In [None]:
# Keep only the protein rows whose Ensembl gene ID is listed in `flavoproteins`.
# `isin` does the membership test in a single vectorised call, rather than
# rebuilding the ID list and scanning it once per row.
protein_flavo_bool = protein_raw.ensembl_gene_id.isin(flavoproteins.ensembl_gene_id)
protein_flavo = protein_raw[protein_flavo_bool]
protein_flavo

## Metabolites: lipid only

In [None]:
# Keep only the metabolites whose super_pathway is exactly 'Lipid'
is_lipid = metabolites_raw['super_pathway'] == 'Lipid'
metabolites_lipid = metabolites_raw.loc[is_lipid]
metabolites_lipid

# Data wrangling

(Why they put _all_ replicates into columns is beyond me...)

Intended result

- Only one column on the left that functions as ID
- Rows indicate genes/proteins/metabolites.  Columns indicate time

    - They indicate time rather than time points because time point 7 is inconsistent between replicates.  (Why are they like this?!)
    
- Add a column that corresponds to biological replicate.

In [None]:
# Build a plain dict: Sample -> [biological replicate, time after release].
# A dict lookup is much cheaper than querying sample_information_df for
# every sample.
sample_information = {
    sample: replicate_and_time
    for sample, replicate_and_time in zip(
        sample_information_df.Sample.to_list(),
        sample_information_df[
            ['Biological replicate', 'Time after arrest and release (minutes)']
        ].to_numpy().tolist(),
    )
}

In [None]:
# massage imported df into the form we want

def df_wrangle(raw_df, id_column, sample_info=None):
    '''Reshape an imported table into one row per (replicate, ID) and one column per time.

    Columns whose name starts with 'S' are treated as per-sample measurement
    columns; every other column is treated as metadata.  The sample columns
    are melted into long form, each sample name is translated to its
    replicate and time, and the result is pivoted so that time runs along
    the columns.

    Parameters
    ----------
    raw_df: pandas.DataFrame
        raw imported DataFrame from supplementary information
    id_column: string
        the (metadata) column to retain to ID the time series
    sample_info: dict, optional
        maps sample name -> [replicate, time].  Defaults to the notebook-level
        `sample_information` dictionary.

    Returns
    -------
    pandas.DataFrame
        indexed by ('replicate', id_column); the columns are a two-level
        index whose first level is 'value' and whose second level, named
        'time', holds the sample times.

    Raises
    ------
    KeyError
        if a sample column is missing from `sample_info`.
    '''
    if sample_info is None:
        # fall back to the lookup built from sample_information_df
        sample_info = sample_information

    # everything that does not begin with 'S' is metadata, not a sample
    metadata_columns = [
        column for column in raw_df.columns
        if not str(column).startswith('S')
    ]

    # melt: one row per (metadata row, sample)
    long_df = raw_df.melt(id_vars=metadata_columns, var_name='Sample')

    # convert Sample to replicate and time; indexing the dict directly keeps
    # the KeyError for samples that are not described in sample_info
    replicate_time = pd.DataFrame(
        [sample_info[sample] for sample in long_df['Sample']],
        columns=['replicate', 'time'],
        index=long_df.index,
    )
    tidy = pd.concat([long_df[[id_column, 'value']], replicate_time], axis=1)

    # put replicate, id_column, time into a multiindex, then move time to the columns
    indexed = tidy.set_index(['replicate', id_column, 'time'])[['value']]
    return indexed.unstack('time')

# Reshape each table with df_wrangle, keyed by the ID column that table uses.
# mRNA and protein use the flavoprotein-only subsets, metabolites the
# lipid-only subset, and phosphorylated protein the full raw table.
mRNA_df = df_wrangle(mRNA_flavo, 'ensembl_gene_id')
protein_df = df_wrangle(protein_flavo, 'ensembl_gene_id')
phosphorylated_protein_df = df_wrangle(phosphorylated_protein_raw, 'uniprotswissprot_unique')
metabolites_df = df_wrangle(metabolites_lipid, 'metabolite')

In [None]:
mRNA_df

In [None]:
protein_df

In [None]:
phosphorylated_protein_df

In [None]:
metabolites_df

# Visualisation

Define functions

In [None]:
from matplotlib import cm

# Cell-cycle phase intervals, one row per interval: (start_time, end_time, phase).
# Each interval starts where the previous one ends.
cdc_def = pd.DataFrame(
    [
        (0, 21, 'G1'),
        (21, 50, 'S'),
        (50, 80.5, 'G2/M'),
        (80.5, 98, 'G1'),
        (98, 126.5, 'S'),
        (126.5, 148, 'G2/M'),
        (148, 158, 'G1'),
    ],
    columns=['start_time', 'end_time', 'phase'],
)

# Background colour per phase: the first three entries of the Pastel1 colormap
cdc_colourmap = {
    phase: cm.Pastel1(colour_index)
    for colour_index, phase in enumerate(['G1', 'S', 'G2/M'])
}

def _plot_time_series(df, name, title, replicates=(1, 2, 3)):
    '''Plot one line per replicate for row `name` of `df`, shaded by cell-cycle phase.

    Parameters
    ----------
    df: pandas.DataFrame
        frame indexed by (replicate, ID) whose columns carry a 'time' level
    name: string
        ID of the row to plot, as it appears in the second index level
    title: string
        title of the figure
    replicates: iterable
        replicate labels to plot; a missing (replicate, name) row raises KeyError
    '''
    for replicate in replicates:
        # look the row up once and reuse it for both axes
        series = df.loc[(replicate, name)]
        plt.plot(
            series.index.get_level_values('time').to_numpy(),
            series.to_numpy()
        )
    # shade by cdc phase
    for start_time, end_time, cdc_phase in zip(
        cdc_def['start_time'], cdc_def['end_time'], cdc_def['phase']
    ):
        plt.axvspan(start_time, end_time, facecolor=cdc_colourmap[cdc_phase])
    plt.xlabel('Time (min)')
    plt.ylabel('Expression')
    plt.title(title)
    plt.show()


def plot_genes(df, sgd_name):
    '''Plot all replicates of one flavoprotein gene, identified by its SGD name.

    The SGD name is translated with sgd_to_ensembl to find the row in `df`
    and is used as the figure title.
    '''
    _plot_time_series(df, sgd_to_ensembl(sgd_name), title=sgd_name)


def plot_replicates(df, name):
    '''Plot all replicates of the row `name` of `df`; `name` is also the figure title.'''
    _plot_time_series(df, name, title=name)

#plot_genes(mRNA_df, 'fas1')

def _heatmap(df, id_level, row_order=None):
    '''Draw a heatmap of the replicate-averaged values: rows are IDs, x axis is time.

    Parameters
    ----------
    df: pandas.DataFrame
        frame indexed by (replicate, ID) whose columns carry a 'time' level
    id_level: string
        name of the ID index level, used to label the rows
    row_order: list, optional
        IDs in the order the rows should be drawn, first entry on top.
        When omitted, the order produced by the groupby is kept.
    '''
    # Compute average df across replicates
    df_avg = pd.concat([df.loc[replicate] for replicate in [1, 2, 3]]).groupby(level=0).mean()
    if row_order is not None:
        df_avg = df_avg.reindex(row_order)

    # The sample times are used directly as x coordinates, so uneven spacing
    # between time points is kept.  Rows count down so the first one is on top.
    # NOTE(review): X and Y have the same shape as the data, i.e. they are
    # cell centres; how pcolormesh draws that depends on the installed
    # Matplotlib's default `shading` -- confirm.
    time_axis = df.columns.get_level_values('time').to_numpy()
    vert_axis = np.linspace(len(df_avg) - 1, 0, len(df_avg))

    # Meshgrid
    X, Y = np.meshgrid(time_axis, vert_axis)
    plt.pcolormesh(X, Y, df_avg)

    # Labels
    plt.xlabel('Time (min)')
    plt.yticks(
        vert_axis,
        df_avg.index.get_level_values(id_level).to_list()
    )
    plt.show()


def heatmap_genes(df):
    '''Heatmap of replicate-averaged values per gene, rows in the order of the `flavoproteins` table.

    Genes listed in `flavoproteins` but absent from `df` still get a row
    (reindex fills it with NaN).
    '''
    _heatmap(
        df,
        'ensembl_gene_id',
        row_order=flavoproteins.ensembl_gene_id.to_list()
    )


def heatmap_metabolites(df):
    '''Heatmap of replicate-averaged values per metabolite.'''
    _heatmap(df, 'metabolite')

## mRNA

In [None]:
np.array([ensembl_to_sgd(name) for name in mRNA_df.index.get_level_values('ensembl_gene_id')])

In [None]:
# SGD names of the flavoprotein genes to plot (one figure per gene)
gene_list = ['fas1', 'yhb1', 'ura1', 'pst2', 'trr1', 'ilv2', 'oye2', 'dld3', 'pdx3']

# mRNA time series, all replicates per gene
for gene in gene_list:
    plot_genes(mRNA_df, gene)

In [None]:
heatmap_genes(mRNA_df)

## Protein

In [None]:
np.array([ensembl_to_sgd(name) for name in protein_df.index.get_level_values('ensembl_gene_id')])

In [None]:
# SGD names of the flavoprotein genes to plot (one figure per gene)
gene_list = ['fas1', 'yhb1', 'ura1', 'pst2', 'trr1', 'ilv2', 'oye2', 'dld3', 'pdx3']

# protein time series, all replicates per gene
for gene in gene_list:
    plot_genes(protein_df, gene)

In [None]:
heatmap_genes(protein_df)

## Metabolites

In [None]:
metabolites_df.index.get_level_values('metabolite').to_list()

In [None]:
plot_replicates(metabolites_df, 'palmitate (16:0)')

In [None]:
heatmap_metabolites(metabolites_df)