# worksheet 11b: Proteomics dataset
- Case study: https://www.nature.com/articles/srep37871
### Malaria data
- quantify cerebral protein expression of 748 proteins in 12 mice (samples) using mass spectrometry experiments
- 6 mice per experimental set, two parallel experimental sets (n=12 in total)
- In each experiment: 2 mice = non-infected controls, 2 mice = day 3 post infection, 2 mice = ECM or experimental cerebral malaria (day 8 post infection)
- protein expression values are log2-transformed

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the expression table; index_col=0 makes the first CSV column the row
# index (presumably the protein identifiers — confirm against malaria.csv).
malaria_data = pd.read_csv('malaria.csv', index_col=0)

In [None]:
# Dimensions as (rows, columns); the worksheet header suggests 748 proteins
# by 12 samples — confirm against the displayed output.
malaria_data.shape

In [None]:
# Preview the first rows of the table.
malaria_data.head()

In [None]:
# Column labels; later cells treat each column as one sample.
malaria_data.columns

In [None]:
# Per-column dtypes, to check how each column was parsed from the CSV.
malaria_data.dtypes

### overall statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
malaria_data.describe()

### get count of null values per sample

In [None]:
def count_nulls(col):
    """Count the missing entries of a Series.

    The count is returned as a one-element tuple ``(n,)`` (the shape of the
    missing-value subset), not as a bare integer.
    """
    missing_mask = col.isna()
    n_missing = int(missing_mask.sum())
    return (n_missing,)

In [None]:
# Run count_nulls on every column (DataFrame.apply works column-wise by default).
null_count = malaria_data.apply(count_nulls)

In [None]:
# Label the single result row.
# NOTE(review): this relies on apply() having produced exactly one row, which
# appears to follow from count_nulls returning a one-element shape tuple per
# column — a scalar-returning count_nulls would make this assignment fail.
null_count.index = ['null_count']

In [None]:
# Display the relabelled table of null counts.
null_count

In [None]:
# Select the 'null_count' row: one value per column (sample) of the table.
null_count.loc['null_count']

### plot null values per sample
- plot series

In [None]:
# Per-sample null counts, looked up once and reused for both plot axes.
sample_nulls = null_count.loc['null_count']

fig, ax = plt.subplots()
ax.plot(
    sample_nulls.index,
    sample_nulls.values,
    'ro',
    label='null count per sample'
)
ax.set_ylabel('null count')
# The label passed to plot() is only rendered once a legend is requested.
ax.legend()
# Sample names are long, so rotate and shrink the x tick labels.
ax.tick_params(axis='x', rotation=60, labelsize=5)


### plot raw mean protein expression per sample

In [None]:
# Summary statistics again; the 'mean' row is what the following cells use.
malaria_data.describe()

In [None]:
# Pull out the 'mean' row of the summary: one mean value per described column.
malaria_data.describe().loc['mean']

In [None]:
# Compute the summary table once instead of once per plot argument.
sample_means = malaria_data.describe().loc['mean']

fig, ax = plt.subplots()
ax.plot(
    sample_means.index,
    sample_means.values,
    'ro'
)
ax.set_ylabel('mean log2 expression')
# Sample names are long, so rotate and shrink the x tick labels.
ax.tick_params(axis='x', rotation=60, labelsize=5)

### boxplot of raw protein expression per sample

In [None]:
# Reminder of the wide layout (one column per sample) before reshaping.
malaria_data.head()

In [None]:
# Reshape wide -> long: one row per (sample, expression value) pair.
# All columns are melted (the default when no id_vars/value_vars are given).
# Naming the output columns in melt() itself avoids overwriting them by
# position afterwards, which would silently mislabel them if the column
# order of the melted frame ever changed.
malaria_melted = malaria_data.melt(
    var_name='sample',
    value_name='log2_expr'
)

In [None]:
fig, ax = plt.subplots()
# One notched box per sample, drawn on the axes created above.
malaria_melted.boxplot(
    column='log2_expr', by='sample', ax=ax, notch=True
)
# boxplot(by=...) adds its own figure-level "Boxplot grouped by ..." title;
# clear it so only the axes title set below is shown.
fig.suptitle('')
ax.set_ylabel('log2 expression')
# Sample names are long, so rotate and shrink the x tick labels.
ax.tick_params(axis='x', rotation=60, labelsize=5)
ax.set_title('raw protein expression by sample')
