In [2]:
import numpy as np
import pandas as pd

import scipy.special

import bokeh_catplot

import bokeh.io
import bokeh.plotting

bokeh.io.output_notebook()

In [14]:
# Wide-format count table: one row per amino-acid sequence, one count column
# per sample.
df_blue = pd.read_csv('FSRI_Research/blue-5mer_rd1_010.csv')

# Preview the first five rows.
df_blue.head(5)

Unnamed: 0,Amino Acid,del2_actin_1_brain count,del2_actin_1_heart count,del2_actin_2_brain count,del2_actin_2_heart count,del2_gfap_1_heart count,del2_gfap_2_brain count,del2_gfap_2_heart count,del2_gfap_3_heart count,del2_hsyn_1_heart count,del2_hsyn_2_heart count,del2_hsyn_3_heart count,del2_tek_1_heart count,del2_tek_1_kidney count,del2_tek_2_kidney count
0,#ALMP,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,#FLAI,0,0,0,0,0,0,0,0,0,0,0,0,0,6
2,#GALV,0,0,0,0,0,0,0,0,0,0,0,0,0,3
3,#GFG#,0,0,0,0,0,0,2,0,0,0,0,0,0,0
4,#KQCP,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [3]:
# Scatter plot of two count columns of the wide table against each other:
# 'del2_actin_1_brain count' on x, 'del2_actin_1_heart count' on y.
p = bokeh.plotting.figure(
    frame_height=300,
    frame_width=400,
    y_axis_label='del2_actin_1_heart count',
    x_axis_label='del2_actin_1_brain count',
)

# Column names are resolved against the data frame passed as `source`.
p.circle(
    y='del2_actin_1_heart count',
    x='del2_actin_1_brain count',
    source=df_blue,
)

bokeh.io.show(p)

In [4]:
# Tidy (long-format) version of the count data: one row per
# (Amino_acid, cell_type, replicate, tissue) with its count in '_count_'.
df_tidyblue = pd.read_csv('FSRI_Research/complete_blue_5mer.csv')

# Preview the first five rows (the default for head()).
df_tidyblue.head()

Unnamed: 0,Amino_acid,cell_type,replicate,tissue,_count_
0,AAILL,actin,1,brain,3
1,AFGFS,actin,1,brain,98
2,AVKDS,actin,1,brain,1
3,CCDRG,actin,1,brain,5
4,CVRGG,actin,1,brain,1


# TIDY DATA UTILIZATION BELOW:

First, since there are so many data points, I will verify that all counts have been transferred by checking that every type of count (each cell type / replicate / tissue combination) is accounted for, as seen below:

In [16]:
# Expected [cell_type, replicate, tissue] combinations, one per sample.
# Each entry is a list of three strings (the replicate is kept as text).
check_ID_list = [
    sample.split('_')
    for sample in (
        'actin_1_brain',
        'actin_1_heart',
        'actin_2_brain',
        'actin_2_heart',
        'gfap_1_heart',
        'gfap_2_brain',
        'gfap_2_heart',
        'gfap_3_heart',
        'hsyn_1_heart',
        'hsyn_2_heart',
        'hsyn_3_heart',
        'tek_1_heart',
        'tek_1_kidney',
        'tek_2_kidney',
    )
]

In [24]:
def complete_data(df, expected_ids=None):
    """Check that every row of a tidy count table has an expected sample ID.

    The sample ID of a row is its (cell_type, replicate, tissue) triple.

    Parameters
    ----------
    df : pandas.DataFrame
        Tidy table with 'cell_type', 'replicate' and 'tissue' columns.
    expected_ids : iterable of 3-item sequences, optional
        Allowed (cell_type, replicate, tissue) combinations. When omitted,
        the notebook-level ``check_ID_list`` is used.

    Returns
    -------
    bool
        True if the ID of every row is one of the expected combinations
        (an empty table therefore yields True), False otherwise.

    Notes
    -----
    IDs are compared as strings, so a replicate held as the integer 1 in the
    table matches the entry '1' in the expected list.

    Only the rows of ``df`` are checked against the expected list; the
    function does not verify that each expected combination occurs in ``df``.
    """
    if expected_ids is None:
        expected_ids = check_ID_list

    # Normalise to tuples of strings so lookups are hashable and independent
    # of whether the replicate was parsed as int or str.
    expected = {tuple(str(part) for part in combo) for combo in expected_ids}

    id_columns = ['cell_type', 'replicate', 'tissue']
    observed = zip(*(df[column].astype(str) for column in id_columns))

    return all(combo in expected for combo in observed)

In [25]:
# Run the sample-ID check on the tidy table; the boolean result is displayed.
complete_data(df_tidyblue)

True

## Plots below:

In [34]:
# Strip plot of every count in the tidy table, grouped by tissue.
complete_catplot = bokeh_catplot.strip(
    data=df_tidyblue,
    val='_count_',
    cats='tissue',
    y_axis_label='Count',
    x_axis_label='Tissue',
    frame_width=400,
    frame_height=300,
)

bokeh.io.show(complete_catplot)

In [33]:
# ECDF of the counts in the tidy table, grouped by tissue.
complete_ecdf = bokeh_catplot.ecdf(
    data=df_tidyblue,
    val='_count_',
    cats='tissue',
    frame_height=400,
    frame_width=700,
)

bokeh.io.show(complete_ecdf)

In [35]:
# Strip plot of counts grouped by (tissue, replicate), coloured by tissue.
complete_strip = bokeh_catplot.strip(
    data=df_tidyblue,
    val='_count_',
    cats=['tissue', 'replicate'],
    width=550,
    color_column='tissue',
)

bokeh.io.show(complete_strip)

In [59]:
# Expected [cell_type, replicate, tissue] combinations, grouped by cell type.
# NOTE(review): this re-declares the same 14 entries as the check_ID_list
# cell near the top of the notebook.
check_ID_list = [
    list(combo)
    for combo in [
        # actin
        ('actin', '1', 'brain'), ('actin', '1', 'heart'),
        ('actin', '2', 'brain'), ('actin', '2', 'heart'),
        # gfap
        ('gfap', '1', 'heart'), ('gfap', '2', 'brain'),
        ('gfap', '2', 'heart'), ('gfap', '3', 'heart'),
        # hsyn
        ('hsyn', '1', 'heart'), ('hsyn', '2', 'heart'), ('hsyn', '3', 'heart'),
        # tek
        ('tek', '1', 'heart'), ('tek', '1', 'kidney'), ('tek', '2', 'kidney'),
    ]
]

In [81]:
# Brain rows of the tidy table. Built here so this cell runs on a fresh
# kernel: in file order, the cell that defines df_brain only appears further
# down, so top-to-bottom execution would otherwise raise NameError.
df_brain = df_tidyblue.loc[df_tidyblue['tissue'] == 'brain']

# benchmark_010 for brain: the mean of '_count_' over the brain rows.
brain_benchmark_010 = np.mean(df_brain['_count_'])

brain_benchmark_010

20.463768115942027

In [85]:
# Heart rows of the tidy table. Built here so this cell runs on a fresh
# kernel: in file order, the cell that defines df_heart only appears further
# down, so top-to-bottom execution would otherwise raise NameError.
df_heart = df_tidyblue.loc[df_tidyblue['tissue'] == 'heart']

# benchmark_010 for heart: the mean of '_count_' over the heart rows.
heart_benchmark_010 = np.mean(df_heart['_count_'])

heart_benchmark_010

373.82225738396625

In [86]:
# Kidney rows of the tidy table. Built here so this cell runs on a fresh
# kernel: in file order, the cell that defines df_kidney only appears further
# down, so top-to-bottom execution would otherwise raise NameError.
df_kidney = df_tidyblue.loc[df_tidyblue['tissue'] == 'kidney']

# benchmark_010 for kidney: the mean of '_count_' over the kidney rows.
kidney_benchmark_010 = np.mean(df_kidney['_count_'])

kidney_benchmark_010

41.107116104868915

## benchmark_010 made as the mean

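## benchmark_011 made as the mean plus a tissue-specific margin: 5 standard deviations above the mean for brain, 15 standard deviations for heart, and 200 times the mean for kidney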

In [100]:
# benchmark_011 for brain: the brain mean plus five standard deviations of
# the brain counts.
brain_benchmark_011 = brain_benchmark_010 + 5 * np.std(df_brain['_count_'])

brain_benchmark_011

235.8038039961753

In [111]:
# benchmark_011 for heart: the heart mean plus fifteen standard deviations of
# the heart counts (a wider margin than the five used for brain).
heart_benchmark_011 = heart_benchmark_010 + 15 * np.std(df_heart['_count_'])

heart_benchmark_011

105016.45066320969

In [136]:
# benchmark_011 for kidney: the kidney mean plus 200 times the mean of the
# kidney counts.
# NOTE(review): unlike the brain and heart thresholds this scales the mean,
# not the standard deviation -- confirm that is intended.
kidney_benchmark_011 = kidney_benchmark_010 + 200 * np.mean(df_kidney['_count_'])

kidney_benchmark_011

8262.530337078653

# The benchmark_010 test is not shown: it left far too many rows to continue testing with.
# Rows from each tissue data frame that exceed benchmark_011:

In [127]:
# Brain rows whose count is strictly greater than the brain benchmark_011.
top_brain_seq = df_brain[df_brain['_count_'].gt(brain_benchmark_011)]

top_brain_seq

Unnamed: 0,Amino_acid,cell_type,replicate,tissue,_count_
12,GAGIW,actin,1,brain,282
47,RFTQG,actin,1,brain,378
57,SGSRV,actin,1,brain,384


In [138]:
# Heart rows whose count is strictly greater than the heart benchmark_011.
top_heart_seq = df_heart[df_heart['_count_'].gt(heart_benchmark_011)]

top_heart_seq

Unnamed: 0,Amino_acid,cell_type,replicate,tissue,_count_
870,DGCTK,gfap,1,heart,133137
1533,SNSLF,gfap,1,heart,112289
3026,FASGS,gfap,2,heart,633884
8014,GPAWY,gfap,3,heart,112081


In [137]:
# Kidney rows whose count is strictly greater than the kidney benchmark_011.
top_kidney_seq = df_kidney[df_kidney['_count_'].gt(kidney_benchmark_011)]

top_kidney_seq

Unnamed: 0,Amino_acid,cell_type,replicate,tissue,_count_
12489,KNIKE,tek,1,kidney,103296
12592,LSADA,tek,1,kidney,15238
12822,RGGAS,tek,1,kidney,70288
12842,RLGPW,tek,1,kidney,35757
12864,RSAGL,tek,1,kidney,27238
12899,SCFPW,tek,1,kidney,15840
13130,TSGDA,tek,1,kidney,35614
17607,LMPIF,tek,2,kidney,22816


# benchmark_012 

In [144]:
# Re-display the brain mean (benchmark_010) for reference.
brain_benchmark_010

20.463768115942027

In [146]:
# Standard deviation of the brain counts, as used in brain_benchmark_011.
np.std(df_brain['_count_'])

43.068007176046656

In [153]:
# Heart rows with a count of at most 1.
bottom_heart = df_heart[df_heart['_count_'].le(1)]

bottom_heart

Unnamed: 0,Amino_acid,cell_type,replicate,tissue,_count_
86,#KQCP,actin,1,heart,1
87,AEVRN,actin,1,heart,1
88,AFEGS,actin,1,heart,1
90,AFGGS,actin,1,heart,1
91,AGIPV,actin,1,heart,1
92,AGVVR,actin,1,heart,1
93,ALGRG,actin,1,heart,1
94,ALQFC,actin,1,heart,1
95,AMVGA,actin,1,heart,1
97,ARGGV,actin,1,heart,1


In [157]:
# Kidney rows with a count of at most 1.
bottom_kidney = df_kidney[df_kidney['_count_'].le(1)]

bottom_kidney

Unnamed: 0,Amino_acid,cell_type,replicate,tissue,_count_
11861,AAPQE,tek,1,kidney,1
11862,AATCC,tek,1,kidney,1
11864,ACLLR,tek,1,kidney,1
11865,ACVKE,tek,1,kidney,1
11867,ADAYG,tek,1,kidney,1
11871,ADVIG,tek,1,kidney,1
11873,AEDYS,tek,1,kidney,1
11875,AGDRA,tek,1,kidney,1
11877,AGENW,tek,1,kidney,1
11879,AGGGR,tek,1,kidney,1


In [141]:
# Heart rows whose count is strictly below the heart benchmark_011.
# NOTE(review): this reuses the name top_heart_seq, which an earlier cell
# assigns to the rows *above* the benchmark -- confirm the overwrite is
# intended.
top_heart_seq = df_heart[df_heart['_count_'].lt(heart_benchmark_011)]

top_heart_seq

Unnamed: 0,Amino_acid,cell_type,replicate,tissue,_count_


In [90]:
# Rows of the tidy table whose tissue is 'brain'.
df_brain = df_tidyblue[df_tidyblue['tissue'].eq('brain')]

In [53]:
# Strip plot of the brain counts, grouped by (replicate, cell_type) and
# coloured by replicate.
df_brain_plot = bokeh_catplot.strip(
    data=df_brain,
    val='_count_',
    cats=['replicate', 'cell_type'],
    width=550,
    color_column='replicate',
)

bokeh.io.show(df_brain_plot)

In [54]:
# Rows of the tidy table whose tissue is 'kidney'.
df_kidney = df_tidyblue[df_tidyblue['tissue'].eq('kidney')]

In [55]:
# Rows of the tidy table whose tissue is 'heart'.
df_heart = df_tidyblue[df_tidyblue['tissue'].eq('heart')]

In [57]:
# Strip plot of the kidney counts, grouped by (replicate, cell_type) and
# coloured by replicate.
df_kidney_plot = bokeh_catplot.strip(
    data=df_kidney,
    val='_count_',
    cats=['replicate', 'cell_type'],
    width=550,
    color_column='replicate',
)

bokeh.io.show(df_kidney_plot)

In [58]:
# Strip plot of the heart counts, grouped by (replicate, cell_type) and
# coloured by replicate.
df_heart_plot = bokeh_catplot.strip(
    data=df_heart,
    val='_count_',
    cats=['replicate', 'cell_type'],
    width=550,
    color_column='replicate',
)

bokeh.io.show(df_heart_plot)

In [None]:
# Strip plot of the counts for the actin samples only.
# 'actin' is a value of the cell_type column (the tissues are brain, heart
# and kidney), so the rows are selected on cell_type; filtering on tissue
# would select nothing. The grouping mirrors the per-tissue strip plots, with
# tissue taking the place of cell_type.
actin_plot = bokeh_catplot.strip(
    data = df_tidyblue.loc[df_tidyblue['cell_type'] == 'actin'],
    cats = ['replicate', 'tissue'],
    val = '_count_',
    color_column = 'replicate',
    width = 550
)

bokeh.io.show(actin_plot)

In [39]:
# Print the tissue (the last of the three fields) of every expected sample ID.
for item in check_ID_list:
    print(item[-1])

brain
heart
brain
heart
heart
brain
heart
heart
heart
heart
heart
heart
kidney
kidney


In [None]:
tissue_list = ['brain', 'heart', 'kidney']
cell_type_list = ['actin', 'hsyn', 'tek', 'gfap']

# Sub-frame of the tidy table for every expected (cell_type, tissue) pair.
# The selection result is stored in a dict so the loop is not a no-op.
# The filter does not look at the replicate, so all replicates of a pair
# share one entry.
df_by_cell_tissue = {}
for item_list in check_ID_list:
    df_by_cell_tissue[(item_list[0], item_list[2])] = df_tidyblue.loc[
        (df_tidyblue['tissue'] == item_list[2])
        & (df_tidyblue['cell_type'] == item_list[0])
    ]
    

In [74]:
# ECDF of the brain counts with no categorical grouping (cats=None).
df_brain_ecdf = bokeh_catplot.ecdf(
    data=df_brain,
    val='_count_',
    cats=None,
    width=550,
)

bokeh.io.show(df_brain_ecdf)

In [78]:
# ECDF of the kidney counts with no categorical grouping (cats=None).
df_kidney_ecdf = bokeh_catplot.ecdf(
    data=df_kidney,
    val='_count_',
    cats=None,
    width=550,
)

bokeh.io.show(df_kidney_ecdf)

In [79]:
# ECDF of the heart counts with no categorical grouping (cats=None).
df_heart_ecdf = bokeh_catplot.ecdf(
    data=df_heart,
    val='_count_',
    cats=None,
    width=550,
)

bokeh.io.show(df_heart_ecdf)