In [1]:
# This notebook checks whether the proteome and transcriptome data
# of the big data project are normally distributed (Shapiro-Wilk test).
# Import all necessary packages
import pandas as pd
import numpy as np
from scipy.stats import shapiro

In [3]:
# Check normality of the proteome data: one Shapiro-Wilk test per sample
# column, results written to an Excel sheet.
# First, read in the proteome data
proteome_data = pd.read_csv("proteome_data_20201007.csv")
# Sample columns are named S1 ... S27
sample_name_list = ['S'+str(i) for i in range(1,28)]

# Significance level, set once and actually used for the decision below:
# p_value > alpha means normality is not rejected.
alpha = 0.05

statistics=[]
ps=[]
normals=[]
for name in sample_name_list:
    # Drop missing values of this column before running the test
    data_without_na = proteome_data[name].dropna()
    # NOTE(review): scipy documents that the Shapiro-Wilk p-value may be
    # inaccurate for N > 5000 -- confirm the column lengths are below that.
    st,p_value=shapiro(data_without_na)
    normality = bool(p_value>alpha)
    statistics.append(st)
    ps.append(p_value)
    normals.append(normality)

# One row per sample: test statistic, p-value and the normality verdict
proteome_data_normality_df = pd.DataFrame({'sample':sample_name_list,
                                           'stats':statistics,
                                           'p_val':ps,
                                           'normality':normals
                                           })
proteome_data_normality_df.to_excel('proteome_data_normality_check.xlsx')

In [4]:
# Check normality of the transcriptome data: one Shapiro-Wilk test per
# dilution-rate column, results written to an Excel sheet.
# First, read in the transcriptome data
transcripteome_data = pd.read_csv("Transcripteome20210906.csv")
sample_name_list = ['D025','D05','D1','D15','D2','D25','D3','D35','D4']

# Significance level, set once and actually used for the decision below:
# p_value > alpha means normality is not rejected.
alpha = 0.05

statistics=[]
ps=[]
normals=[]
for name in sample_name_list:
    # Drop missing values of this column before running the test
    data_without_na = transcripteome_data[name].dropna()
    # NOTE(review): scipy documents that the Shapiro-Wilk p-value may be
    # inaccurate for N > 5000 -- confirm the column lengths are below that.
    st,p_value=shapiro(data_without_na)
    normality = bool(p_value>alpha)
    statistics.append(st)
    ps.append(p_value)
    normals.append(normality)

# One row per sample: test statistic, p-value and the normality verdict
transcripteome_data_normality_df = pd.DataFrame({'sample':sample_name_list,
                                                 'stats':statistics,
                                                 'p_val':ps,
                                                 'normality':normals
                                                 })
transcripteome_data_normality_df.to_excel('Transcripteome_data_normality_check.xlsx')



In [57]:
# Check normality of each individual mRNA across all dilution rates.
# The sheet is transposed so that every mRNA becomes one column.
transcripteome_data_individual_mRNA = pd.read_excel("Supp_data3_transcriptome.xlsx",index_col=0)
transcripteome_data_individual_mRNA = transcripteome_data_individual_mRNA.T
mRNA_name_list = transcripteome_data_individual_mRNA.columns.to_list()
# The last column is excluded from the test.
# NOTE(review): presumably a non-gene row in the source sheet -- confirm.
mRNA_name_list = mRNA_name_list[:-1]

# Significance level, set once and actually used for the decision below:
# p_value > alpha means normality is not rejected.
alpha = 0.05

statistics=[]
ps=[]
normals=[]
for name in mRNA_name_list:
    # Drop missing values of this mRNA before running the test
    data_without_na = transcripteome_data_individual_mRNA[name].dropna()
    st,p_value=shapiro(data_without_na)
    normality = bool(p_value>alpha)
    statistics.append(st)
    ps.append(p_value)
    normals.append(normality)

# One row per mRNA: test statistic, p-value and the normality verdict
individual_mRNA_data_normality_df = pd.DataFrame({'sample':mRNA_name_list,
                                                  'stats':statistics,
                                                  'p_val':ps,
                                                  'normality':normals
                                                  })
individual_mRNA_data_normality_df.to_csv("Individual_mRNA_normality.csv")

In [78]:
# Check normality of each individual protein across all dilution rates.
# The table is indexed by its 4th column and transposed so that every
# protein becomes one column.
protein_data_individual_gene = pd.read_csv("proteome_data_20201007.csv",index_col=3)
# Keep the transposed rows from position 4 up to (not including) the last one.
# NOTE(review): presumably this strips annotation columns of the raw file -- confirm.
protein_data_individual_gene = protein_data_individual_gene.T.iloc[4:-1,:]
protein_name_list = protein_data_individual_gene.columns.to_list()

# Significance level, set once and actually used for the decision below:
# p_value > alpha means normality is not rejected.
alpha = 0.05

valid_protein_names=[]
statistics=[]
ps=[]
normals=[]
for name in protein_name_list:
    data_without_na = protein_data_individual_gene[name].dropna()
    # Proteins with 3 or fewer observations are skipped; only the tested
    # ones are recorded in valid_protein_names so all lists stay aligned.
    if len(data_without_na)<=3:
        continue
    st,p_value=shapiro(data_without_na)
    normality = bool(p_value>alpha)
    valid_protein_names.append(name)
    statistics.append(st)
    ps.append(p_value)
    normals.append(normality)

# One row per tested protein: test statistic, p-value and the normality verdict
individual_protein_data_normality_df = pd.DataFrame({'sample':valid_protein_names,
                                                     'stats':statistics,
                                                     'p_val':ps,
                                                     'normality':normals
                                                     })
individual_protein_data_normality_df.to_csv("individual_protein_normality.csv")

In [86]:
# Load the paired mRNA / protein tables from the same workbook.
mRNA_data = pd.read_excel("pvsm.xlsx",sheet_name='mRNA',index_col=0)
protein_data = pd.read_excel("pvsm.xlsx",sheet_name="Protome",index_col=0)
# Keep protein columns at positions 1..9 only.
protein_data = protein_data.iloc[:,1:10]
tmp_colnames = protein_data.columns
# Rename columns containing 'C_' to 'D' + the name without its first 2 characters.
# NOTE(review): the check is a substring match while the slice assumes a
# 2-character prefix -- confirm these column names always start with 'C_'.
protein_data.columns = ['D'+name[2:] if 'C_' in name else name for name in tmp_colnames]
# Transpose so that every gene becomes one column.
protein_data_T = protein_data.T
mRNA_data_T = mRNA_data.T
# Check normality of genes with high spearman correlation coefficient between protein and mRNA
gene_name_list = ['YEL039C','YGR088W','YCR031C','YLR442C','YIR021W','YIL033C',
                  'YNL302C','YLR438W','YFL036W','YDR256C','YLR393W','YPL119C',
                  'YCL044C','YCR010C','YGR087C','YNL312W','YMR267W','YMR152W',
                  'YGL169W','YGR214W','YKL135C','YKL026C','YKR049C','YKR076W',
                  'YBR222C','YBR185C','YJR103W','YHR197W','YEL055C','YER145C',
                  'YIL057C','YJL053W','YIL124W','YIR036C','YIR037W','YPL171C',
                  'YDR195W','YJL054W','YJR074W','YJR134C','YOL033W','YML056C',
                  'YMR175W','YGL236C','YGL228W','YGR248W','YNR040W','YNL274C',
                  'YNL141W','YNL081C','YNL014W','YAL054C','YKR007W','YML131W',
                  'YML127W','YMR278W','YHL007C','YMR298W','YMR090W','YMR107W',
                  'YMR110C','YLR345W','YLR357W','YDR330W','YDL072C','YOR289W',
                  'YDR056C','YLL023C','YLR093C','YDL204W','YOL124C']

# Significance level, set once for both tests:
# p_value > alpha means normality is not rejected.
alpha = 0.01

# Check whether each gene's protein or mRNA distribution is normal.
# The valid_* lists record which genes were actually tested, so names and
# results cannot drift apart when a gene is skipped.
valid_p_names=[]
p_statistics=[]
p_ps=[]
p_normals=[]
valid_m_names=[]
m_statistics=[]
m_ps=[]
m_normals=[]

for name in gene_name_list:
    # --- mRNA ---
    data_without_na = mRNA_data_T[name].dropna()
    # Shapiro-Wilk needs at least 3 observations; genes below that are
    # skipped for mRNA only, the protein test below still runs.
    if len(data_without_na)>=3:
        st,p_value=shapiro(data_without_na)
        normality = bool(p_value>alpha)
        valid_m_names.append(name)
        m_statistics.append(st)
        m_ps.append(p_value)
        m_normals.append(normality)

    # --- protein ---
    data_without_na = protein_data_T[name].dropna()
    # Proteins with 3 or fewer observations are skipped.
    if len(data_without_na)<=3:
        continue
    st,p_value=shapiro(data_without_na)
    normality = bool(p_value>alpha)
    valid_p_names.append(name)
    p_statistics.append(st)
    p_ps.append(p_value)
    p_normals.append(normality)

# One row per tested gene: test statistic, p-value and the normality verdict
highcorr_p_data_normality_df = pd.DataFrame({'sample':valid_p_names,
                                             'stats':p_statistics,
                                             'p_val':p_ps,
                                             'normality':p_normals
                                             })
highcorr_m_data_normality_df = pd.DataFrame({'sample':valid_m_names,
                                             'stats':m_statistics,
                                             'p_val':m_ps,
                                             'normality':m_normals
                                             })

In [85]:
# Write the high-correlation normality tables to disk: mRNA first, then protein.
for frame, out_path in (
    (highcorr_m_data_normality_df, 'highcorr_mRNA_normality.csv'),
    (highcorr_p_data_normality_df, 'highcorr_protein_normality.csv'),
):
    frame.to_csv(out_path)