In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# location of the raw .xls export, relative to the notebook's working directory
file = "data/raw/FC2018_protein_quant.xls"

In [3]:
# read the gastric-disease protein abundance spreadsheet into a DataFrame
gastric_df = pd.read_excel(io=file)

In [4]:
# the first row of the sheet holds the real column titles, so pull it out ...
new_header = gastric_df.iloc[0]
# ... then keep only the rows after it, relabelled with those titles
gastric_df = gastric_df.iloc[1:].set_axis(new_header, axis='columns')

In [5]:
# column positions judged irrelevant by manual inspection of the sheet
to_del = [0, *range(3, 10), *range(34, 109)]
# also discard the raw, un-logged "Corr" value columns
to_del.extend(range(10, 22))

# translate the positions into column labels and drop them in one call
gastric_df = gastric_df.drop(columns=gastric_df.columns[to_del])

In [7]:
# convert X, NaN to 1, 0
# NOTE(review): fillna(0) is applied to the whole frame, so a missing value in any
# other column (abundances, Gene, ID) is also turned into 0 - confirm this is intended.
gastric_df = gastric_df.fillna(0)
# infer_objects() converts the now all-numeric object columns explicitly instead of
# relying on replace()'s implicit downcasting, which newer pandas versions deprecate
gastric_df = gastric_df.replace(['X'], 1).infer_objects()

# set the identifier columns aside before removing them from the numeric table;
# bracket access is used so the lookup can never be shadowed by a DataFrame attribute
gene_list = gastric_df['Gene']
ID_list = gastric_df['ID']
gastric_df = gastric_df.drop(['Gene', 'ID'], axis='columns')

# to_csv does not create missing directories, so make sure the target exists
os.makedirs('data/processed', exist_ok=True)
gastric_df.to_csv('data/processed/gastric_clean_unst.csv')
ID_list.to_csv('data/processed/gastric_ID_list.csv')
gastric_df

Unnamed: 0,NC1 Log2,NC2 Log2,NC3 Log2,NC4 Log2,GC1 Log2,GC2 Log2,GC3 Log2,GC4 Log2,PL1 Log2,PL2 Log2,...,RNA degradation,Mismatch repair,Stomach specific,Mitochondrion,Cytoplasm,Extracellular exosome,Membrane,Nucleus,Endoplasmic reticulum,Golgi apparatus
1,14.141872,14.107314,13.862564,13.757528,14.283957,13.844870,14.446062,14.238953,14.157263,13.412358,...,0,0,0,1,1,1,1,1,0,0
2,18.189801,18.202496,18.049074,18.069828,18.372977,18.240611,18.414903,18.385577,18.464838,18.382029,...,0,0,0,0,1,1,0,1,0,0
3,15.370743,15.220339,14.877490,14.877915,15.250045,15.182754,15.617631,15.404629,15.313874,14.679077,...,0,0,0,0,0,1,0,0,0,0
4,20.695419,20.764910,20.609847,20.648786,20.419828,20.184950,20.393157,20.341481,20.694539,20.345622,...,0,0,0,0,1,0,0,1,0,0
5,19.076272,19.118719,19.159973,19.142240,18.936441,18.792552,18.894393,18.742887,18.848415,18.776311,...,0,0,0,0,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3910,12.901462,13.211006,13.401413,13.283953,13.499713,13.480609,13.207380,13.471890,13.221693,13.314391,...,0,0,0,0,1,0,0,1,0,0
3911,14.153375,14.074614,14.294366,14.275029,14.141097,14.475560,14.173025,14.205150,14.154766,14.456927,...,0,0,0,0,0,0,0,0,0,0
3912,14.537214,14.515092,14.198650,14.548186,13.307382,13.813228,13.612146,13.882917,14.146743,14.460718,...,0,0,0,0,1,0,0,1,0,0
3913,13.652200,13.503825,12.866414,13.433816,13.270009,12.735903,13.123058,13.411304,13.479893,12.886742,...,0,0,0,0,0,0,0,1,0,0


In [22]:
# transpose, convert to patient sample per row
# (the transpose is never modified in place here, so no defensive copy of gastric_df is needed)
patient_gastric_df = gastric_df.T

# remove non-data rows: only the first 12 rows are kept; every row after them is dropped.
# The upper bound is taken from the frame itself rather than hard-coded, so no trailing
# row is left behind if the number of non-data rows differs from the 34 assumed originally.
to_del = list(range(12, len(patient_gastric_df)))
patient_gastric_df = patient_gastric_df.drop(patient_gastric_df.index[to_del])

# to_csv does not create missing directories, so make sure the target exists
os.makedirs('data/processed', exist_ok=True)
patient_gastric_df.to_csv('data/processed/patient_gastric.csv')
patient_gastric_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,3905,3906,3907,3908,3909,3910,3911,3912,3913,3914
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NC1 Log2,14.141872,18.189801,15.370743,20.695419,19.076272,17.123919,17.764218,16.188378,14.182892,14.633818,...,19.991036,17.797028,17.818882,12.053686,18.14145,12.901462,14.153375,14.537214,13.6522,16.352512
NC2 Log2,14.107314,18.202496,15.220339,20.76491,19.118719,17.200443,17.776666,16.313253,14.209927,14.697544,...,20.026804,17.862611,17.746054,11.264267,18.218309,13.211006,14.074614,14.515092,13.503825,16.393447
NC3 Log2,13.862564,18.049074,14.87749,20.609847,19.159973,17.49763,17.692788,16.479957,13.840476,14.573115,...,20.047174,17.761044,17.755131,12.500099,18.095022,13.401413,14.294366,14.19865,12.866414,16.436029
NC4 Log2,13.757528,18.069828,14.877915,20.648786,19.14224,17.435956,17.636674,16.473456,13.860506,14.771464,...,20.020296,17.752938,17.723194,12.528126,18.152042,13.283953,14.275029,14.548186,13.433816,16.495736
GC1 Log2,14.283957,18.372977,15.250045,20.419828,18.936441,17.223402,18.612105,16.191802,13.787683,14.437127,...,20.151233,17.429235,17.67869,10.795212,17.867372,13.499713,14.141097,13.307382,13.270009,16.188035
GC2 Log2,13.84487,18.240611,15.182754,20.18495,18.792552,17.594148,18.522054,16.548121,13.717444,14.746656,...,20.210639,17.27345,17.691193,13.195557,17.831019,13.480609,14.47556,13.813228,12.735903,16.363108
GC3 Log2,14.446062,18.414903,15.617631,20.393157,18.894393,17.30359,18.659673,16.374249,13.90367,14.534994,...,20.235168,17.38021,17.793484,12.078803,17.950243,13.20738,14.173025,13.612146,13.123058,16.35679
GC4 Log2,14.238953,18.385577,15.404629,20.341481,18.742887,17.186565,18.633122,16.189146,13.915617,14.693789,...,20.143621,17.369401,17.713564,12.37826,17.727023,13.47189,14.20515,13.882917,13.411304,16.357408
PL1 Log2,14.157263,18.464838,15.313874,20.694539,18.848415,17.193123,18.207764,16.251555,13.941616,14.651519,...,20.37442,17.941426,17.88006,11.47029,18.051019,13.221693,14.154766,14.146743,13.479893,16.437476
PL2 Log2,13.412358,18.382029,14.679077,20.345622,18.776311,17.732135,18.133969,16.610072,13.794923,15.039762,...,20.509983,17.718124,17.723361,13.456666,17.954646,13.314391,14.456927,14.460718,12.886742,16.629294
