In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [3]:
# Folder, relative to the working directory, that the cells below read their
# input files from and write the generated CSV files to.
data_folder = "../data"

In [30]:
# Load the protein-array table and reshape it to one row per sample and one
# column per protein.
proteins = pd.read_csv(
    os.path.join(data_folder, 'Combined_QuantileNorm_PairAvg_IgG_532nm_Green.tsv'),
    sep='\t',
)

# Discard rows whose 'Caveat' flag is True (rows with a missing flag are kept).
proteins = proteins[proteins['Caveat'] != True]

# Drop the remaining annotation columns so that only 'Name' and the
# per-sample columns are left.
proteins = proteins.drop(['Block', 'Row', 'Column', 'ID', 'Caveat'], axis=1)

# Move 'Name' into the index BEFORE transposing. Transposing with the string
# 'Name' column still among the values forces every resulting column to
# object dtype; with it in the index the intensities keep their numeric dtype
# and the names become the column labels directly, so no header row has to be
# promoted by hand.
proteins = proteins.set_index('Name').T

# The transposed index holds the original column labels (the sample
# identifiers); expose them as a regular 'Name' column, as before.
proteins = proteins.reset_index().rename(columns={'index': 'Name'})
proteins

Name,Name.1,RAB8A,NAGK,ERBB3,NME2,FNIP1,PRKRIR_frag,CAMK4,STAC3,RAB14,...,human IgG2,human IgA,human IgA1,human IgG3,Cy3+Cy5,human IgA2,human IgD,human IgE,human IgG4,human IgG
0,INCOV002,908.9,2896.023077,1410.869231,3288.161538,573.938462,603.407692,2572.338462,611.107692,5744.761538,...,1825.761538,1833.792308,2399.6,1620.207692,3769.176923,4085.476923,2194.223077,8890.007692,10617.63846,8131.246154
1,INCOV005,1033.176923,2824.315385,1616.269231,2347.707692,706.715385,885.865385,1216.538462,518.888461,1823.669231,...,3554.223077,3066.046154,4191.307692,3004.653846,4867.561538,5550.330769,3808.576923,9769.292308,12375.93846,8183.530769
2,INCOV007,839.776923,3252.215385,1647.607692,4085.476923,565.961538,611.338461,1178.361538,435.707692,4841.846154,...,1319.684615,1241.876923,1507.442308,1059.446154,2584.284615,2858.280769,1693.576923,3691.253846,5325.938462,3519.138462
3,INCOV009,798.334615,4411.215385,2731.246154,11921.66154,685.861539,585.084615,794.461538,344.115385,2478.992308,...,413.957692,363.523077,495.946154,369.976923,664.953846,903.884615,345.903846,1446.084615,1864.684615,1073.546154
4,INCOV010,898.815385,2412.823077,1387.069231,3704.019231,667.65,759.407692,2717.746154,542.661539,2561.057692,...,1589.2,1489.830769,2046.476923,1481.823077,2758.861538,3067.776923,1892.584615,4423.476923,5897.346154,3756.684615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,INCOV188,506.919231,1875.853846,784.926923,2552.953846,361.765385,375.280769,1245.719231,289.646154,2014.284615,...,4411.215385,3939.823077,5051.769231,4442.253846,6400.592308,6777.911538,4873.761538,10078.88462,9309.992308,6832.138462
61,INCOV194,413.384615,1910.830769,709.142308,3631.653846,376.684615,383.196154,2348.338462,337.930769,1457.338462,...,3638.992308,3586.223077,5483.807692,4070.769231,7868.269231,7572.623077,4969.515385,13050.54615,12483.8,8406.284615
62,INCOV198,714.188462,2072.330769,1645.692308,4029.907692,835.657692,684.284615,540.765385,383.519231,1733.976923,...,2954.203846,3179.784615,4767.846154,2636.115385,5098.780769,9037.730769,4797.923077,14273.93077,13681.96923,11246.32308
63,INCOV201,872.038462,2695.211538,1161.592308,3914.007692,1071.061538,784.1,5289.753846,805.092308,1937.438462,...,3118.719231,3537.269231,4262.869231,3868.776923,5599.296154,5843.634615,4414.703846,7720.684615,7530.130769,5859.976923


In [57]:
# Build the per-sample covariate/label table: SampleID, Gender, age, LongCovid.
symptoms = pd.read_excel(os.path.join(data_folder, "FullsymptomWithControl-082722.xlsx"))

# Keep long-COVID cases, and keep controls only when they report zero symptoms.
is_case = symptoms["LongCovid"] == 1
is_symptom_free_control = (symptoms["LongCovid"] == 0) & (symptoms["No. Of Symptoms"] == 0)
symptoms = symptoms[is_case | is_symptom_free_control]
names = symptoms["SampleID"]

# Select the needed columns by name instead of by position (iloc[:, 1:5]), so
# a change in the sheet's column order cannot silently pick other columns.
# Cast to float so the saved values keep the same 0.0/1.0 form as before.
symptoms = symptoms[["Gender", "DOB", "LongCovid"]].astype(float).reset_index(drop=True)

# Mean-impute missing DOB values only (hopefully missing at random).
# Previously the imputer was fitted on every selected column, so a missing
# binary Gender code would have been replaced by a fractional mean; missing
# Gender values are now left as NaN so they stay visible.
# NOTE(review): the mean is computed over all samples, i.e. before the
# train/test split performed further down; confirm this is acceptable.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
symptoms["DOB"] = np.asarray(imp.fit_transform(symptoms[["DOB"]])).ravel()

# Age relative to a fixed reference year. DOB is treated as a numeric year
# here, as in the original arithmetic -- TODO confirm against the sheet.
REFERENCE_YEAR = 2022
symptoms["age"] = np.floor(REFERENCE_YEAR - symptoms["DOB"])

# Explicit output order with the target last; later cells rely on the target
# being the final column.
cols = ["Gender", "age", "LongCovid"]
symptoms = pd.concat([names.reset_index(drop=True), symptoms[cols]], axis=1)
symptoms

Unnamed: 0,SampleID,Gender,age,LongCovid
0,INCOV002,1.0,41.0,1.0
1,INCOV005,1.0,78.0,1.0
2,INCOV007,1.0,59.0,1.0
3,INCOV009,0.0,68.0,1.0
4,INCOV010,0.0,42.0,1.0
...,...,...,...,...
57,INCOV188,1.0,55.0,1.0
58,INCOV194,1.0,55.0,0.0
59,INCOV198,0.0,55.0,1.0
60,INCOV201,0.0,55.0,1.0


In [63]:
# Attach the symptom covariates/label to the protein features.
# A right join keeps every row of `symptoms`; the two identifier columns used
# as join keys are removed afterwards.
joined = pd.merge(
    proteins,
    symptoms,
    how="right",
    left_on="Name",
    right_on="SampleID",
)
joined = joined.drop(columns=["Name", "SampleID"])
joined

Unnamed: 0,RAB8A,NAGK,ERBB3,NME2,FNIP1,PRKRIR_frag,CAMK4,STAC3,RAB14,MATK,...,human IgG3,Cy3+Cy5,human IgA2,human IgD,human IgE,human IgG4,human IgG,Gender,age,LongCovid
0,908.9,2896.023077,1410.869231,3288.161538,573.938462,603.407692,2572.338462,611.107692,5744.761538,854.615385,...,1620.207692,3769.176923,4085.476923,2194.223077,8890.007692,10617.63846,8131.246154,1.0,41.0,1.0
1,1033.176923,2824.315385,1616.269231,2347.707692,706.715385,885.865385,1216.538462,518.888461,1823.669231,926.6,...,3004.653846,4867.561538,5550.330769,3808.576923,9769.292308,12375.93846,8183.530769,1.0,78.0,1.0
2,839.776923,3252.215385,1647.607692,4085.476923,565.961538,611.338461,1178.361538,435.707692,4841.846154,876.6,...,1059.446154,2584.284615,2858.280769,1693.576923,3691.253846,5325.938462,3519.138462,1.0,59.0,1.0
3,798.334615,4411.215385,2731.246154,11921.66154,685.861539,585.084615,794.461538,344.115385,2478.992308,1666.6,...,369.976923,664.953846,903.884615,345.903846,1446.084615,1864.684615,1073.546154,0.0,68.0,1.0
4,898.815385,2412.823077,1387.069231,3704.019231,667.65,759.407692,2717.746154,542.661539,2561.057692,1006.153846,...,1481.823077,2758.861538,3067.776923,1892.584615,4423.476923,5897.346154,3756.684615,0.0,42.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,506.919231,1875.853846,784.926923,2552.953846,361.765385,375.280769,1245.719231,289.646154,2014.284615,657.411539,...,4442.253846,6400.592308,6777.911538,4873.761538,10078.88462,9309.992308,6832.138462,1.0,55.0,1.0
58,413.384615,1910.830769,709.142308,3631.653846,376.684615,383.196154,2348.338462,337.930769,1457.338462,805.353846,...,4070.769231,7868.269231,7572.623077,4969.515385,13050.54615,12483.8,8406.284615,1.0,55.0,0.0
59,714.188462,2072.330769,1645.692308,4029.907692,835.657692,684.284615,540.765385,383.519231,1733.976923,1448.926923,...,2636.115385,5098.780769,9037.730769,4797.923077,14273.93077,13681.96923,11246.32308,0.0,55.0,1.0
60,872.038462,2695.211538,1161.592308,3914.007692,1071.061538,784.1,5289.753846,805.092308,1937.438462,1118.592308,...,3868.776923,5599.296154,5843.634615,4414.703846,7720.684615,7530.130769,5859.976923,0.0,55.0,1.0


In [64]:
# Persist the merged table as CSV, without the row index.
joined_csv_path = os.path.join(data_folder, "proteins_longcovid_target_joined.csv")
joined.to_csv(joined_csv_path, index=False)

In [70]:
# split into train and test set 80/20
# A fixed random_state makes the shuffle, and therefore the CSV files written
# from this split, reproducible across kernel restarts.
# NOTE(review): consider stratify=joined.iloc[:, -1] if both classes have
# enough samples, so the label ratio is preserved in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    joined.iloc[:, :-1],  # every column except the last one
    joined.iloc[:, -1],   # last column of `joined` (the target)
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [76]:
# Re-attach each target column to its feature frame (both re-indexed from 0),
# then write the two splits to disk without the row index.
train, test = (
    pd.concat([features.reset_index(drop=True), target.reset_index(drop=True)], axis=1)
    for features, target in ((X_train, y_train), (X_test, y_test))
)
train_csv_path = os.path.join(data_folder, "proteins_longcovid_target_train.csv")
test_csv_path = os.path.join(data_folder, "proteins_longcovid_target_test.csv")
train.to_csv(train_csv_path, index=False)
test.to_csv(test_csv_path, index=False)

In [7]:
# Reload the training split from the CSV written earlier and display it.
train_csv_path = os.path.join(data_folder, "proteins_longcovid_target_train.csv")
train = pd.read_csv(train_csv_path)
train

Unnamed: 0,RAB8A,NAGK,ERBB3,NME2,FNIP1,PRKRIR_frag,CAMK4,STAC3,RAB14,MATK,...,human IgG3,Cy3+Cy5,human IgA2,human IgD,human IgE,human IgG4,human IgG,Gender,age,LongCovid
0,572.484615,1375.765385,872.038462,1587.807692,448.792308,502.138462,1462.915385,340.346154,1743.75,717.5,...,1405.292308,2166.726923,2274.569231,1546.8,6261.884615,8353.261538,4001.107692,0.0,29.0,1.0
1,977.292308,2514.242308,1273.269231,2329.507692,1348.411538,750.453846,1442.061538,537.030769,2142.546154,1036.511538,...,3084.723077,3932.076923,4208.0,2821.923077,8719.753846,12483.8,9430.546154,1.0,63.0,1.0
2,924.996154,2431.492308,2191.438462,4511.442308,1260.323077,1048.323077,476.3,638.8,2313.307692,2427.046154,...,1772.034615,4832.9,5464.846154,3145.176923,9309.992308,10398.64615,7358.323077,0.0,64.0,1.0
3,587.234615,1421.884615,713.823077,1865.042308,409.180769,496.3,1151.657692,389.492308,1519.438462,746.484615,...,1703.073077,3296.996154,3955.823077,2069.811538,7301.807692,8458.661538,5873.892308,0.0,44.0,1.0
4,839.776923,3252.215385,1647.607692,4085.476923,565.961538,611.338461,1178.361538,435.707692,4841.846154,876.6,...,1059.446154,2584.284615,2858.280769,1693.576923,3691.253846,5325.938462,3519.138462,1.0,59.0,1.0
5,1087.646154,2914.292308,1706.307692,3673.130769,635.561538,769.776923,2233.630769,564.338461,4175.715385,1128.592308,...,2339.138462,4262.869231,5179.961538,2828.184615,9162.9,12325.63846,10742.6,1.0,39.0,1.0
6,1033.176923,2824.315385,1616.269231,2347.707692,706.715385,885.865385,1216.538462,518.888461,1823.669231,926.6,...,3004.653846,4867.561538,5550.330769,3808.576923,9769.292308,12375.93846,8183.530769,1.0,78.0,1.0
7,443.484615,1210.746154,772.373077,1494.126923,425.230769,332.538461,821.646154,316.130769,848.607692,1007.923077,...,5511.188462,6527.076923,6783.730769,5409.173077,8678.661538,11316.49231,6388.892308,0.0,66.0,1.0
8,1199.192308,3391.853846,1743.242308,4425.769231,799.692308,909.626923,1779.753846,636.853846,2254.446154,1265.415385,...,3903.342308,5948.876923,6594.146154,4939.269231,9357.0,12375.93846,10137.66154,1.0,40.0,1.0
9,931.573077,3097.665385,1626.561538,3699.661538,724.923077,724.923077,1395.846154,572.215385,2181.623077,1250.896154,...,2368.176923,3514.253846,3326.007692,2054.165385,5110.292308,8420.230769,6947.088462,0.0,82.0,1.0


In [5]:
# remake as train/val: split the reloaded training set 80/20 again
# A fixed random_state makes the shuffle, and therefore the metatrain/metaval
# files written from this split, reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    train.iloc[:, :-1],  # every column except the last one
    train.iloc[:, -1],   # last column of `train` (the target)
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [6]:
# Re-attach each target column to its feature frame (both re-indexed from 0),
# then write the two subsets to disk without the row index.
# Note: `train` is rebound here to the smaller meta-train subset.
train, val = (
    pd.concat([features.reset_index(drop=True), target.reset_index(drop=True)], axis=1)
    for features, target in ((X_train, y_train), (X_val, y_val))
)
metatrain_csv_path = os.path.join(data_folder, "proteins_longcovid_target_metatrain.csv")
metaval_csv_path = os.path.join(data_folder, "proteins_longcovid_target_metaval.csv")
train.to_csv(metatrain_csv_path, index=False)
val.to_csv(metaval_csv_path, index=False)