In [2]:
import json
import os
import uuid
from pathlib import Path

import pandas as pd

# Somascan Data Privacy Cleaning

## Description

Short notebook that retrieves the raw SomaScan data, removes sensitive information, and generates a final working dataset of measured protein levels plus metadata: unique subject identifier, visit, sex, race, and age.

### Setting Folders

In [11]:
# Directory holding the raw SomaScan export. Set the SOMASCAN_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset, the original location is used.
DATAPATH = Path(os.environ.get("SOMASCAN_DATA_DIR", "/home/grionan1/project/tmds3_mentor/data/raw"))
# File name of the raw SomaScan table; it is joined onto DATAPATH when read.
somascan_data = Path("SOMA_QCed_Ready_4_Use_postComBat.csv")

### Read Somascan Dataset

In [12]:
# Load the raw SomaScan table; low_memory=False makes pandas parse the file
# in a single pass instead of in chunks.
somascan_df = pd.read_csv(DATAPATH / somascan_data, low_memory=False)

### Replace Subject ID with Random Code

In [13]:
# create random codes: one 8-hex-character pseudonym per distinct subject id
usub_mapper = {}
used_codes = set()
for usubjid in somascan_df.USUBJID.unique().tolist():
    code = uuid.uuid4().hex[:8]
    # A UUID truncated to 8 hex characters has only 32 bits, so two subjects
    # could draw the same code and be silently merged; redraw until unique.
    while code in used_codes:
        code = uuid.uuid4().hex[:8]
    used_codes.add(code)
    usub_mapper[usubjid] = code
# replace the original subject id
# NOTE(review): this overwrites USUBJID in place, so re-running this cell
# without reloading somascan_df would map the pseudonyms again and the saved
# mapper would no longer point back to the original ids.
somascan_df['USUBJID'] = somascan_df.USUBJID.replace(usub_mapper)

### Save Mapper

Save the `usub_mapper` as a JSON file.

In [14]:
# Persist the original-id -> random-code mapping next to the raw data.
mapper_path = DATAPATH / "usub_mapper.json"
with mapper_path.open('w') as outfile:
    json.dump(usub_mapper, outfile)

### Create Final Dataset

Raw aptamer measurements and the relevant metadata are combined into a new data frame.

In [15]:
# select aptamer-only columns: the aptamer column tag is 'anti_'
# The pattern includes the underscore so that unrelated columns that merely
# contain the letters "anti" (e.g. "quantity") are not pulled into the
# de-identified output.
aptamers = somascan_df.filter(regex='anti_', axis=1).copy()
if aptamers.shape[1] == 0:
    # Fail loudly rather than writing a working dataset with no measurements.
    raise ValueError("no aptamer columns containing 'anti_' found in somascan_df")
# subset metadata and add aptamers
metadata_columns = ['USUBJID', 'VISIT', 'SEX', 'RACEOR', 'AGEDRV']
final_dataset = somascan_df[metadata_columns + aptamers.columns.tolist()].copy()

### Save Final Dataframe

In [16]:
# Write the de-identified working dataset as CSV, without the row index.
working_dataset_path = DATAPATH / "working_dataset.csv"
final_dataset.to_csv(working_dataset_path, index=False)