# Sample QC for GWAS analysis


## Jupyterlab app details (launch configuration)

Recommended configuration
- runtime: < 10 min
- cluster configuration: `Spark cluster`
- number of nodes: 2
- recommended instance: `mem1_ssd1_v2_x16`
- cost: < £0.09

1. Import libraries and initialize Spark connection.

In [1]:
import os
import pyspark.pandas as ks
import dxpy
import dxdata
import pandas as pd
import pyspark
import re

# Set the environment variable
# It must be in place before the Spark context is started (see the PySpark
# pandas-on-Spark notes on PyArrow timezone handling).
os.environ['PYARROW_IGNORE_TIMEZONE'] = '1'

# Initialize Spark
# getOrCreate() reuses a context that is already running, so re-executing this
# cell does not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)



2. Specify the whole exome sequencing (WES) data directory, the exome field ID and the output directory. The WES directory and field ID depend on the WES release (e.g. 200K, 300K or 450K).

In [2]:
# WES release settings: release folder name, UKB field ID, and output directory.
exome_folder, exome_field_id, output_dir = (
    'Population level exome OQFE variants, PLINK format - final release',
    '23158',
    '/',
)

In [3]:
# Imputed-genotype release settings: folder name and UKB field ID.
imputation_folder, imputation_field_id = (
    'Imputation from genotype (GEL)',
    '21008',
)

3. Load the dataset description and select the entity containing phenotypic data.

In [17]:
# Look up both Cohort Browser records by exact name, then load the cohorts.
def _find_cohort_record(cohort_name):
    """Return the CohortBrowser data object called `cohort_name` in /Cohorts."""
    return dxpy.find_one_data_object(
        typename="CohortBrowser",
        name=cohort_name,
        folder="/Cohorts",
        name_mode="exact"
    )


control_cohort_obj = _find_cohort_record("CPSP_NEW_CONTROLS")
control_cohort_id = control_cohort_obj["id"]

case_cohort_obj = _find_cohort_record("CPSP_NEW_CASES")
case_cohort_id = case_cohort_obj["id"]

# Controls first, then cases.
control_cohort = dxdata.load_cohort(id=control_cohort_id)
case_cohort = dxdata.load_cohort(id=case_cohort_id)



In [18]:
# Find the dataset record matching the glob "app*.dataset" in the project root
# and load it.
dataset_query = {
    "typename": "Dataset",
    "name": "app*.dataset",
    "folder": "/",
    "name_mode": "glob",
}
dispensed_dataset = dxpy.find_one_data_object(**dataset_query)
dispensed_dataset_id = dispensed_dataset["id"]
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

In [6]:
# Entity of the dataset that is used below to look up and retrieve fields.
participant = dataset['participant']

4. Load cohorts that were created in cohort browser.

In [7]:
# Legacy alternative kept for reference only: load the cohorts by hard-coded
# record ID instead of discovering them by name.
# case = dxdata.load_cohort(id="record-J09vyy0JBBbK3k6YFB8BJ1qP")
# cont = dxdata.load_cohort(id="record-J09vykQJBBbJ1b7KZX8qvf5Y")

In [20]:
# Short aliases for the two loaded cohorts, used by the retrieval step.
case, cont = case_cohort, control_cohort

5. Specify the field IDs to retrieve, get the corresponding UKB RAP field names and print a description table.

In [9]:
# UKB field IDs to retrieve (titles as shown in the description table below).
field_ids = [
    '31',     # Sex
    '22001',  # Genetic sex
    '22006',  # Genetic ethnic grouping
    '22019',  # Sex chromosome aneuploidy
    '34',     # Year of birth
    '21022',  # Age at recruitment
    '29100',  # Ever had known person concerned about, or reco...
    '29011',  # Ever had prolonged feelings of sadness or depr...
    '23104',  # Body mass index (BMI)
    '22020',  # Used in genetic principal components
    '2966',   # Age high blood pressure diagnosed
    '22009',  # Genetic principal components
    '41270',  # Diagnoses (ICD10)
]

In [21]:
def fields_for_id(field_id):
    """Return the `participant` field objects that belong to one UKB field ID.

    Field names are matched against ``p<id>`` with optional ``_i<n>`` and
    ``_a<n>`` suffixes.

    * ``22009``: the fields carrying an ``a<n>`` index, ordered by that index,
      limited to the first ten.
    * ``2966``: every matching field.
    * anything else: only the first match (or an empty list when nothing matches).
    """
    field_id = str(field_id)
    name_pattern = r'^p{}(_i\d+)?(_a\d+)?$'.format(field_id)
    matches = list(participant.find_fields(name_regex=name_pattern))

    if field_id == '22009':
        def array_index(field):
            # Numeric part of the array suffix, so a10 sorts after a2.
            return int(re.search(r'a(\d+)', field.name).group(1))

        indexed = [field for field in matches if re.search(r'a(\d+)', field.name)]
        indexed.sort(key=array_index)
        return indexed[:10]

    if field_id == '2966' or len(matches) <= 1:
        return matches
    return [matches[0]]



In [22]:
# Resolve every requested field ID to field objects. A field ID that matches
# nothing is reported by name instead of surfacing later as an IndexError or a
# missing column.
fields = []
for f in field_ids:
    fs = fields_for_id(f)
    if not fs:
        raise ValueError(f"No participant field found for UKB field ID {f!r}")
    if f == '22009':
        fields.extend(fs)  # keep all 10 PCs
    else:
        fields.append(fs[0])  # just the first field (also for 2966)
# Two fields are added by explicit name: p20160_i0 and the participant ID.
fields += [participant.find_field(name='p20160_i0'), participant.find_field(name='eid')]

# One row per retrieved field: its name, title, and value coding (if any).
field_description = pd.DataFrame({
    'Field': [f.name for f in fields],
    'Title': [f.title for f in fields],
    'Coding': [f.coding.codes if f.coding is not None else '' for f in fields ]
})

field_description

Unnamed: 0,Field,Title,Coding
0,p31,Sex,"{'0': 'Female', '1': 'Male'}"
1,p22001,Genetic sex,"{'0': 'Female', '1': 'Male'}"
2,p22006,Genetic ethnic grouping,{'1': 'Caucasian'}
3,p22019,Sex chromosome aneuploidy,{'1': 'Yes'}
4,p34,Year of birth,
5,p21022,Age at recruitment,
6,p29100,"Ever had known person concerned about, or reco...","{'-3': 'Prefer not to answer', '0': 'No', '1':..."
7,p29011,Ever had prolonged feelings of sadness or depr...,"{'1': 'Yes', '0': 'No', '-3': 'Prefer not to a..."
8,p23104_i0,Body mass index (BMI) | Instance 0,
9,p22020,Used in genetic principal components,{'1': 'Yes'}


6. Retrieve data for both cohorts.

In [None]:
# Use one engine with the same Spark settings for both cohorts, so cases and
# controls are retrieved under identical configuration.
engine = dxdata.connect(
    dialect="hive+pyspark",
    connect_args={
        'config': {
            'spark.kryoserializer.buffer.max': '256m',
            'spark.sql.autoBroadcastJoinThreshold': '-1'
        }
    }
)

# For case data
case_df = participant.retrieve_fields(fields=fields, filter_sql=case.sql, engine=engine).to_pandas_on_spark()

# For control data
cont_df = participant.retrieve_fields(fields=fields, filter_sql=cont.sql, engine=engine).to_pandas_on_spark()


  self._context = ssl.SSLContext(ssl_version)


7. Create phenotype variable and concatenate cohorts into one dataframe.

In [13]:
# Phenotype label: 1 for the case cohort, 0 for the control cohort.
for cohort_frame, label in ((case_df, 1), (cont_df, 0)):
    cohort_frame['chronic_pain_cc'] = label


In [14]:
# Stack cases and controls into one frame; `chronic_pain_cc` tells them apart.
df = ks.concat([case_df, cont_df])

In [None]:
# Row and column counts of the combined frame.
df.shape

In [16]:
# Number of cases (1) and controls (0) in the combined frame.
df.chronic_pain_cc.value_counts()

[root] ERROR: KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/cluster/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cluster/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

8. QC samples based on several conditions.

In [56]:
import numpy as np
from scipy.stats import chi2_contingency

# Baseline sizes before any QC filter is applied
original_count = df.shape[0]
original_cases = len(df[df.chronic_pain_cc == 1])
original_controls = len(df[df.chronic_pain_cc == 0])
print(f"Original participants: {original_count} (Cases: {original_cases}, Controls: {original_controls})")

# Function to track filtering impact on cases and controls with chi-square test
def _case_control_counts(frame):
    """Return (total, cases, controls) row counts of `frame` by `chronic_pain_cc`."""
    total = len(frame)
    cases = frame[frame.chronic_pain_cc == 1].shape[0]
    controls = frame[frame.chronic_pain_cc == 0].shape[0]
    return total, cases, controls


def _print_chi_square(removed, kept, label, header, effect):
    """Run a 2x2 chi-square test of removed vs. kept (cases, controls) and print it.

    `removed` and `kept` are (cases, controls) pairs. The test is skipped, with
    a message, when a row or column of the table sums to zero: the expected
    frequencies then contain a zero and `chi2_contingency` raises ValueError.
    """
    contingency = [list(removed), list(kept)]
    table = np.asarray(contingency)
    if (table.sum(axis=0) == 0).any() or (table.sum(axis=1) == 0).any():
        print(f"{header} skipped (a row or column of the contingency table is empty)")
        return

    chi2, p, dof, expected = chi2_contingency(contingency)

    # Rule-of-thumb validity check on the expected cell frequencies.
    min_expected = np.min(expected)
    if min_expected < 5:
        print(f"Warning: {label} may not be valid (min expected frequency: {min_expected:.2f} < 5)")

    print(header)
    print(f"Chi2 value: {chi2:.4f}")
    print(f"p-value: {p:.6f}")
    print(f"{'*' * 3 if p < 0.001 else '*' * 2 if p < 0.01 else '*' if p < 0.05 else 'ns'} {'Significant difference' if p < 0.05 else 'No significant difference'} in {effect}")


def track_filter_impact(filter_name, filter_condition, previous_filter=None):
    """Apply one QC filter on top of the previous ones and report its impact.

    Reads the module-level `df`, `original_cases` and `original_controls`.

    Parameters
    ----------
    filter_name : str
        Label used in the printed report.
    filter_condition : boolean mask over `df`
        Rows to keep for this filter.
    previous_filter : boolean mask over `df`, optional
        Combined mask of all earlier filters; when given, per-step removal
        counts and a per-step chi-square test are printed as well.

    Returns
    -------
    The combined mask (`previous_filter & filter_condition`, or
    `filter_condition` alone when there is no previous filter).
    """
    if previous_filter is not None:
        combined_filter = previous_filter & filter_condition
    else:
        combined_filter = filter_condition

    total_count, cases_count, controls_count = _case_control_counts(df[combined_filter])

    if previous_filter is not None:
        previous_total, previous_cases, previous_controls = _case_control_counts(df[previous_filter])

        cases_removed = previous_cases - cases_count
        controls_removed = previous_controls - controls_count
        total_removed = previous_total - total_count

        cases_percent_removed = cases_removed / previous_cases if previous_cases > 0 else 0
        controls_percent_removed = controls_removed / previous_controls if previous_controls > 0 else 0
        total_percent_removed = total_removed / previous_total if previous_total > 0 else 0

        print(f"\n--- After {filter_name} ---")
        print(f"Total: {total_count} (Removed: {total_removed}, {total_percent_removed:.2%})")
        print(f"Cases: {cases_count} (Removed: {cases_removed}, {cases_percent_removed:.2%})")
        print(f"Controls: {controls_count} (Removed: {controls_removed}, {controls_percent_removed:.2%})")

        # Chi-square test for this step only
        if cases_removed > 0 or controls_removed > 0:
            _print_chi_square(
                removed=(cases_removed, controls_removed),
                kept=(cases_count, controls_count),
                label="Chi-square",
                header="Chi-square test for this step:",
                effect="filtering effect",
            )
    else:
        print(f"\n--- After {filter_name} ---")
        print(f"Total: {total_count}")
        print(f"Cases: {cases_count}")
        print(f"Controls: {controls_count}")

    # Also perform chi-square against the original dataset
    if original_cases > 0 and original_controls > 0:
        _print_chi_square(
            removed=(original_cases - cases_count, original_controls - controls_count),
            kept=(cases_count, controls_count),
            label="Cumulative chi-square",
            header="Cumulative chi-square test (compared to original):",
            effect="cumulative filtering effect",
        )

    return combined_filter

# Run the QC filters one after another; each call receives the mask returned
# by the previous call.
same_reported_and_genetic_sex = df['p31'] == df['p22001']
filter1 = track_filter_impact(
    filter_name="filtering for same sex and genetic sex",
    filter_condition=same_reported_and_genetic_sex,
)

in_caucasian_grouping = df['p22006'] == 1
filter2 = track_filter_impact(
    filter_name="filtering for Caucasian ethnic grouping",
    filter_condition=in_caucasian_grouping,
    previous_filter=filter1,
)

no_aneuploidy_flag = df['p22019'].isnull()
filter3 = track_filter_impact(
    filter_name="filtering for no sex chromosome aneuploidy",
    filter_condition=no_aneuploidy_flag,
    previous_filter=filter2,
)

used_in_pca = df['p22020'] == 1
filter4 = track_filter_impact(
    filter_name="filtering for participants were used to calculate PCA (only non-relatives were included)",
    filter_condition=used_in_pca,
    previous_filter=filter3,
)


# Final filtered dataset
df_qced = df[filter4]

# Overall summary
print("\n=== OVERALL SUMMARY ===")
# Count once and reuse: every len()/shape call on the filtered frame is a
# separate evaluation of the filter.
final_count = len(df_qced)
final_cases = df_qced[df_qced.chronic_pain_cc == 1].shape[0]
final_controls = df_qced[df_qced.chronic_pain_cc == 0].shape[0]

cases_removed_total = original_cases - final_cases
controls_removed_total = original_controls - final_controls
total_removed = original_count - final_count


def _safe_ratio(numerator, denominator, default=float('nan')):
    """Return numerator / denominator, or `default` when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else default


cases_percent_remaining = _safe_ratio(final_cases, original_cases)
controls_percent_remaining = _safe_ratio(final_controls, original_controls)
total_percent_remaining = _safe_ratio(final_count, original_count)

print(f"Original participants: {original_count} (Cases: {original_cases}, Controls: {original_controls})")
print(f"Final participants: {final_count} (Cases: {final_cases}, Controls: {final_controls})")
print(f"Total removed: {total_removed} ({1-total_percent_remaining:.2%})")
print(f"Cases removed: {cases_removed_total} ({1-cases_percent_remaining:.2%})")
print(f"Controls removed: {controls_removed_total} ({1-controls_percent_remaining:.2%})")

# Case-control ratio before and after
original_ratio = _safe_ratio(original_cases, original_controls, float('inf'))
final_ratio = _safe_ratio(final_cases, final_controls, float('inf'))
# Printed as "1:<controls per case>"; computed directly so that zero cases
# gives "inf" instead of a ZeroDivisionError.
print(f"\nCase-to-control ratio before: 1:{_safe_ratio(original_controls, original_cases, float('inf')):.4f}")
print(f"Case-to-control ratio after: 1:{_safe_ratio(final_controls, final_cases, float('inf')):.4f}")

Original participants: 111243 (Cases: 5696, Controls: 105547)

--- After filtering for same sex and genetic sex ---
Total: 108827
Cases: 5552
Controls: 103275
Cumulative chi-square test (compared to original):
Chi2 value: 3.4119
p-value: 0.064729
ns No significant difference in cumulative filtering effect

--- After filtering for Caucasian ethnic grouping ---
Total: 93682 (Removed: 15145, 13.92%)
Cases: 4808 (Removed: 744, 13.40%)
Controls: 88874 (Removed: 14401, 13.94%)
Chi-square test for this step:
Chi2 value: 1.2553
p-value: 0.262538
ns No significant difference in filtering effect
Cumulative chi-square test (compared to original):
Chi2 value: 0.1588
p-value: 0.690307
ns No significant difference in cumulative filtering effect

--- After filtering for no sex chromosome aneuploidy ---
Total: 93608 (Removed: 74, 0.08%)
Cases: 4803 (Removed: 5, 0.10%)
Controls: 88805 (Removed: 69, 0.08%)
Chi-square test for this step:
Chi2 value: 0.1369
p-value: 0.711345
ns No significant difference i

In [14]:
# Inactive earlier version of the QC filter, kept as a bare string (it is not
# executed as code). It filters on p22021, and field ID 22021 is not in
# `field_ids`, so that column is not retrieved by this notebook.
"""
# Apply filters based on the descriptions you provided
df_qced = df[
    (df['p31'] == df['p22001']) &  # Filter for same sex and genetic sex
    (df['p22006'] == 1) &          # Caucasian ethnic grouping
    (df['p22019'].isnull()) &       # No sex chromosome aneuploidy
    (df['p22021'] == 0)             # No kinship found
]

"""

In [61]:
def _pc_alias(column_name):
    """Replace 'p22009_a' with 'pc' in a column name (p22009_a1 -> pc1)."""
    return re.sub('p22009_a','pc',column_name)


# UKB field names -> readable names; 'eid' becomes 'IID'.
readable_names = {
    'eid': 'IID',
    'p31': 'sex',
    'p34': 'year_of_birth',
    'p21022': 'age_at_recruitment',
    'p22001': 'genetic_sex',
    'p20160_i0': 'ever_smoked',
    'p22006': 'ethnic_group',  # Genetic ethnic grouping (Caucasian)
    'p22019': 'sex_chromosome_aneuploidy',
    'p22021': 'kinship_to_other_participants',  # Kinship status
    'p29100': 'known_person_concerned_about_alcohol',
    'p29011': 'ever_had_prolonged_sadness_or_depression',
    'p23104_i0': 'BMI',
    'p2966_i0': 'age_diagnosed_htn',  # Age high blood pressure diagnosed
    'p22020': 'used in genetic principal components',
    'p41270': 'diagnoses',  # ICD10
}

# Principal components first, then the explicit mapping.
df_qced = df_qced.rename(columns=_pc_alias).rename(columns=readable_names)



In [66]:
# Check the columns of df_qced to ensure 'IID' exists
# (and that the p22009_a* columns now appear as pc1..pc10).
print(df_qced.columns)

Index(['sex', 'genetic_sex', 'ethnic_group', 'sex_chromosome_aneuploidy',
       'year_of_birth', 'age_at_recruitment',
       'known_person_concerned_about_alcohol',
       'ever_had_prolonged_sadness_or_depression', 'BMI',
       'used in genetic principal components', 'age_diagnosed_htn', 'pc1',
       'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7', 'pc8', 'pc9', 'pc10',
       'diagnoses', 'ever_smoked', 'IID', 'chronic_pain_cc'],
      dtype='object')


# Filling NaNs

In [67]:
# Assign 'FID' from 'IID'
# (FID is a copy of IID, so each participant gets their own ID as family ID.)
df_qced['FID'] = df_qced['IID']

In [69]:
# Fill missing in binary/categorical with 0 or mode
# Explicit assignment instead of a chained `Series.fillna(..., inplace=True)`
# on a column selection, which does not reliably update the parent frame in
# plain pandas.
# NOTE(review): only missing values are replaced; other codes (the coding of
# p29100/p29011 includes -3 'Prefer not to answer') are left as they are —
# confirm this is intended for the downstream model.
binary_cols = [
    'ever_smoked',
    'known_person_concerned_about_alcohol',
    'ever_had_prolonged_sadness_or_depression',
]
for col in binary_cols:
    df_qced[col] = df_qced[col].fillna(0)



In [70]:
import numpy as np
# Fill continuous variables with mean
# NOTE(review): 'age_diagnosed_htn' is mean-imputed as well, which gives a
# diagnosis age to every participant with a missing value — confirm intended.
continuous_cols = [
    'year_of_birth',
    'age_at_recruitment',
    'age_diagnosed_htn',
    'BMI'
]

for col in continuous_cols:
    if col in df_qced.columns:
        # Explicit assignment instead of a chained in-place fillna on a
        # column selection.
        col_mean = df_qced[col].mean()
        df_qced[col] = df_qced[col].fillna(col_mean)

9. Rename columns and organize them in a format suitable for PLINK and regenie.

In [71]:
# Build the phenotype table: IDs, case/control label, covariates, PCs, diagnoses.
id_and_label_columns = ['FID', 'IID', 'chronic_pain_cc']
covariate_columns = [
    'sex',
    'year_of_birth',
    'age_at_recruitment',
    'age_diagnosed_htn',
    'ever_smoked',
    'known_person_concerned_about_alcohol',
    'ever_had_prolonged_sadness_or_depression',
    'BMI',
]
pc_columns = [f'pc{i}' for i in range(1, 11)]

phenotype_columns = id_and_label_columns + covariate_columns + pc_columns + ['diagnoses']
df_phenotype = df_qced[phenotype_columns]

# Preview the first rows
df_phenotype.head()


Unnamed: 0,FID,IID,chronic_pain_cc,sex,year_of_birth,age_at_recruitment,age_diagnosed_htn,ever_smoked,known_person_concerned_about_alcohol,ever_had_prolonged_sadness_or_depression,BMI,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,diagnoses
0,1002940,1002940,1,1,1940.0,67.0,46.551462,1,0,0,24.9,-13.9614,3.5332,-2.90706,6.88338,10.4112,-3.34036,2.71035,-2.17131,2.97747,-0.286813,"[C61, J459, J61, K20, K219, K221, K227, K269, ..."
1,1003285,1003285,1,1,1942.0,67.0,46.551462,1,0,0,27.7,-12.2651,1.76183,-3.5934,3.73171,-2.90846,-1.28071,-0.342788,-2.1415,3.24838,4.26346,"[C61, D508, H919, I849, K210, K227, K29, K317,..."
2,1007944,1007944,1,0,1948.0,60.0,54.0,1,0,1,26.94923,-10.5345,3.3125,-1.95293,-3.08537,-3.38068,0.040655,0.388952,-2.38331,-4.92998,2.26546,"[G454, G560, G610, G819, G822, H819, H830, I10..."
4,1014662,1014662,1,0,1946.0,61.0,46.551462,0,0,0,31.0,-13.3167,5.80744,-1.2005,6.71599,16.0219,0.389076,2.49887,5.86578,1.46635,0.534134,
5,1015654,1015654,1,1,1944.0,63.0,60.0,1,0,0,36.6,-11.4985,6.28084,-2.47919,3.1945,3.82175,-0.677714,-1.44048,0.8364,3.68877,-4.00073,"[A419, C446, E119, E872, F059, I10, L405, M073..."


In [None]:
# Cases (1) and controls (0) remaining after sample QC.
df_phenotype.chronic_pain_cc.value_counts()



In [34]:
# Convert to a plain pandas DataFrame; the merge/join cells below use pandas APIs on it.
df_phenotype = df_phenotype.to_pandas()



In [None]:
# NOTE(review): `df_no_qc` is not defined in any visible cell of this
# notebook, so this cell fails on a fresh kernel — define it or remove the cell.
df_no_qc = df_no_qc.to_pandas()

10. Select only samples that have WES data available and save them to a CSV file.

In [32]:
# Define the base path and file naming convention
# Note: this replaces the `exome_folder` value set near the top of the notebook
# with the full project path; "23158" in `base_filename` equals `exome_field_id`.
exome_folder = "/Bulk/Exome sequences/Population level exome OQFE variants, PLINK format - final release"
base_filename = "ukb23158_c"

In [33]:
# List of chromosomes to process (1 to 22)
chromosomes = list(range(1, 23))

# Sample IDs found in any chromosome's .fam file. Collecting IDs and filtering
# the phenotype table once replaces merging and concatenating a full copy of
# the phenotype table per chromosome.
wes_iids = set()

# Loop through each chromosome and process the .fam files
for chrom in chromosomes:
    fam_name = f"{base_filename}{chrom}_b0_v1.fam"
    local_filename = f"local_{fam_name}"

    try:
        # Look the .fam file up on the platform; lookup and download failures
        # are reported by the DXError handler below.
        file_obj = dxpy.find_one_data_object(name=fam_name, folder=exome_folder, name_mode='exact')
        print(f"Processing chromosome {chrom}")

        # Download the file locally
        dxpy.download_dxfile(file_obj["id"], local_filename)

        # Load the downloaded .fam file (whitespace-separated, no header row)
        plink_fam_df = pd.read_csv(local_filename, sep=r'\s+', dtype='object',
                                   names=['FID', 'IID', 'Father ID', 'Mother ID', 'sex', 'Pheno'])
        wes_iids.update(plink_fam_df['IID'])

    except dxpy.DXError as e:
        print(f"File for chromosome {chrom} not found or an error occurred: {e}")

    finally:
        # Clean up the local copy even when reading it failed
        if os.path.exists(local_filename):
            os.remove(local_filename)

# Keep phenotype rows whose IID appears in at least one .fam file, one row per
# IID. IDs are compared as strings because the .fam columns are read as text.
combined_df = (
    df_phenotype[df_phenotype['IID'].astype(str).isin(wes_iids)]
    .drop_duplicates(subset=['IID'])
    .reset_index(drop=True)
)




Processing chromosome 1
Processing chromosome 2
Processing chromosome 3
Processing chromosome 4
Processing chromosome 5
Processing chromosome 6
Processing chromosome 7
Processing chromosome 8
Processing chromosome 9
Processing chromosome 10
Processing chromosome 11
Processing chromosome 12
Processing chromosome 13
Processing chromosome 14
Processing chromosome 15
Processing chromosome 16
Processing chromosome 17
Processing chromosome 18
Processing chromosome 19
Processing chromosome 20
Processing chromosome 21
Processing chromosome 22


In [27]:
# Cases (1) and controls (0) that have WES data.
combined_df.chronic_pain_cc.value_counts()

chronic_pain_cc
0    71195
1     3861
Name: count, dtype: int64

In [38]:
# Get imputed data
path_to_impute_file = f'/mnt/project/REGENIE_output/{imputation_folder}/ukb{imputation_field_id}_c1_b0_v1.sample'
# header=0 together with `names` replaces the file's first line with these
# column names.
# NOTE(review): if the .sample file has a second header line with column
# types, it stays in as a data row; it cannot match a participant ID in the
# inner join below — confirm.
sample_file = pd.read_csv(
    path_to_impute_file,
    sep=r'\s+',
    header=0,
    names=['FID', 'IID', 'missing', 'sex'],
)

# Check the data types (labels name the frames actually inspected)
print("df_phenotype['IID'] dtype:", df_phenotype['IID'].dtype)
print("sample_file['IID'] dtype:", sample_file['IID'].dtype)

# Convert IID columns to the same type (string)
df_phenotype['IID'] = df_phenotype['IID'].astype(str)
sample_file['IID'] = sample_file['IID'].astype(str)

# Keep only participants present in the imputed-data sample file.
# NOTE(review): this joins `df_phenotype`, not the WES-filtered `combined_df`,
# so the WES availability filter is not applied to `cpsp_df` — confirm intended.
cpsp_df = df_phenotype.join(
    sample_file.set_index('IID'), on='IID', rsuffix='_sample', how='inner'
)

# Drop the columns that came from the .sample file
cpsp_df = cpsp_df.drop(
    columns=['FID_sample', 'missing', 'sex_sample'],
    errors='ignore',
)

combined_df['IID'] dtype: object
sample_file['IID'] dtype: int64


In [39]:
# Cases (1) and controls (0) present in the imputed-data sample file.
cpsp_df.chronic_pain_cc.value_counts()

chronic_pain_cc
0    74038
1     4010
Name: count, dtype: int64

In [40]:
# Save the combined phenotype data locally
output_filename = "cpsp.phe"

# Save the file in your local working environment
# (tab-separated, missing values written as "NA", no index column)
cpsp_df.to_csv(output_filename, sep='\t', na_rep='NA', index=False)
print(f"Saved combined file locally as {output_filename}")

# Define the destination path on DNAnexus
remote_dir = '/Data/'

Saved combined file locally as cpsp.phe


In [29]:
# Save the combined phenotype data locally
# NOTE(review): `df_no_qc` is not defined in any visible cell of this notebook;
# this cell only works with leftover kernel state.
output_filename_no_QC = "no_sample_QC_chronic_pain_wes.phe"

# Save the file in your local working environment
df_no_qc.to_csv(output_filename_no_QC, sep='\t', na_rep='NA', index=False)
print(f"Saved combined file locally as {output_filename_no_QC}")

# Define the destination path on DNAnexus
remote_dir = '/Data/'

Saved combined file locally as no_sample_QC_chronic_pain_wes.phe


11. Load file to project storage.

In [30]:
# Upload the local file to the DNAnexus platform
# Requires the cell that defines `output_filename_no_QC` and `remote_dir` to
# have run first; the recorded output below shows a NameError from running
# without it.
dxpy.upload_local_file(output_filename_no_QC, folder=remote_dir)

print(f"Uploaded {output_filename_no_QC} to {remote_dir} on DNAnexus.")

NameError: name 'output_filename_no_QC' is not defined

In [42]:
# Upload the local file to the DNAnexus platform
# (`output_filename` and `remote_dir` come from the cell that saved cpsp.phe.)
dxpy.upload_local_file(output_filename, folder=remote_dir)

print(f"Uploaded {output_filename} to {remote_dir} on DNAnexus.")

Uploaded cpsp.phe to /Data/ on DNAnexus.


Here is an example of the phenotype file:

In [41]:
# Path to the saved phenotypic file
phenotypic_file_path = "cpsp.phe"

# Load the phenotypic file into a pandas DataFrame
phenotypic_df = pd.read_csv(phenotypic_file_path, delimiter='\t')

# Display the first few rows of the DataFrame
# NOTE(review): the recorded output below lists columns ('Age_at_recruitment',
# 'Genetic PCA', 'Diagnoses') that differ from the columns written above, so it
# looks stale — re-run to refresh.
phenotypic_df.head()

Unnamed: 0,FID,IID,chronic_pain_cc,sex,year_of_birth,Age_at_recruitment,ever_smoked,known_person_concerned_about_alcohol,ever_had_prolonged_sadness_or_depression,BMI,Genetic PCA,Diagnoses
0,1002940,1002940,1,1,1940.0,67.0,1,0,0,24.9,-13.9614,"['C61', 'J459', 'J61', 'K20', 'K219', 'K221', ..."
1,1003285,1003285,1,1,1942.0,67.0,1,0,0,27.7,-12.2651,"['C61', 'D508', 'H919', 'I849', 'K210', 'K227'..."
2,1007944,1007944,1,0,1948.0,60.0,1,0,1,26.94923,-10.5345,"['G454', 'G560', 'G610', 'G819', 'G822', 'H819..."
3,1014662,1014662,1,0,1946.0,61.0,0,0,0,31.0,-13.3167,
4,1015654,1015654,1,1,1944.0,63.0,1,0,0,36.6,-11.4985,"['A419', 'C446', 'E119', 'E872', 'F059', 'I10'..."
