In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Pool all polygenic risk scores (PRS)

In [12]:
# Pool the per-outcome polygenic risk scores (PRS) into a single table keyed by
# `eid`, drop withdrawn participants, standardise every PRS column and save it.
outcomes = ['cad', 'stroke', 'hf', 'af', 'va', 'pad', 'aaa', 'vte']

# Keep the locations in one place so the placeholder root is edited only once.
project_dir = '/your path/multiomics-cardiovascular-disease'
withdrawal_path = '/home/ukb/data/withdrawals/withdraw79146_204_20240527.txt'

merged_scores = None

for outcome in outcomes:
    score = pd.read_csv(f'{project_dir}/data/processed/prs/{outcome}/results/{outcome}.sscore', sep='\t')
    # Keep only rows with a non-negative IID.
    # NOTE(review): negative IIDs presumably mark redacted samples - confirm.
    score = score[score['IID'] >= 0]
    # Reduce to the participant id and the score, renamed to `eid` / `prs_<outcome>`.
    score = score[['IID', 'SCORE1_AVG']].rename(columns={'IID': 'eid', 'SCORE1_AVG': f'prs_{outcome}'})

    if merged_scores is None:
        merged_scores = score
    else:
        # Outer join: a participant missing from one outcome file is kept (as NaN).
        merged_scores = pd.merge(merged_scores, score, on='eid', how='outer')

# Remove withdrawn participants BEFORE standardising, so their scores do not
# contribute to the mean/SD that every remaining participant is scaled by.
# The withdrawal file is read without a header; its first column holds the ids.
withdrawal_list = pd.read_csv(withdrawal_path, sep="\t", header=None)
withdrawal_list = withdrawal_list[0].tolist()
# `.copy()` gives an independent frame, so the column assignment below does not
# write into a slice of the unfiltered table.
merged_scores = merged_scores[~merged_scores['eid'].isin(withdrawal_list)].copy()

# Select the PRS columns by name (everything except `eid`) rather than by position.
prs_columns = merged_scores.columns.drop('eid')
scaler = StandardScaler()
merged_scores[prs_columns] = scaler.fit_transform(merged_scores[prs_columns])

merged_scores.to_csv(f'{project_dir}/data/processed/omics/PolygenicScores.csv', index=False)
merged_scores

Unnamed: 0,eid,prs_cad,prs_stroke,prs_hf,prs_af,prs_va,prs_pad,prs_aaa,prs_vte
0,1000010,-1.328725,-1.642050,-0.032679,0.543829,-0.031773,-0.748269,0.810402,-0.386231
1,1000028,-1.344649,0.007199,1.912503,-0.746832,-0.331382,0.059740,-0.134254,-0.562637
2,1000034,-0.138080,-0.783101,-1.208742,0.064065,-0.872737,0.427492,0.506582,0.359018
3,1000045,0.552114,-0.825653,-0.550937,-2.540375,-1.445154,0.203565,-0.016456,-1.615486
4,1000052,-0.820312,0.324878,0.199763,0.747311,0.440715,0.049045,0.750470,-0.384690
...,...,...,...,...,...,...,...,...,...
487029,6023624,-0.749422,-0.363330,2.345575,-0.128304,2.291085,0.050567,1.684602,-1.202535
487030,6023636,-0.019994,0.205271,0.453917,1.860164,0.329781,0.830051,0.843548,-0.740849
487031,6023648,1.162209,-0.029221,1.783827,1.290604,-0.433441,2.172354,1.776885,-1.455721
487032,6023655,-1.571803,2.059729,-0.670179,-0.601174,-1.245474,-0.738160,-0.813466,0.290824


In [13]:
# Summarise the pooled table: number of rows, per-column non-null counts and dtypes.
merged_scores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 486965 entries, 0 to 487033
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   eid         486965 non-null  int64  
 1   prs_cad     486965 non-null  float64
 2   prs_stroke  486965 non-null  float64
 3   prs_hf      486965 non-null  float64
 4   prs_af      486965 non-null  float64
 5   prs_va      486965 non-null  float64
 6   prs_pad     486965 non-null  float64
 7   prs_aaa     486965 non-null  float64
 8   prs_vte     486965 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 37.2 MB


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the PRS columns of `merged_scores`.
# The column order below also fixes the row/column order of the result.
columns_of_interest = [
    'prs_cad',
    'prs_stroke',
    'prs_hf',
    'prs_af',
    'prs_va',
    'prs_aaa',
    'prs_pad',
    'prs_vte',
]
data = merged_scores.loc[:, columns_of_interest]

# Transpose so that each PRS becomes one row: cosine_similarity compares rows.
cosine_sim_matrix = cosine_similarity(data.transpose())

# Re-attach the PRS names to both axes of the square similarity matrix.
cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix,
    index=columns_of_interest,
    columns=columns_of_interest,
)
cosine_sim_df

Unnamed: 0,prs_cad,prs_stroke,prs_hf,prs_af,prs_va,prs_aaa,prs_pad,prs_vte
prs_cad,1.0,0.114288,0.223852,0.007327,0.004441,0.091163,0.228969,0.027707
prs_stroke,0.114288,1.0,0.151914,0.137489,0.003119,-0.01886,0.216609,0.100746
prs_hf,0.223852,0.151914,1.0,0.168356,-0.030096,0.00664,0.204029,0.077004
prs_af,0.007327,0.137489,0.168356,1.0,-0.019154,-0.000115,0.001009,0.00105
prs_va,0.004441,0.003119,-0.030096,-0.019154,1.0,0.003167,-0.001176,0.006308
prs_aaa,0.091163,-0.01886,0.00664,-0.000115,0.003167,1.0,0.062304,0.007524
prs_pad,0.228969,0.216609,0.204029,0.001009,-0.001176,0.062304,1.0,0.1761
prs_vte,0.027707,0.100746,0.077004,0.00105,0.006308,0.007524,0.1761,1.0
