In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import LeaveOneOut
from os import listdir

In [2]:
#SET PATH
#Input takes 'locally' or 'remotely' and returns an absolute path based on the input
#The answer is stripped and lower-cased so e.g. "Locally " is still recognised
location=input('Are you working locally or remotely today?').strip().lower()

if location=='locally':
    path="/Users/cameronkelsey/Documents/smack_lab/cayo_data/"
elif location=='remotely':
    path="/scratch/ckelsey4/Cayo_meth/"
else:
    #Fail here rather than leaving `path` undefined (or stale from an earlier run)
    raise ValueError("Expected 'locally' or 'remotely', got " + repr(location))

In [7]:
#Display the metadata table as the cell output
#NOTE(review): in this file the cell sits above the cell that assigns `metadata`,
#so it would fail on a fresh top-to-bottom run - confirm cell order
metadata

Unnamed: 0,monkey_id,lid,lid_pid,reads,unique,non_unique,meth_cpg,unmeth_cpg,reads_msp1,perc_chg_meth,perc_chh_meth,chrX_ratio,prep_date,parentName,grantparent_tissueType,individual_sex,date_birth,group,processing_timestamp,age_at_sampling
0,94H,LID_101336,LID_101336_PID_10042,23127829,11982435,10518332,13987051,17381896,21389482,0.8%,0.5%,0.463179,2018-09-06,DID_1001155,whole_blood,M,1998-01-16,,2012-01-18,14.01
1,O52,LID_101337,LID_101337_PID_10042,17867566,9500464,7931050,11972964,11128247,16523233,0.8%,0.6%,0.797373,2018-08-30,DID_1001156,whole_blood,F,1988-01-16,,2014-01-15,26.00
2,00O,LID_101338,LID_101338_PID_10042,15876928,10256418,5174014,6982073,7540174,8264281,0.7%,0.7%,0.879725,2018-08-30,DID_1001158,whole_blood,F,1999-12-01,,2014-01-28,14.16
3,9E6,LID_101342,LID_101342_PID_10042,17232480,9189594,7529510,10359856,16591143,16454191,0.7%,0.5%,0.434197,2018-08-30,DID_1001166,whole_blood,M,2007-10-15,,2012-12-18,5.18
4,52P,LID_101343,LID_101343_PID_10042,17198789,9506094,7255730,10717038,14644166,16088491,0.8%,0.5%,0.775294,2018-08-30,DID_1001167,whole_blood,F,2000-12-17,,2012-03-14,11.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3425,2A7,LID_111782,LID_111782_PID_10496,22194753,12830840,8830123,12557888,26216856,20933910,0.6%,0.4%,0.738029,2022-07-12,DID_1006566,whole_blood,F,2004-09-25,,2020-11-16,16.14
3426,59A,LID_111783,LID_111783_PID_10496,29154718,18201850,10277278,14609153,39632177,25781766,0.5%,0.4%,0.757783,2022-07-12,DID_1006578,whole_blood,F,1992-11-29,,2020-11-17,27.97
3427,6F0,LID_111784,LID_111784_PID_10496,18994727,12421684,6091679,10293731,25582672,16894127,0.6%,0.4%,0.454552,2022-07-12,DID_1006590,whole_blood,M,2007-09-28,,2020-12-01,13.18
3428,1G0,LID_111785,LID_111785_PID_10496,18232997,11194358,6620264,10864544,19899763,16837355,0.6%,0.4%,0.786452,2022-07-12,DID_1006602,whole_blood,F,2008-10-18,,2020-12-03,12.13


In [6]:
#Import metadata
#Read the tab-delimited metadata table and keep only the whole-blood rows
metadata=(
    pd.read_csv(path + "metadata_temp_clean_241106.txt", sep="\t")
    .loc[lambda df: df['grantparent_tissueType'].eq("whole_blood")]
)


#NOTE(review): the long_data import below is disabled, but later cells in this
#file still read `long_data` - confirm whether it should be restored
#long_data=pd.read_csv(path + "long_data_adjusted.txt", sep="\t")
#long_data=long_data[long_data['n'] > 1]
#long_data=long_data.sort_values(by=['lid_pid'])

In [5]:
#Define natural sorting function
def natural_key(string_):
    """Return a sort key that orders embedded numbers numerically.

    The string is split on runs of digits; digit runs become ints and the
    text between them stays as str, so "chr2" sorts before "chr10".

    Parameters
    ----------
    string_ : str
        Text to build the key for (e.g. a file name).

    Returns
    -------
    list
        Alternating str / int tokens, always starting and ending with a str
        (possibly empty).
    """
    tokens = re.split(r'(\d+)', string_)
    #With a single capture group, re.split puts the matched digit runs at the
    #odd positions. Using the position instead of str.isdigit() avoids calling
    #int() on characters such as "²" that isdigit() accepts but int() rejects.
    return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

In [6]:
#Import coverage data per chromosome
dnam_path=path + "dnam_clock/"

#NOTE(review): this substring test keeps every file whose name contains "cov",
#which would also pick up regions_cov.txt if it sits in this directory - confirm
cov_files=sorted((f for f in listdir(dnam_path) if "cov" in f), key=natural_key)

#Read the space-delimited files in natural-sorted file-name order.
#Building the list directly leaves no temporary DataFrame behind, and an empty
#match yields an empty list instead of a NameError from deleting a loop
#variable that was never assigned.
cov_full=[pd.read_csv(dnam_path + f, sep=" ") for f in cov_files]

In [10]:
#Import count data per chromosome
#NOTE(review): this substring test keeps every file whose name contains the
#letter "m" anywhere (e.g. regions_m.txt) - confirm the intended file pattern
m_files=sorted((f for f in listdir(dnam_path) if "m" in f), key=natural_key)

#Read the space-delimited files in natural-sorted file-name order.
#Building the list directly leaves no temporary DataFrame behind, and an empty
#match yields an empty list instead of a NameError from deleting a loop
#variable that was never assigned.
m_full=[pd.read_csv(dnam_path + f, sep=" ") for f in m_files]

In [9]:
#Import M and Cov for regions
#NOTE(review): `long_data` is only assigned in commented-out code in the part of
#the file visible here, so this cell depends on state from an earlier session -
#confirm whether `metadata` was meant instead

#Coverage matrix: keep only the columns whose name appears in long_data['lid_pid']
regions_cov=(
    pd.read_csv(dnam_path + "regions_cov.txt", sep=" ")
    .loc[:, lambda df: df.columns.intersection(long_data['lid_pid'])]
)

#Count matrix: keep only the columns that survived in the coverage matrix
regions_m=(
    pd.read_csv(dnam_path + "regions_m.txt", sep=" ")
    .loc[:, lambda df: df.columns.intersection(regions_cov.columns)]
)

#Keep only the long_data rows whose lid_pid is a column of the coverage matrix
long_data=long_data.loc[long_data['lid_pid'].isin(regions_cov.columns)]

In [10]:
#Check that col names for M, Cov and metadata lid_pid are all equal and generate %methylation matrix
#Index.equals / np.array_equal return False on a length or order mismatch,
#whereas element-wise == raises when the lengths differ
cols_match=regions_m.columns.equals(regions_cov.columns)
lids_match=np.array_equal(regions_m.columns.to_numpy(), long_data['lid_pid'].to_numpy())

if not (cols_match and lids_match):
    #Stop here so later cells cannot run without p_meth or with a stale copy
    raise ValueError(
        "Sample IDs are not aligned: M vs Cov columns match = " + str(cols_match)
        + ", columns vs metadata lid_pid match = " + str(lids_match)
    )

p_meth=regions_m/regions_cov
#NaN results of the division (e.g. 0/0) are set to 0
p_meth=p_meth.fillna(0)
print("M and Cov columns match. Col LIDs and metadata LIDs match. %methylation matrix generated. NAs replaced with 0's. Onwards!")


M and Cov columns match. Col LIDs and metadata LIDs match. %methylation matrix generated. NAs replaced with 0's. Onwards!


In [12]:
#Remove X-chromosome from p_meth
#Chromosome label per region = text before the first '_' in the row index
#(labels are kept outside p_meth, so no helper column has to be added and
#dropped, and re-running the cell gives the same result)
chrs=p_meth.index.str.split('_', n=1).str[0]

#Filter out X-chrom
p_meth=p_meth[chrs != 'X']

#Print list of remaining chrs to be sure
#(recomputed from the filtered index; the original read p_meth['chr'] after
#that column had already been dropped, which raises a KeyError)
print(p_meth.index.str.split('_', n=1).str[0].unique().to_numpy())

['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13' '14' '15' '16'
 '17' '18' '19' '20']


114067