In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.linear_model import LinearRegression
import os
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import median_absolute_error
from app import get_altum_age, get_hannum_age, get_Han2020_age, get_horvath_age, get_horvath_sb_age, get_pheno_age, get_YingCaus_age, get_ZhangEn_age

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
2024-07-15 17:06:19.784 
  command:

    streamlit run C:\Users\aksha\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [38]:
def preprocess_df(path, clock):
    df = pd.read_csv(path, index_col='Organ')
    clocks = ['altum', 'han', 'hannum', 'horvath', 'horvath_SB', 'pheno','ying', 'zhang', 'EnsembleNaive', 'EnsembleLR']
    df = df[clocks[:-2] + ['Real Age']]
    #Add Ensemble column that averages columns of 6 clocks
    #Add EnsembleLR column that multiplies each coefficient by a column value + intercept to get LR results
    df['EnsembleNaive'] = df.iloc[:, [0,1,3,4,6,7]].mean(axis=1)
    # coef_series = pd.Series([0.3437195049507392, -0.20633851531241806, -0.5173259165634722, 0.2836010547077375, 0.1857810346592119, -0.02208310995460718, -0.08526975203798473, 0.5640149630320757], index=df.columns)
    # df['EnsembleLR'] = df[:, :-2].mul(coef_series, axis=1).sum(axis=1) + (19.275324793823557)
    print(df)
    return df

def get_mae(path, organs=['Blood', 'Breast', 'Kidney', 'Lung', 'Muscle', 'Ovary', 'Prostate', 'Testis', 'Colon']):
    errors = {}
    df = path
    for organ in organs:
        array = df.loc[organ].to_numpy()
        # Calculate the number of prediction columns (m)
        m = array.shape[1] - 1  # Number of columns minus the last column (ground truth)

        # Initialize an array to store absolute errors for each row
        absolute_errors = np.zeros((array.shape[0], m))

        # Compute absolute errors for each row
        for i in range(array.shape[0]):
            predictions = array[i, :m]  # First m columns are predictions
            ground_truth = array[i, -1]  # Last column is ground truth
            
            # Compute absolute errors
            absolute_errors[i] = np.abs(predictions - ground_truth)
        errors[organ] = np.median(absolute_errors, axis=0).tolist()
    return errors

def get_aa(path, organs=['Blood', 'Breast', 'Kidney', 'Lung', 'Muscle', 'Ovary', 'Prostate', 'Testis', 'Colon']):
    errors = {}
    df = path
    for organ in organs:
        array = df.loc[organ].to_numpy()
        # Calculate the number of prediction columns (m)
        m = array.shape[1] - 1  # Number of columns minus the last column (ground truth)

        # Initialize an array to store absolute errors for each row
        absolute_errors = np.zeros((array.shape[0], m))

        # Compute absolute errors for each row
        for i in range(array.shape[0]):
            predictions = array[i, :m]  # First m columns are predictions
            ground_truth = array[i, -1]  # Last column is ground truth
            
            # Compute absolute errors
            absolute_errors[i] = predictions - ground_truth
        errors[organ] = np.mean(absolute_errors, axis=0).tolist()
    return errors


def draw_error_chart(df, clock_names):
    metric = 'Median Absolute Error'
    clock_labels = clock_names
    organs = ['Blood', 'Breast', 'Kidney', 'Lung', 'Muscle', 'Ovary', 'Prostate', 'Testis', 'Colon']
    data = []
    color_labels = ['red', 'green', 'blue', 'orange', 'purple', 'magenta', 'cyan', 'black', 'pink']
    arr = get_mae(df)
    for organ in organs:
        data.append(arr[organ])
    organs[0] = 'Whole Blood'
    data = np.array(data)
    sorted_indices = np.argsort(data[:,-1])
    data = data[sorted_indices]
    organs = np.array(organs)[sorted_indices].tolist()
    layout = go.Layout(
        paper_bgcolor='white',
        plot_bgcolor='white',
        width=1000,
        height=800,
        xaxis=dict(title='Organs'),
        yaxis=dict(title=metric),
        shapes=[
            dict(
                type='rect',
                xref='paper',
                yref='paper',
                x0=0,
                y0=0,
                x1=1,
                y1=1,
                line=dict(color='black', width=1)
            )
        ]
    )

    # Create figure object
    fig = go.Figure(layout=layout)

    # Loop through each row of the data array and create scatter traces
    for i in range(len(data[0])):
        if i != len(data) - 1:
            marker_symbol = 'circle-open'
        else:
            marker_symbol = 'diamond'
        fig.add_trace(go.Scatter(
            x=organs,
            y=data[:, i],
            mode='lines+markers',
            name=clock_labels[i],
            line=dict(color=color_labels[i]),  # Assigning a specific color from color_labels
            marker=dict(symbol=marker_symbol, size=10, color=color_labels[i])
        ))

    fig.show()

def draw_error_plot(df, clock_names):
    metric = 'Median Absolute Error'
    clock_labels = clock_names
    organs = ['Blood', 'Breast', 'Kidney', 'Lung', 'Muscle', 'Ovary', 'Prostate', 'Testis', 'Colon']
    data = []
    color_labels = ['red', 'green', 'blue', 'orange', 'purple', 'magenta', 'cyan', 'pink', 'black']
    arr = {clock_labels[i]: [row[i] for row in get_mae(df).values()] for i in range(len(clock_labels))}
    for organ in clock_labels:
        data.append(arr[organ])
    data = np.array(data)

    layout = go.Layout(
        paper_bgcolor='white',  # Set the background color to white
        plot_bgcolor='white',   # Set the plot area background color to white
        width=1000,               # Set the width of the entire plot (in pixels)
        height=800,              # Set the height of the entire plot (in pixels)
        #title= title,
        xaxis=dict(title='Clocks'),
        yaxis=dict(title=metric),
        boxgap=0.5,       # Adjust the box gap
        boxgroupgap=0.5 ,  # Adjust the box group gap
        shapes=[
            # Rectangle shape for the plot area outline
            dict(
                type='rect',
                xref='paper',  # The x reference is set to 'paper' for relative coordinates
                yref='paper',  # The y reference is set to 'paper' for relative coordinates
                x0=0,          # x-coordinate of the lower left corner
                y0=0,          # y-coordinate of the lower left corner
                x1=1,          # x-coordinate of the upper right corner
                y1=1,          # y-coordinate of the upper right corner
                line=dict(
                    color='black',  # Border color (you can use any valid color name or HEX code)
                    width=1         # Border width in pixels
                )
            ),
            # dict(
            #    type='line',
            #    xref='paper',
            #    x0=0.02,  # You can set the x-coordinate range for the line if needed
            #    x1=0.98,
            #    y0=0,
            #    y1=0,
            #    line=dict(color='red', width=2),  # Set the color and width of the line
            # )
        ]
    )

    # Create the box plot
    fig = go.Figure(layout=layout)

    
    for i, values in enumerate(data):
            fig.add_trace(go.Box(y=values, name=clock_labels[i], showlegend=False, boxpoints=False, line=dict(color='gray')))

    
    y_coords = []
    for i in range(len(clock_labels)):
        for j in range(len(organs)):
            symbol = ['circle']
            size=7
            width=2
                
            if i == 0:
                fig.add_trace(go.Scatter(
                    x=[clock_labels[i]],
                    y=[arr[clock_labels[i]][j]],
                    mode='markers',
                    marker=dict(color='rgba(0, 0, 0, 0)',symbol=symbol, size=size, line=dict(width=width, color=[color_labels[j]])),
                    name=organs[j],
                    showlegend=True,
                ))
                y_coords.append(arr[clock_labels[i]][j])
            else:
                fig.add_trace(go.Scatter(
                    x=[clock_labels[i]],
                    y=[arr[clock_labels[i]][j]],
                    mode='markers',
                    marker=dict(color='rgba(0, 0, 0, 0)',symbol=symbol, size=size, line=dict(width=width, color=[color_labels[j]])),
                    name=organs[j],
                    showlegend=False,
                ))
                y_coords.append(arr[clock_labels[i]][j])

    fig.update_layout(legend=dict(x=1.02, y=0.96, font=dict(size=14)))
    fig.update_xaxes(tickfont=dict(size=20), titlefont=dict(size=25)) 
    fig.update_yaxes(tickfont=dict(size=20), titlefont=dict(size=25)) 
    fig.show()

def draw_accel_plot(df, clock_names):
    metric = 'Age Acceleration'
    clock_labels = clock_names
    point_colors = ['red', 'green', 'blue', 'orange', 'purple', 'magenta', 'lime', 'navy', 'brown', 'cyan']
    organs = ['Blood', 'Breast', 'Kidney', 'Lung', 'Muscle', 'Ovary', 'Prostate', 'Testis', 'Colon']
    data = []
    
    # arr = {clock_labels[i]: [row[i] for row in get_aa(df).values()] for i in range(len(clock_labels))}
    arr= get_aa(df)
    for organ in organs:
        data.append(arr[organ])
    data = np.array(data)
    layout = go.Layout(
        paper_bgcolor='white',  # Set the background color to white
        plot_bgcolor='white',   # Set the plot area background color to white
        width=1000,               # Set the width of the entire plot (in pixels)
        height=800,              # Set the height of the entire plot (in pixels)
        #title= title,
        xaxis=dict(title='Organs'),
        yaxis=dict(title=metric),
        boxgap=0.5,       # Adjust the box gap
        boxgroupgap=0.5 ,  # Adjust the box group gap
        shapes=[
            # Rectangle shape for the plot area outline
            dict(
                type='rect',
                xref='paper',  # The x reference is set to 'paper' for relative coordinates
                yref='paper',  # The y reference is set to 'paper' for relative coordinates
                x0=0,          # x-coordinate of the lower left corner
                y0=0,          # y-coordinate of the lower left corner
                x1=1,          # x-coordinate of the upper right corner
                y1=1,          # y-coordinate of the upper right corner
                line=dict(
                    color='black',  # Border color (you can use any valid color name or HEX code)
                    width=1         # Border width in pixels
                )
            ),
            # dict(
            #    type='line',
            #    xref='paper',
            #    x0=0.02,  # You can set the x-coordinate range for the line if needed
            #    x1=0.98,
            #    y0=0,
            #    y1=0,
            #    line=dict(color='red', width=2),  # Set the color and width of the line
            # )
        ]
    )

    # Create the box plot
    fig = go.Figure(layout=layout)

    

    for i, values in enumerate(data):
        if i == 0:
            fig.add_trace(go.Box(y=values, name='Whole Blood', showlegend=False, boxpoints=False, line=dict(color='gray')))
        else:
            fig.add_trace(go.Box(y=values, name=organs[i], showlegend=False, boxpoints=False, line=dict(color='gray')))

    
    y_coords = []
    for i in range(len(organs)):
        for j in range(len(clock_labels)):
            if j != len(clock_labels) - 1:
                symbol = ['circle']
                size=7
                width=2
            else:
                symbol = ['diamond']
                size=9
                width=2
                
            if i == 0:
                fig.add_trace(go.Scatter(
                    x=["Whole Blood"],
                    y=[arr[organs[i]][j]],
                    mode='markers',
                    marker=dict(color='rgba(0, 0, 0, 0)',symbol=symbol, size=size, line=dict(width=width, color=[point_colors[j]])),
                    name=clock_labels[j],
                    showlegend=True,
                ))
                y_coords.append(arr[organs[i]][j])
            else:
                fig.add_trace(go.Scatter(
                    x=[organs[i]],
                    y=[arr[organs[i]][j]],
                    mode='markers',
                    marker=dict(color='rgba(0, 0, 0, 0)',symbol=symbol, size=size, line=dict(width=width, color=[point_colors[j]])),
                    name=clock_labels[j],
                    showlegend=False,
                ))
                y_coords.append(arr[organs[i]][j]) 

    fig.update_layout(legend=dict(x=1.02, y=0.96, font=dict(size=14)))
    fig.update_xaxes(tickfont=dict(size=20), titlefont=dict(size=25)) 
    fig.update_yaxes(tickfont=dict(size=20), titlefont=dict(size=25)) 
    fig.show()

def draw_plot_matrix(data_path, output_path, clocks):
    df = data_path
    clocks = clocks
    organs = ['Blood', 'Breast', 'Colon', 'Kidney', 'Lung', 'Muscle', 'Ovary', 'Prostate', 'Testis']
    for j in range(len(clocks)):
            for i in range(len(organs)):
                pred = df.loc[organs[i]].to_numpy()[:,j]
                truth = df.loc[organs[i]].to_numpy()[:,len(clocks)]
                plt.plot(truth, pred, 'o', color='black', mfc='none', markersize=6, markeredgewidth=3)
                lin_reg = LinearRegression(fit_intercept=True).fit(truth.reshape(-1,1), pred.reshape(-1,1))
                plt.plot(np.arange(0,100), (np.arange(0,100) * lin_reg.coef_[0]) + lin_reg.intercept_, color='red', label='slope = ' + str(round(float(lin_reg.coef_[0]), 2)), linewidth=3)
                plt.plot(np.arange(0,100), np.arange(0,100), color='blue', label='y=x', linewidth=3)
                if clocks[j] == 'EnsembleAge':
                    plt.title('EmsembleLR on ' + organs[i] + ' (N = ' + str(len(pred))+ ')')
                else:
                    plt.title(clocks[j] + ' on ' + organs[i] + ' (N = ' + str(len(pred))+ ')')
                plt.xlim(0,100)
                plt.ylim(0,100)
                plt.tick_params(axis='both', which='major', labelsize=10)
                plt.tick_params(axis='both', which='major', length=10, width=5)
                plt.gca().spines['left'].set_linewidth(3) 
                plt.gca().spines['bottom'].set_linewidth(3) 
                plt.gca().spines['top'].set_linewidth(3) 
                plt.gca().spines['right'].set_linewidth(3) 
                plt.legend(fontsize=10, loc='upper left')
                plt.xlabel("Chronological Age", fontsize=10)
                plt.ylabel("Predicted Age", fontsize=10)
                plt.savefig('plots_dir/' + str(i) + '_' + str(j) + '.png')
                # if clocks[j] == 'EnsembleAge':
                #     plt.show()
                plt.clf()

    # Dimensions of each image (assuming they are all the same size)
    image_width = 640  # Adjust according to your image size
    image_height = 480  # Adjust according to your image size

    # Number of rows and columns in the grid
    rows = len(organs)  # 0 to 8 inclusive
    cols = len(clocks)  # 0 to 7 inclusive

    # Create a blank canvas for the final image grid
    final_image = Image.new('RGB', (cols * image_width, rows * image_height))

    # Iterate over each image and paste it into the final grid
    for n in range(rows):
        for m in range(cols):
            # Load each image
            image_path = f"plots_dir/{n}_{m}.png"  # Replace with your actual directory path
            img = Image.open(image_path)
            
            # Calculate the position to paste the current image
            x_offset = m * image_width
            y_offset = n * image_height
            
            # Paste the current image into the final image grid
            final_image.paste(img, (x_offset, y_offset))

    # Save the final image grid
    final_image.save(output_path)  # Replace with your desired save path

Averaging Clocks code

In [None]:
# plot_matrix_output_path = "plots_dir/grid_image.png"
# data_path = 'all_clocks_res.csv'
# clocks = ['altum', 'han', 'hannum', 'horvath', 'horvath_SB', 'pheno','ying', 'zhang', 'EnsembleAge']
# clock_names = ['Altum', 'Han2020', 'Hannum', 'Horvath', 'Skin Blood','Pheno' ,'YingCausal', 'ZhangEn', 'EnsembleAge']
# df = preprocess_df(data_path, clocks)
# draw_error_plot(df, clock_names)
# draw_accel_plot(df, clock_names)
# draw_plot_matrix(df, plot_matrix_output_path, clocks)

In [None]:
# plot_matrix_output_path = "plots_dir/grid_image.png"
# data_path = 'all_clocks_res.csv'
# clocks = ['altum', 'han', 'hannum', 'horvath', 'horvath_SB', 'ying', 'zhang', 'EnsembleAge']
# clock_names = ['Altum', 'Han2020', 'Hannum', 'Horvath', 'Skin Blood', 'YingCausal', 'ZhangEn', 'EnsembleAge']
# df = preprocess_df(data_path, clocks)
# draw_error_plot(df, clock_names)
# draw_accel_plot(df, clock_names)
# draw_plot_matrix(df, plot_matrix_output_path, clocks)

In [37]:
plot_matrix_output_path = "plots_dir/grid_image.png"
data_path = 'all_clocks_res.csv'
# clocks = ['altum', 'han', 'horvath', 'horvath_SB', 'ying', 'zhang', 'EnsembleAge']
# clock_names = ['Altum', 'Han2020', 'Horvath', 'Skin Blood', 'YingCausal', 'ZhangEn', 'EnsembleAge']
clocks = ['altum', 'han', 'hannum', 'horvath', 'horvath_SB', 'pheno','ying', 'zhang', 'EnsembleNaive', 'EnsembleLR']
clock_names = ['Altum', 'Han2020', 'Hannum', 'Horvath', 'Skin Blood','Pheno' ,'YingCausal', 'Zhang2019', 'EnsembleNaive', 'EnsembleLR']
df = preprocess_df(data_path, clocks)
draw_error_plot(df, clock_names)
draw_accel_plot(df, clock_names)
draw_plot_matrix(df, plot_matrix_output_path, clocks)

IndexError: list index out of range

Lin Reg Code

In [33]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.linear_model import Ridge

df = pd.read_csv(data_path, index_col='Organ')
test_df = pd.DataFrame(columns=['altum', 'han', 'hannum', 'horvath', 'horvath_SB', 'pheno', 'ying', 'zhang', 'Real Age'])
train_df = pd.DataFrame(columns=['altum', 'han', 'hannum', 'horvath', 'horvath_SB', 'pheno', 'ying', 'zhang', 'Real Age'])
X_train_all = []
Y_train_all = []
for organ in ['Blood', 'Breast', 'Colon', 'Kidney', 'Lung', 'Muscle', 'Ovary', 'Prostate', 'Testis']:
    df  = df[['altum', 'han', 'hannum', 'horvath', 'horvath_SB', 'pheno', 'ying', 'zhang', 'Real Age']]
    df['EnsembleNaive'] = df.iloc[:, :-1].mean(axis=1)
    Y = df['Real Age'].loc[organ].to_numpy()
    X = df[['altum', 'han', 'hannum', 'horvath', 'horvath_SB', 'pheno', 'ying', 'zhang']].loc[organ].to_numpy()
    label_counts = Counter(Y)

    threshold = 5
    more_than_treshold = [label for label, count in label_counts.items() if count > threshold]
    less_equal_threshold = [label for label, count in label_counts.items() if count <= threshold]

    X_more_than_treshold = X[np.isin(Y, more_than_treshold)]
    Y_more_than_treshold = Y[np.isin(Y, more_than_treshold)]

    X_less_equal_treshold = X[np.isin(Y, less_equal_threshold)]
    Y_less_equal_treshold = Y[np.isin(Y, less_equal_threshold)]

    X_smote, Y_smote = SMOTE(random_state=42, k_neighbors=threshold).fit_resample(X_more_than_treshold, Y_more_than_treshold)

    X_balanced =  np.concatenate((X_smote, X_less_equal_treshold), axis=0)
    Y_balanced =  np.concatenate((Y_smote, Y_less_equal_treshold), axis=0)

    X_train, X_test, Y_train, Y_test = train_test_split(X_balanced, Y_balanced, test_size=0.4, random_state=42)

    def find_indices_not_in_X2(X1, X2):
        # Convert arrays to tuples of rows for comparison
        X1_tuples = [tuple(row) for row in X1]
        X2_tuples = [tuple(row) for row in X2]
        
        # Find indices where rows in X1 are not in X2
        indices_not_in_X2 = np.where(~np.isin(X1_tuples, X2_tuples))[0]
        
        return indices_not_in_X2
    
    indice = find_indices_not_in_X2(X, X_train)[::8]
    X_test = X[indice]
    Y_test = Y[indice]

    for i in range(len(X_test)):
        new_row = pd.DataFrame(columns=['altum', 'han', 'hannum', 'horvath', 'horvath_SB', 'pheno', 'ying', 'zhang', 'Real Age'])
        new_row.loc[organ] = X_test[i].tolist() + [float(Y_test[i])]
        test_df = pd.concat((test_df, new_row))

    for i in range(len(X_train)):
        X_train_all.append(X_train[i].tolist())
        Y_train_all.append(Y_train[i].tolist())

X_train_all = np.array(X_train_all)
Y_train_all = np.array(Y_train_all)

model = Ridge(alpha=0.1, fit_intercept=True)
model.fit(X_train_all, Y_train_all)

preds = model.predict(test_df.values[:, :-1])
new_col = pd.Series(preds, name='EnsembleLR')
new_col.index = test_df.index
test_df.insert(len(test_df.columns) - 1, 'EnsembleLR', new_col)

clocks = ['altum', 'han', 'hannum', 'horvath', 'horvath_SB', 'pheno', 'ying', 'zhang', 'EnsembleNaive']
clock_names = ['Altum', 'Han2020', 'Hannum', 'Horvath', 'Skin Blood', 'Pheno', 'YingCausal', 'Zhang2019', 'EnsembleNaive']

# for i in range(len(clock_names[:-1])):
#     print(clock_names[i] + ' coefficient: ' + str(model.coef_[i]))
# print('intercept: ' + str(model.intercept_))

draw_error_plot(test_df, clock_names)
draw_accel_plot(test_df, clock_names)
draw_plot_matrix(test_df, plot_matrix_output_path, clocks)
    

<Figure size 640x480 with 0 Axes>

Pearson code

In [132]:
from scipy.stats import pearsonr

clocks = ['altum', 'han', 'horvath', 'horvath_SB', 'ying', 'zhang']
def get_true_age_labels(file):
    age_data = pd.read_table(file).iloc[9,1:]
    ages = []
    for i in range(len(age_data)):
        ages.append(int(age_data[i].split('-')[0])+5)
    gender_data = pd.read_table(file).iloc[6,1:]
    genders = []
    for i in range(len(gender_data)):
        genders.append(int(gender_data[i]) - 1)
    return np.array(ages), np.array(genders)

cpgs_list = []
for clock in clocks:
    with open('tools/' + clock + '_cpgs_list.txt', 'r') as file:
        lines = file.readlines()
        lines = [line.strip() for line in lines]
        cpgs_list = cpgs_list + lines
cpgs_list = list(set(cpgs_list))

organs = ['Blood', 'Breast', 'Colon', 'Kidney', 'Lung', 'Muscle', 'Ovary', 'Prostate', 'Testis', 'All Data']
all_methyl_df = pd.DataFrame()
all_pearsons = []
for organ in organs[:-1]:
    df = pd.read_table('input/GTEx/GTEx_' + organ + '.meth.csv')

    # mean_vals = df.iloc[:, - (df.shape[1] - 1):].mean().tolist()
    # missing_cpgs = set(cpgs_list) - set(df.iloc[:, 0].tolist())
    # print(len(missing_cpgs))
    real_age, real_gender = get_true_age_labels('input/GTEx/GTEx_' + organ + '.anno.csv')
    # for value in missing_cpgs:
    #     new_row = [value] +  mean_vals # Create a new row with value and 0.5 in other columns
    #     df.loc[len(df)] = new_row  # Add new row to DataFrame
    df = df.set_index(df.columns[0])
    df = df.loc[list(set(cpgs_list) - set(['cg21298523', 'cg15443907', 'cg03046819', 'cg20775254', 'cg26045205', 'cg25098644', 'cg08122369', 'cg20728496', 'cg03302287', 'cg04376617', 'cg15241130', 'cg05966235', 'cg11911418', 'cg26147845', 'cg25365379', 'cg03600687', 'cg11545887', 'cg25428494', 'cg04117338', 'cg02105377', 'cg06493612', 'cg25788012', 'cg22872478', 'cg27391693', 'cg17373751', 'cg21939482', 'cg22681495', 'cg19072037', 'cg06275642', 'cg08173606', 'cg22051763', 'cg03476370', 'cg17990871', 'cg16094954', 'cg24341129', 'cg17263013', 'cg01994779', 'cg10058540', 'cg12737574', 'cg16007185', 'cg17344932', 'cg02901139', 'cg27646965', 'cg19273182', 'cg14613972', 'cg03438101', 'cg00547018', 'cg05473175', 'cg07398350', 'cg06506864', 'cg12212060', 'cg09573389', 'cg24079702', 'cg04785213', 'cg00659129', 'cg05205664', 'cg13234848', 'cg04219321', 'cg14178895', 'cg14138171', 'cg17543123', 'cg25999267', 'cg08927738', 'cg24088229', 'cg26266098', 'cg03887528', 'cg05321960', 'cg03747695', 'cg16562257', 'cg17563769', 'cg04820387', 'cg19835478', 'cg09607276', 'cg16677885', 'cg05168404', 'cg24117442', 'cg09785172', 'cg08797606', 'cg12578480', 'cg23280730', 'cg26069745', 'cg15411984', 'cg09361966', 'cg02304930', 'cg15488251', 'cg19955500', 'cg05087948', 'cg06179486', 'cg20932765', 'cg14329157', 'cg21618439', 'cg18427589', 'cg26936171', 'cg25152404', 'cg07613278', 'cg10247252', 'cg15572787', 'cg25149927', 'cg04187545', 'cg08332662', 'cg18644286', 'cg15747933', 'cg17272642', 'cg05238695', 'cg11954355', 'cg11126134', 'cg27376817', 'cg09896106', 'cg11670211', 'cg03785807', 'cg26364871', 'cg09087966', 'cg13735974', 'cg14927277', 'cg12758687', 'cg12042659', 'cg11405695', 'cg08415508', 'cg02600515', 'cg10450322', 'cg25302370', 'cg01481441', 'cg21820677', 'cg02309273', 'cg10453040', 'cg11877382', 'cg15605888', 'cg07155684', 'cg23698058', 'cg22215631', 'cg01773854', 'cg15863539', 'cg12629325', 'cg15596301', 'cg24471894', 'cg11781389', 'cg20147046', 'cg08025960', 'cg06291334', 'cg07903918', 'cg14700821', 'cg23036025', 'cg25932066', 'cg22377237', 'cg16341373', 'cg27051315', 'cg18619831', 'cg15739944', 'cg26357453', 'cg16779976', 'cg14932684', 'cg00210842', 'cg06665322', 'cg04126427', 'cg11946583', 'cg13372488', 'ch.8.353716R', 'cg12148898', 'cg12542604', 'cg17729941', 'cg27566805', 'cg13485809', 'cg14472778', 'cg09374949', 'cg14047008', 'cg18413900', 'cg10146929', 'cg06658468', 'cg22697325', 'cg21492308', 'cg22021786', 'cg17408647', 'cg20284673', 'cg11326793', 'cg16796590', 'cg17092956', 'cg21092324', 'cg04497885', 'cg18811423', 'cg10958002', 'cg22295573', 'cg27015174', 'cg08724517', 'cg23018448', 'cg11593656', 'cg21527708', 'cg25007680', 'cg06672696', 'cg09296044', 'cg07979752', 'cg13326338', 'cg27516159', 'cg23634477', 'cg12316010', 'cg22942200', 'cg15707568', 'cg17298973', 'cg13767001', 'cg18050997', 'cg15673110', 'cg12914014', 'cg22889918', 'cg17683775', 'cg24532669', 'cg19285688', 'cg27016307', 'cg09338170', 'cg01511567', 'cg27096232', 'cg18219226', 'cg17681698', 'cg12288726', 'cg04616566', 'cg13722123', 'cg05564251', 'cg13058581', 'cg06366981', 'cg12513379', 'cg04033650', 'cg13634678', 'cg09375488', 'cg18722841', 'cg23032316', 'cg25859012', 'cg05467458', 'cg10809491', 'cg08797194', 'cg22800631', 'cg19154173', 'cg24445405', 'cg01988129', 'cg19953038', 'cg20496643', 'cg08661227', 'cg24792272', 'cg05779068', 'cg03664992', 'cg08931376', 'cg06725035', 'cg02757432', 'cg15481539', 'cg10025865', 'cg23226134', 'cg18392482', 'ch.4.73355803R', 'cg21611708', 'cg04838627', 'cg12365667', 'cg09009259', 'cg09279566', 'cg13202122', 'cg26795848', 'cg00200653', 'cg01491225', 'cg12179288', 'cg12126248', 'cg18248112', 'cg01578341', 'cg25830305', 'cg14839932', 'cg07009002', 'cg03844971', 'cg21922841', 'cg16001460', 'cg00461841', 'cg16494477', 'cg18190433', 'cg09453737', 'cg17459635', 'cg07274506', 'cg21642649', 'cg00155167', 'cg10878896', 'cg13258563', 'cg25770948', 'cg24171453', 'cg18125510', 'cg02306162', 'cg15383520', 'cg25985103', 'cg06547285', 'cg26168651', 'cg12830829', 'cg03164928', 'cg07844021', 'cg15869022', 'cg23337754', 'cg16250754', 'cg02065387', 'cg15727320', 'cg10409680', 'cg08859916', 'cg05260372', 'cg26637901', 'cg13654195', 'cg23306832', 'cg19167673', 'cg27655905', 'cg20287640', 'cg06148175', 'cg23896056', 'cg02342494', 'cg09937438', 'cg25922239', 'cg12644353', 'cg05470939', 'cg17436656', 'cg13410437', 'cg09872233', 'cg18953280', 'cg15736336', 'cg11608114', 'cg20051177', 'cg11484872', 'cg21206959', 'cg23260993', 'cg17390562', 'cg18357645', 'cg04063348', 'cg27321750', 'cg04619859', 'cg25418831', 'cg20532887', 'cg25667997', 'cg24832140', 'cg05189291', 'cg16257091', 'cg25527547', 'cg14916288', 'cg25425078', 'cg16173109', 'cg24977886', 'cg04762213', 'cg07295034', 'cg26530200', 'cg26647453', 'cg06386533', 'cg02965178', 'cg25809722', 'cg16185365', 'cg26820922', 'cg08935003', 'cg26199493', 'cg26790132', 'cg03684977', 'cg22283058', 'cg15798530', 'cg03382304', 'cg09752703', 'cg12188416', 'cg19945840', 'cg19904653', 'cg10253371', 'cg21642251', 'cg13444538', 'cg12556991', 'cg18473521', 'cg12666263', 'cg07390013', 'cg01035616', 'cg14018471', 'cg11654620', 'cg25990230', 'cg20704028', 'cg24949488', 'cg17890764', 'cg24010402', 'cg01813965', 'cg23408913', 'cg04229238', 'cg06356454', 'cg20525917', 'cg15903421', 'cg25732028', 'cg27519424', 'cg15622917', 'cg18137414', 'cg01889448', 'cg11655691', 'cg24735937', 'cg25893857', 'cg00917893', 'cg22464186', 'cg26729026', 'ch.2.30415474F', 'cg03750407', 'cg06995715', 'cg24400943', 'cg19632760', 'cg16511841', 'cg03909500', 'cg09974041', 'cg14795305', 'cg22680204', 'cg10305797', 'cg03203114', 'cg18139900', 'cg14593290', 'cg21329085', 'cg15977816', 'cg01311051', 'cg04512892', 'cg07319199', 'cg21504624', 'cg18196295', 'cg03085637', 'cg07707498', 'cg23792364', 'cg13645811', 'cg00648153', 'cg24176037', 'cg17514226', 'cg08212685', 'cg06117855', 'cg15214092', 'cg22814929', 'cg07303143', 'cg03807314', 'cg19831575', 'cg12257692', 'cg10773869', 'cg22040809', 'cg09386615', 'cg08674093', 'cg17701886', 'cg25219134', 'cg14133708', 'cg01675895', 'cg05380910', 'cg16721202', 'cg24934400', 'cg16427670', 'cg20957370', 'cg25473981', 'cg23054676', 'cg06537230', 'cg06737494', 'ch.17.1184801R', 'cg00398048', 'cg15329467', 'cg11620135', 'cg20326410', 'cg09929564', 'cg21165219', 'cg00630583', 'cg20368283', 'cg18587364', 'cg08688335', 'cg24084891', 'cg09651136', 'cg06459327', 'cg13243219', 'cg10999479', 'cg08166232', 'cg09450197', 'cg25519723', 'cg12535715', 'cg19812283', 'cg02501978', 'cg01817393', 'cg03165700', 'cg10110474', 'cg01381846', 'cg02831393', 'cg02904235', 'cg11913104', 'cg12646585', 'cg01776246', 'cg01371477', 'cg04092800', 'cg23274244', 'ch.13.39564907R', 'cg19475108', 'cg05727959', 'cg02916816', 'cg13682722', 'cg27091343', 'cg03883502', 'cg09869858', 'cg10367730', 'cg04743872', 'cg04368877', 'cg23957915', 'cg12067287', 'cg08521225', 'cg05313261', 'cg26767761', 'cg12414301', 'cg17582250', 'cg26235243', 'cg04856689', 'cg03986400', 'cg19569684', 'cg02284889', 'cg14802310', 'cg11189837', 'cg13131015', 'cg12952136', 'cg06851000', 'cg22807700', 'cg12835684', 'cg22825487', 'cg05172940', 'cg14423778', 'cg00436282', 'cg04121771', 'cg02724472', 'cg25399541', 'cg05200311', 'cg02121943', 'cg01630869', 'cg00650762', 'cg13633026', 'cg17904739', 'cg16551261', 'cg16689634', 'cg06150803', 'cg27519373', 'cg03604424', 'cg06024411', 'cg18003135', 'cg04338788', 'cg04451175', 'cg09871315', 'cg21678388', 'cg12991341', 'cg04752565', 'cg18202456', 'cg26311454', 'cg16592658', 'cg02654291', 'cg05411032', 'cg27567593', 'cg26020513', 'cg01105058', 'cg15787227', 'cg21289015', 'cg20245568', 'cg14236602', 'cg05726109', 'cg15086884', 'cg15792688', 'cg23240961', 'cg15824080', 'cg05669210', 'cg03454353', 'cg19039841', 'cg15270892', 'cg18384097', 'cg24497877', 'cg12696750', 'cg17576375', 'cg07984980', 'cg14149007', 'cg09205751', 'cg04267526', 'cg14318370', 'cg18704595', 'cg12884406', 'cg08462924', 'cg21249152', 'cg15605172', 'cg20881910', 'cg02430692', 'cg12643449', 'cg22225219', 'cg02475653', 'cg08576197', 'cg09868597', 'cg27398547', 'cg06251129', 'cg16310717', 'cg14361627', 'cg27529647', 'cg11388238', 'cg14241323', 'cg23114594', 'cg14426525', 'cg23858360', 'cg24101359', 'cg14981132', 'cg18292394', 'cg18641050', 'cg20263942', 'cg20861237', 'cg21667943', 'cg06151964', 'cg04705866', 'cg10000775', 'cg08089301', 'cg17754980', 'cg17133183', 'cg12419685', 'cg01488147', 'cg08426384', 'cg01892695', 'cg07928695', 'cg01503516', 'cg24760922', 'cg19767249', 'cg12187213', 'cg17304433', 'cg20969242', 'cg16280313', 'cg03221914', 'cg19046959', 'cg25802871', 'cg26809210', 'cg23690166', 'cg18775149', 'cg20795863', 'cg22926560', 'cg10365880', 'cg14948436', 'cg07809027', 'cg01533387', 'cg15565533', 'cg17920197', 'cg03102516', 'cg05164185', 'cg13080465', 'cg15213491', 'cg03148461', 'cg23735442', 'cg25598083', 'cg13297960', 'cg20789691', 'cg15687659', 'cg12638745', 'cg24558204', 'cg03834467', 'cg25969212', 'cg14614643', 'cg08108311', 'cg23632840', 'cg25956985', 'cg20346096', 'cg06007645', 'cg08671671', 'cg26935333', 'cg09638834', 'cg08662074', 'cg01990334', 'cg25762395', 'cg08596544', 'cg16591681', 'cg25339052', 'cg27175491', 'cg06491116', 'cg17416146', 'cg17607024', 'cg02972551', 'cg03848890', 'cg23067299', 'cg06321883', 'cg01808130', 'cg17895873', 'cg00149659', 'cg22341104', 'cg15597540', 'cg12624523', 'cg00432979', 'cg02655623', 'cg11180122', 'cg14672293', 'cg21968169', 'cg00280814', 'cg25101056', 'cg10557683', 'cg17646721', 'cg14091223', 'cg22995176', 'cg06718696', 'cg16361890', 'cg19595170', 'cg14671488', 'cg20023578', 'cg26764555', 'cg02888247', 'cg26097271', 'cg12188860', 'cg17353431', 'cg20780880', 'cg13098855', 'cg11846333', 'cg05590257', 'cg17304878', 'cg16004055', 'cg02767634', 'cg01139966', 'cg16098332', 'cg01971089', 'cg21139312', 'cg07927379', 'cg17352004', 'cg20011974', 'cg16762979', 'cg20139214', 'cg27436995', 'cg24903376', 'cg02756614', 'cg17848389', 'cg23074747', 'cg26850754', 'cg15078479', 'cg24481633', 'cg07503829', 'cg08129490', 'cg02646854', 'cg07207937', 'cg09688763', 'cg17851105', 'cg21712678', 'cg13877915', 'cg08126211', 'cg08198370', 'cg00214855', 'cg07414384', 'cg03955296', 'cg16633951', 'cg10021735', 'cg03264414', 'cg23285059', 'cg23282674', 'cg01738984', 'cg01036173', 'cg21754343', 'cg20627916', 'cg09559551', 'cg12503243', 'cg19692192', 'cg08228917', 'cg12368241', 'cg24687051', 'cg04599297', 'cg04431054', 'cg00309204', 'cg25017250', 'cg26665419', 'cg03823084', 'cg19455368', 'cg19511338', 'cg11244402', 'cg12627583', 'cg11337525', 'cg27363310', 'cg25172835', 'cg06885782', 'cg23520347', 'cg00551244', 'cg07155455', 'cg15269875', 'cg11114344', 'cg08578305', 'cg13882835', 'cg14545899', 'cg14372394', 'cg01725199', 'cg17971003', 'cg15350455', 'cg16121444', 'cg12274479', 'cg27319898', 'cg04001802', 'cg01936270', 'cg20525378', 'ch.2.75889792R', 'cg12403575', 'cg15308737', 'cg20585500', 'cg24034289', 'cg05130485', 'cg14800883', 'cg19356324', 'cg15316289']))]
    df.loc['Real Age'] = real_age.tolist()
    df = df.T
    if organ == 'Blood':
        all_methyl_df = df
    else:
        all_methyl_df = pd.concat((all_methyl_df, df))

    # Extract Y label as a numpy array
    y_label = df.iloc[:, -1].values

    # Compute Pearson correlation coefficient for each feature with Y label
    correlations = {}
    for column in df.columns[:-1]:  # Exclude the last column (Y label)
        feature_values = df[column].values
        corr_coefficient, _ = pearsonr(feature_values, y_label)
        correlations[column] = corr_coefficient
    all_pearsons.append(correlations)

df = all_methyl_df
# Extract Y label as a numpy array
y_label = df.iloc[:, -1].values

# Compute Pearson correlation coefficient for each feature with Y label
correlations = {}
for column in df.columns[:-1]:  # Exclude the last column (Y label)
    feature_values = df[column].values
    corr_coefficient, _ = pearsonr(feature_values, y_label)
    correlations[column] = corr_coefficient
all_pearsons.append(correlations)
print(all_pearsons)
print(len(all_pearsons))

[{'cg08460435': 0.10148350063958135, 'cg20047055': -0.2661806188787035, 'cg11754095': 0.2891596388587596, 'cg17264470': -0.13620771467974668, 'cg11399590': -0.042924978395941726, 'cg18075299': -0.013332375213031171, 'cg01124961': -0.32123508869197487, 'cg10498798': -0.5003334322134863, 'cg26453990': -0.13002032948355102, 'cg24490859': 0.029776515761182308, 'cg03446427': -0.21507810921080325, 'cg13221588': -0.09982794931848576, 'cg03476195': 0.3484623264551295, 'cg00091693': -0.3182256646553659, 'cg00412772': 0.19386618839403558, 'cg04345475': 0.10203927125613652, 'cg02121427': -0.2271615842547479, 'cg12072803': 0.27186441550132656, 'cg27550918': -0.14383650509957768, 'cg12290002': 0.10693115933987088, 'cg24635866': 0.02364441536644829, 'cg13421924': 0.23804716626185418, 'cg18450227': -0.3450539990515004, 'cg24995240': -0.029279921241596424, 'cg17639290': -0.03496479056966113, 'cg13851211': -0.021600917783176427, 'cg23090046': 0.15359898119725746, 'cg01663968': 0.0018782489480124774, 'c

In [135]:
res = pd.DataFrame(columns = ['cpgs'] + [organ + ' pearson coeff' for organ in organs])
i = 0
for pearsons in all_pearsons:
    res['cpgs'] = pearsons.keys()
    res[organs[i] + ' pearson coeff'] = pearsons.values()
    i = i + 1
print(res)

             cpgs  Blood pearson coeff  Breast pearson coeff  \
0      cg08460435             0.101484              0.135750   
1      cg20047055            -0.266181              0.088344   
2      cg11754095             0.289160             -0.135743   
3      cg17264470            -0.136208             -0.074824   
4      cg11399590            -0.042925             -0.035040   
...           ...                  ...                   ...   
20856  cg04518155            -0.059203              0.235154   
20857  cg17877494             0.052294             -0.075447   
20858  cg11063110             0.170567             -0.205045   
20859  cg15652797            -0.420507             -0.069246   
20860  cg10313633             0.068235             -0.154975   

       Colon pearson coeff  Kidney pearson coeff  Lung pearson coeff  \
0                 0.311777              0.446898            0.416078   
1                -0.178312             -0.311198           -0.128218   
2              

In [136]:
res.to_csv('pearsons.csv')