In [1]:
import pandas as pd
import numpy as np

# Mice methylation&RNA data - delete duplicate genes and save as .csv

In [47]:
def dedupe_by_row_mean(frame):
    """Return `frame` with one row per index label, keeping the highest-mean row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose index may contain repeated labels.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (non-NaN) index label, ordered by sorted label.
        Ties are resolved in favour of the earliest row.
    """
    # Rows whose mean is NaN must never beat a row that has data.
    means = frame.mean(axis=1).fillna(-np.inf).to_numpy()

    # Key the means by row *position*. A label-based idxmax returns the gene
    # label itself, which cannot distinguish between rows sharing that label.
    means_by_position = pd.Series(means, index=np.arange(len(frame)))
    best_positions = means_by_position.groupby(frame.index.to_numpy()).idxmax()

    return frame.iloc[best_positions.to_numpy(dtype=int)]


for data_name in ["data_methylation_hm450", "data_mrna_seq_rpkm"]:
    print(f"Processing {data_name}")
    expression_data = pd.read_csv(f"../data/Mice/{data_name}.txt", index_col=0, sep='\t')
    if data_name == "data_methylation_hm450":
        # Only the methylation table has this column dropped
        # (presumably a gene-ID annotation rather than a sample; confirm).
        expression_data = expression_data.drop(columns=['Entrez_Gene_Id'])

    print(f"Original: {expression_data.shape}")

    # Count every row whose gene label occurs more than once.
    duplicated_rows = expression_data[expression_data.index.duplicated(keep=False)]
    print(f"duplicated rows: {len(duplicated_rows)}")

    # Keep, for each gene, the single row with the highest mean across columns.
    df_deduped = dedupe_by_row_mean(expression_data)
    assert df_deduped.index.is_unique, "gene labels must be unique after de-duplication"

    print("Deduplicated:", df_deduped.shape)

    # Save as CSV
    df_deduped.to_csv(f'../data/Mice/{data_name}.csv')


Processing data_methylation_hm450
Original: (25883, 73)
duplicated rows: 36
Deduplicated: (25865, 73)
Processing data_mrna_seq_rpkm
Original: (19552, 66)
duplicated rows: 8
Deduplicated: (19548, 66)


# Mice methylation data - Impute missing genes with mean

In [2]:
# Reference methylation matrix; the first CSV column becomes the index.
meth_df = pd.read_csv(
    '../data/Cleveland/methylation_imputed.csv',
    index_col=0,
)

# Mice methylation table; the first CSV column becomes the index.
mice_df = pd.read_csv(
    '../data/Mice/data_methylation_hm450.csv',
    index_col=0,
)

In [4]:
# Display the reference methylation frame read from
# '../data/Cleveland/methylation_imputed.csv' (pandas truncates the view).
meth_df

Unnamed: 0_level_0,A1BG,A2BP1,A2LD1,A4GALT,AAAS,AACS,AADAT,AAGAB,AAK1,AAMP,...,ZSWIM7,ZW10,ZWINT,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,psiTPTE22
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MEC1,0.78393,0.55490,0.74393,0.28846,0.19457,0.23815,0.58240,0.11336,0.13458,0.11345,...,0.09947,0.15068,0.13683,0.35715,0.70193,0.33222,0.26345,0.20008,0.18074,0.72662
M14,0.84968,0.27184,0.88745,0.55872,0.20010,0.27441,0.24414,0.14742,0.17265,0.13678,...,0.14274,0.19162,0.15129,0.32659,0.70403,0.32992,0.29607,0.21270,0.19070,0.40704
MDAMB134VI,0.84884,0.44047,0.58004,0.51544,0.16695,0.17913,0.43582,0.10513,0.12987,0.10102,...,0.09759,0.13494,0.09719,0.25859,0.63699,0.31373,0.27388,0.17380,0.14815,0.57372
MCC26,0.76965,0.39123,0.81830,0.40430,0.20149,0.23971,0.21317,0.12263,0.13767,0.11726,...,0.10332,0.16757,0.12046,0.33940,0.69271,0.34277,0.27065,0.18393,0.16858,0.50135
MCC13,0.77499,0.45356,0.79889,0.38255,0.20208,0.23711,0.23129,0.12546,0.13952,0.11780,...,0.11241,0.16395,0.12104,0.34009,0.67837,0.33435,0.27938,0.18776,0.16798,0.52900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ICC3,0.79764,0.43894,0.81737,0.39870,0.17114,0.21492,0.19939,0.11488,0.13343,0.10639,...,0.09735,0.14092,0.10310,0.32754,0.67901,0.33334,0.24941,0.17248,0.16001,0.54184
ICC9,0.68602,0.51602,0.79501,0.40044,0.16386,0.20731,0.26373,0.10301,0.13411,0.10000,...,0.08782,0.12934,0.10061,0.26869,0.69777,0.30774,0.22858,0.17495,0.15519,0.64120
ICC12,0.64823,0.46904,0.79287,0.33122,0.19561,0.23294,0.23066,0.11852,0.14525,0.11275,...,0.10428,0.15722,0.11912,0.29987,0.72420,0.32530,0.25691,0.19706,0.17322,0.65651
CCLF_PEDS_0003_T,0.82129,0.39638,0.81042,0.39977,0.19645,0.23987,0.28208,0.12768,0.13989,0.11926,...,0.10992,0.16441,0.11698,0.32357,0.66248,0.34323,0.28625,0.19521,0.18436,0.45919


In [8]:
# Gene labels currently present in the mice methylation table (its row index).
mice_genes = mice_df.index

# A reference gene is missing only when neither its exact spelling nor its
# upper-cased spelling is present. Checking the exact spelling as well keeps
# this cell idempotent: once mis-cased labels have been renamed below, those
# genes must not be reported as missing on a re-run.
missing_genes = [
    gene
    for gene in meth_df.columns
    if gene not in mice_genes and gene.upper() not in mice_genes
]
print(f"there are {len(missing_genes)} missing genes")

# Genes that exist in the mice table only under their upper-cased spelling.
miscased_genes = [
    gene
    for gene in meth_df.columns
    if gene.upper() in mice_genes and gene not in mice_genes
]
print(f"there are {len(miscased_genes)} mis-cased genes")

# change mis-cased genes in the mice data:
# Build the mapping once and rename in a single call. If two reference genes
# share the same upper-cased spelling, the first one wins.
rename_map = {}
for gene in miscased_genes:
    rename_map.setdefault(gene.upper(), gene)
mice_df = mice_df.rename(index=rename_map)

there are 249 missing genes
there are 811 mis-cased genes


Fill the remaining missing genes with each gene's mean value across the reference methylation samples.

In [9]:
print(f"mice dataset is {mice_df.shape}")
mice_samples = mice_df.columns

# Skip genes that are already rows of mice_df, so re-running this cell does
# not append the same imputed rows twice and create duplicate gene labels.
genes_to_add = [gene for gene in missing_genes if gene not in mice_df.index]

if genes_to_add:
    # Per-gene mean over the reference samples, in the order of genes_to_add.
    gene_means = meth_df[genes_to_add].mean()

    # One constant row per gene: its reference mean repeated for every mice sample.
    new_rows_df = pd.DataFrame(
        np.repeat(gene_means.to_numpy()[:, None], len(mice_samples), axis=1),
        index=genes_to_add,
        columns=mice_samples,
    )

    # Concatenate all new rows at once
    mice_df = pd.concat([mice_df, new_rows_df], axis=0)

print(f"after insertions mice dataset is {mice_df.shape}")

mice dataset is (25865, 73)
after insertions mice dataset is (26114, 73)


In [11]:
# Transpose so the gene labels (row index of mice_df) become columns, then write to CSV.
mice_df.transpose().to_csv(
    'data/mice/mice_methylation_imputed.csv',
    index=True,
)

# Mice RNA - re-scale & impute missing genes with mean

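Prerequisite: run `rna_normalization.R` first. The next cell loads `data/mice_rna_normalized.csv`, which is expected to be that script's output.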

In [110]:
# Normalized mice RNA table; the first CSV column becomes the index.
mice_df = pd.read_csv(
    'data/mice_rna_normalized.csv',
    index_col=0,
)

# Reference RNA matrix; the first CSV column becomes the index.
rna_df = pd.read_csv(
    '../data/Cleveland/rna_imputed.csv',
    index_col=0,
)

# Transcriptomics table read from transcriptomics_used_by_MOSA.csv, used for comparison below.
mosa_df = pd.read_csv(
    'data/transcriptomics_used_by_MOSA.csv',
    index_col=0,
)

In [84]:
# Check if imputed RNA and (normalized) mice RNA are matching
for df in [mice_df, rna_df, mosa_df]:
    print(f"min: {df.min().min()}, max {df.max().max()} mean {df.mean().mean()} median {df.median().median()}")
    print(f"negative values: {df.lt(0).sum().sum()}")
    print(df.shape)

min: 1.17643791621576, max 15.2407734391336 mean 4.655333829072499 median 4.853046105867335
negative values: 0
(66, 14362)
min: -11.39522, max 17.14427 mean 3.18137413668156 median 4.273594999999999
negative values: 1567134
(498, 15278)
min: -12.39979, max 18.18559 mean 3.050093310831003 median 4.348235000000001
negative values: 3559507
(15320, 1072)


In [85]:
# Column labels of the reference RNA frame, written one per line.
values = list(rna_df.columns)

with open("data/rna_imputed_list_of_genes.txt", 'w') as output:
    output.writelines(str(gene) + '\n' for gene in values)

In [112]:
rna_gene_index = rna_df.columns
mice_gene_index = mice_df.columns

# Mice columns that also appear among the reference RNA columns.
a = [gene for gene in mice_gene_index if gene in rna_gene_index]
print(f"Mice genes matching RNA genes: {len(a)}")

# Reference RNA columns that have no counterpart in the mice table.
a = [gene for gene in rna_gene_index if gene not in mice_gene_index]
print(f"Missing genes wrt RNA: {len(a)}")

Mice genes matching RNA genes: 14362
Missing genes wrt RNA: 916


In [113]:
# Gene labels currently present in the mice RNA table (its columns).
mice_genes = mice_df.columns

# A reference gene is missing only when neither its exact spelling nor its
# upper-cased spelling is present. Genes that differ only in case are handled
# by the rename below and must not also be imputed as new columns.
missing_genes = [
    gene
    for gene in rna_df.columns
    if gene not in mice_genes and gene.upper() not in mice_genes
]
print(f"there are {len(missing_genes)} missing genes")

# Genes that exist in the mice table only under their upper-cased spelling.
miscased_genes = [
    gene
    for gene in rna_df.columns
    if gene.upper() in mice_genes and gene not in mice_genes
]
print(f"there are {len(miscased_genes)} mis-cased genes")

# change mis-cased genes in the mice data:
# DataFrame.rename takes `columns=`; the singular `column=` is not a valid
# keyword and raises TypeError. If two reference genes share the same
# upper-cased spelling, the first one wins.
rename_map = {}
for gene in miscased_genes:
    rename_map.setdefault(gene.upper(), gene)
mice_df = mice_df.rename(columns=rename_map)

there are 916 missing genes
there are 0 mis-cased genes


In [114]:
print(f"mice dataset is {mice_df.shape}")

# Skip genes that are already columns of mice_df, so re-running this cell does
# not append the same imputed columns twice and create duplicate gene labels.
genes_to_add = [gene for gene in missing_genes if gene not in mice_df.columns]

# Per-gene mean over the reference RNA samples.
means = {gene: float(rna_df[gene].mean()) for gene in genes_to_add}

if means:
    # Each scalar mean is broadcast down the mice sample index.
    new_cols_df = pd.DataFrame(means, index=mice_df.index)
    mice_df = pd.concat([mice_df, new_cols_df], axis=1)

print(f"after insertions mice dataset is {mice_df.shape}")


mice dataset is (66, 14362)
after insertions mice dataset is (66, 15278)


In [115]:
# Re-check the overlap after imputation: mice columns also present in the reference RNA frame.
a = []
for gene in mice_df.columns:
    if gene in rna_df.columns:
        a.append(gene)
print(f"Mice genes matching RNA genes: {len(a)}")

# Reference RNA columns still absent from the mice table.
a = []
for gene in rna_df.columns:
    if gene not in mice_df.columns:
        a.append(gene)
print(f"Missing genes wrt RNA: {len(a)}")

Mice genes matching RNA genes: 15278
Missing genes wrt RNA: 0


In [116]:
# Write the mice RNA table, including the imputed gene columns, to CSV with its index.
mice_df.to_csv(
    path_or_buf='data/mice/mice_rna_imputed.csv',
    index=True,
)