In [5]:
import pandas as pd
import numpy as np


In [183]:
def create_synthetic_data():
    """
    return qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df
    """
    # Parameters
    num_patients = 500
    num_genes = 300
    num_key_genes = 10  # Number of genes particularly relevant to the disease
    num_pathways_per_level = [150, 50, 20, 10]
    num_key_pathways = 15  # Number of pathways particularly relevant to the disease

    # Simulate gene expression data
    gene_expression = np.random.rand(num_patients, num_genes)

    # Introduce noise
    gene_expression_noise = np.random.normal(0, 2, gene_expression.shape)
    gene_expression_noisy = gene_expression + gene_expression_noise

    # Introduce correlated noise for a subset of genes
    correlated_genes_indices = np.random.choice(num_genes, 200, replace=False)
    correlated_noise = np.random.normal(0, 2, (num_patients, 1))
    gene_expression_noisy[:, correlated_genes_indices] += correlated_noise

    # Select key genes
    key_genes = np.random.choice(num_genes, num_key_genes, replace=False)

    pathways = {}
    key_pathways = set()
    parent_child_pairs = []

    for level in range(4):
        for i in range(num_pathways_per_level[level]):
            pathway_name = f'Level_{level}Pathway{i}'
            if level == 0:
                pathway_genes = np.random.choice(num_genes, size=np.random.randint(5, 10), replace=False)
            else:
                # Choose a random number of parents (for example, between 1 and 3)
                num_parents = np.random.randint(1, 3)
                for _ in range(num_parents):
                    parent_pathway = f'Level_{level-1}Pathway{np.random.randint(num_pathways_per_level[level-1])}'
                    parent_genes = pathways[parent_pathway]
                    num_genes_to_choose = min(len(parent_genes), np.random.randint(3, 10))
                    pathway_genes = np.random.choice(parent_genes, size=num_genes_to_choose, replace=False)

                    # Record each parent-child pair
                    parent_child_pairs.append((parent_pathway, pathway_name))

            pathways[pathway_name] = pathway_genes
            if i < num_key_pathways:
                key_pathways.add(pathway_name)

    # Creating DataFrame from parent-child pairs
    parent_child_df = pd.DataFrame(parent_child_pairs, columns=['parent', 'child'])



    # Now, parent_child_df contains the required data
    # Add noise pathways
    num_noise_pathways = 100
    for i in range(num_noise_pathways):
        pathways[f'Noise_Pathway_{i}'] = np.random.choice(num_genes, size=np.random.randint(5, 15), replace=False)

    def nonlinear_transform(expression):
        # Combining multiple mathematical functions for complexity
        return np.sin(expression * np.pi) * np.tanh(expression) + np.cos(expression * np.pi / 2) * np.exp(expression)

    # Function to simulate disease status with additional complexity
    """def simulate_disease_status(gene_expression, pathways, key_pathways):
        disease_status = []
        for patient in gene_expression:
            status = 0
            for pathway, genes in pathways.items():
                if 'Noise' not in pathway:
                    mean_expression = nonlinear_transform(np.mean(patient[genes]))
                    pathway_influence = 4 if pathway in key_pathways else 1
                    status += pathway_influence if mean_expression > 1 else 0

            # Adjust overall threshold for disease status
            disease_threshold = len(pathways) / 3  # Adjusted to a fixed value
            disease_status.append(1 if status > disease_threshold else 0)
            
        return disease_status"""
    def simulate_disease_status(gene_expression, pathways, key_pathways, key_genes):
        disease_status = []
        for patient in gene_expression:
            status = 0
            key_gene_expression = np.mean(patient[key_genes])
            
            for pathway, genes in pathways.items():
                if 'Noise' not in pathway:
                    mean_expression = nonlinear_transform(np.mean(patient[genes]))
                    pathway_influence = 4 if pathway in key_pathways else 1
                    status += pathway_influence if mean_expression > 1 else 0

            # Adjust overall threshold for disease status
            disease_threshold = len(pathways) / 3  # Adjusted to a fixed value

            # Influence of key genes
            key_gene_influence = 2 if key_gene_expression > 1 else -1
            status += key_gene_influence

            disease_status.append(1 if status > disease_threshold else 0)
            
        return disease_status

    # Generate disease status
    disease_status = simulate_disease_status(gene_expression_noisy, pathways, key_pathways, key_genes)
    disease_status = [status + 1 for status in disease_status]
    # Create DataFrame
    df = pd.DataFrame(gene_expression_noisy, columns=[f'Gene_{i}' for i in range(num_genes)])
    df['group'] = disease_status

    qm_matrix_synthetic = df.iloc[:, :-1]
    design_matrix_synthetic = df.iloc[:, [-1]]
    translations_synthetic = pd.DataFrame([(number, pathway) for pathway, numbers in pathways.items() for number in numbers], columns=['input', 'translation'])
    
    # transform qm matrix
    qm_matrix_synthetic = qm_matrix_synthetic.T
    qm_matrix_synthetic.reset_index(inplace=True)
    qm_matrix_synthetic.rename(columns={'index': 'Genes'}, inplace=True)
    qm_matrix_synthetic.columns = ['Protein'] + [f'Patient{i-1}' for i in range(1, len(qm_matrix_synthetic.columns))]
    # transform design_matrix
    design_matrix_synthetic = design_matrix_synthetic.reset_index()
    design_matrix_synthetic.rename(columns={'index': 'sample'}, inplace=True)
    design_matrix_synthetic['sample'] = design_matrix_synthetic['sample'].apply(lambda x: f'Patient{x}')
    # Transorm translation
    #translations_synthetic.rename(columns={'Number': 'Gene'}, inplace=True)
    translations_synthetic["input"] = translations_synthetic["input"].apply(lambda x: f'Gene_{x}')
    return qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df
    

# Gives the synthetic data
qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df = create_synthetic_data()

In [147]:
import numpy as np
import pandas as pd

def create_synthetic_data():
    """
    return qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df
    """
    # Parameters
    num_patients = 500
    num_genes = 300
    num_key_genes = 20  # Number of genes particularly relevant to the disease
    num_pathways_per_level = [150, 50, 20, 10]
    num_key_pathways = 20  # Number of pathways particularly relevant to the disease

    # Simulate gene expression data
    gene_expression = np.random.rand(num_patients, num_genes)
    #gene_expression_noisy = gene_expression

    # Introduce noise
    gene_expression_noise = np.random.normal(0, 2, gene_expression.shape)
    gene_expression_noisy = gene_expression + gene_expression_noise
    
    
    # Introduce correlated noise for a subset of genes
    correlated_genes_indices = np.random.choice(num_genes, 200, replace=False)
    correlated_noise = np.random.normal(0, 2, (num_patients, 1))
    gene_expression_noisy[:, correlated_genes_indices] += correlated_noise
    #gene_expression_noisy = np.where(gene_expression_noisy < 0, 0, gene_expression_noisy)

    # Select key genes
    key_genes = np.random.choice(num_genes, num_key_genes, replace=False)
    

    pathways = {}
    key_pathways = set()
    parent_child_pairs = []

    for level in range(4):
        for i in range(num_pathways_per_level[level]):
            pathway_name = f'Level_{level}Pathway{i}'
            if level == 0:
                pathway_genes = np.random.choice(num_genes, size=np.random.randint(5, 10), replace=False)
                pathways[pathway_name] = pathway_genes
            else:
                # Choose a random number of parents (for example, between 1 and 3)
                num_parents = np.random.randint(1, 3)
                for _ in range(num_parents):
                    parent_pathway = f'Level_{level-1}Pathway{np.random.randint(num_pathways_per_level[level-1])}'
                    parent_genes = pathways[parent_pathway]
                    num_genes_to_choose = min(len(parent_genes), np.random.randint(3, 10))
                    pathway_genes = np.random.choice(parent_genes, size=num_genes_to_choose, replace=False)

                    # Record each parent-child pair
                    parent_child_pairs.append((parent_pathway, pathway_name))

            pathways[pathway_name] = pathway_genes
            if i < num_key_pathways:
                key_pathways.add(pathway_name)

    # Creating DataFrame from parent-child pairs
    parent_child_df = pd.DataFrame(parent_child_pairs, columns=['parent', 'child'])



    # Now, parent_child_df contains the required data
    # Add noise pathways
    num_noise_pathways = 100
    for i in range(num_noise_pathways):
        pathways[f'Noise_Pathway_{i}'] = np.random.choice(num_genes, size=np.random.randint(5, 15), replace=False)

    def nonlinear_transform(expression):
        # Combining multiple mathematical functions for complexity
        return np.sin(expression * np.pi) * np.tanh(expression) + np.cos(expression * np.pi / 2) * np.exp(expression)

    # Function to simulate disease status with additional complexity
    def simulate_disease_status(gene_expression, pathways, key_pathways, key_genes):
        disease_status = []
        for patient in gene_expression:
            status = 0
            for pathway, genes in pathways.items():
                if 'Noise' not in pathway:
                    mean_expression = nonlinear_transform(np.mean(patient[key_genes]))
                    #print(mean_expression)
                    pathway_influence = 1 if pathway in key_pathways else 0
                    status += pathway_influence if mean_expression > 1 else 0
            print(status)        
            key_gene_expression = nonlinear_transform(np.mean(patient[key_genes]))
            key_gene_influence = 50 if key_gene_expression > 1 else -1
            # Adjust overall threshold for disease status
            disease_threshold = len(pathways) / 3  # Adjusted to a fixed value
            # Influence of key genes
            
            status += key_gene_influence
            #print(status)
            #print(status)
            disease_status.append(1 if status > disease_threshold else 0)
            
        return disease_status

    # Generate disease status
    disease_status = simulate_disease_status(gene_expression_noisy, pathways, key_pathways, key_genes)


    ######################## All good below ########################
    disease_status = [status + 1 for status in disease_status]
    # Create DataFrame
    df = pd.DataFrame(gene_expression_noisy, columns=[f'Gene_{i}' for i in range(num_genes)])
    df['group'] = disease_status

    qm_matrix_synthetic = df.iloc[:, :-1]
    design_matrix_synthetic = df.iloc[:, [-1]]
    translations_synthetic = pd.DataFrame([(number, pathway) for pathway, numbers in pathways.items() for number in numbers], columns=['input', 'translation'])
    
    # transform qm matrix
    qm_matrix_synthetic = qm_matrix_synthetic.T
    qm_matrix_synthetic.reset_index(inplace=True)
    qm_matrix_synthetic.rename(columns={'index': 'Genes'}, inplace=True)
    qm_matrix_synthetic.columns = ['Protein'] + [f'Patient{i-1}' for i in range(1, len(qm_matrix_synthetic.columns))]
    # transform design_matrix
    design_matrix_synthetic = design_matrix_synthetic.reset_index()
    design_matrix_synthetic.rename(columns={'index': 'sample'}, inplace=True)
    design_matrix_synthetic['sample'] = design_matrix_synthetic['sample'].apply(lambda x: f'Patient{x}')
    # Transorm translation
    #translations_synthetic.rename(columns={'Number': 'Gene'}, inplace=True)
    translations_synthetic["input"] = translations_synthetic["input"].apply(lambda x: f'Gene_{x}')
    return pathways,gene_expression_noisy,qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df,key_genes,key_pathways

In [149]:
pathways,gene_expression_noisy,qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df,key_genes,key_pathways = create_synthetic_data()


0
0
0
0
70
0
0
70
0
0
70
0
0
0
0
70
0
0
70
0
0
70
0
0
0
70
0
70
0
0
70
0
0
0
0
70
0
0
0
70
0
70
0
0
0
0
0
0
70
0
0
0
0
0
0
0
0
0
0
0
70
0
0
0
70
0
70
70
0
0
70
70
70
0
70
0
0
70
0
0
0
70
70
0
0
0
0
0
0
0
0
0
0
0
70
70
0
0
0
0
0
70
0
0
0
0
70
0
70
0
0
70
70
0
0
0
0
70
0
0
0
0
0
0
0
0
70
0
0
0
0
70
0
70
0
0
0
70
0
70
70
0
70
0
70
0
0
70
0
0
0
0
70
0
0
0
0
0
0
70
0
0
0
0
0
70
0
70
0
0
70
0
70
70
0
0
70
0
0
0
0
0
0
70
0
0
70
70
0
70
70
0
0
70
70
0
70
70
0
0
70
0
0
0
0
70
70
70
0
70
0
0
0
0
70
70
0
0
0
0
0
70
0
0
0
0
70
0
70
0
0
0
0
70
0
0
0
70
0
70
70
70
70
0
0
70
0
0
70
0
0
70
70
70
70
0
0
0
0
0
0
0
0
0
0
0
0
70
0
70
0
70
70
70
0
0
0
0
70
0
0
0
0
0
0
70
0
0
0
0
0
0
0
0
0
0
0
70
0
0
0
0
70
70
70
70
70
0
0
0
0
70
70
0
70
0
0
0
0
70
0
0
0
0
70
70
0
70
0
0
70
0
0
0
0
0
0
0
0
70
0
0
0
0
0
0
0
70
0
0
0
0
0
0
70
0
0
0
0
70
0
0
0
0
0
0
0
0
0
0
70
0
70
0
0
70
0
0
70
0
0
0
0
70
0
0
0
0
0
0
0
0
70
0
70
70
70
70
0
0
0
70
0
0
70
0
0
0
70
70
0
0
0
0
0
70
70
70
0
70
70
0
0
0
0
0
0
70
0
70
0
0
0
0
0
0
0


In [151]:
def nonlinear_transform(expression):
    # Combining multiple mathematical functions for complexity
    return np.sin(expression * np.pi) * np.tanh(expression) + np.cos(expression * np.pi / 2) * np.exp(expression)

disease_status = []
for patient in gene_expression_noisy:
    status = 0
    for pathway, genes in pathways.items():
        if 'Noise' not in pathway:
            print
            #mean_expression = nonlinear_transform(np.mean(patient[key_genes]))
            # print(mean_expression)

0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383096127
0.8933106383

## Old stuff

In [2]:
import numpy as np
import pandas as pd

import numpy as np
import pandas as pd

def create_synthetic_data():
    """
    return qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df
    """
    # Parameters
    num_patients = 500
    num_genes = 300
    num_key_genes = 20  # Number of genes particularly relevant to the disease
    num_pathways_per_level = [150, 50, 20, 10]
    num_key_pathways = 20  # Number of pathways particularly relevant to the disease

    # Simulate gene expression data
    gene_expression = np.random.rand(num_patients, num_genes)
    #gene_expression_noisy = gene_expression

    # Introduce noise
    gene_expression_noise = np.random.normal(0, 2, gene_expression.shape)
    gene_expression_noisy = gene_expression + gene_expression_noise
    
    
    # Introduce correlated noise for a subset of genes
    correlated_genes_indices = np.random.choice(num_genes, 200, replace=False)
    correlated_noise = np.random.normal(0, 2, (num_patients, 1))
    gene_expression_noisy[:, correlated_genes_indices] += correlated_noise
    #gene_expression_noisy = np.where(gene_expression_noisy < 0, 0, gene_expression_noisy)

    # Select key genes
    key_genes = np.random.choice(num_genes, num_key_genes, replace=False)
    
    for key_gene in key_genes:
        gene_expression_noisy[:, key_gene] *= 2

    pathways = {}
    key_pathways = set()
    parent_child_pairs = []

    for level in range(4):
        for i in range(num_pathways_per_level[level]):
            pathway_name = f'Level_{level}Pathway{i}'
            if level == 0:
                pathway_genes = np.random.choice(num_genes, size=np.random.randint(5, 10), replace=False)
                pathways[pathway_name] = pathway_genes
            else:
                # Choose a random number of parents (for example, between 1 and 3)
                num_parents = np.random.randint(1, 3)
                for _ in range(num_parents):
                    parent_pathway = f'Level_{level-1}Pathway{np.random.randint(num_pathways_per_level[level-1])}'
                    parent_genes = pathways[parent_pathway]
                    num_genes_to_choose = min(len(parent_genes), np.random.randint(3, 10))
                    pathway_genes = np.random.choice(parent_genes, size=num_genes_to_choose, replace=False)

                    # Record each parent-child pair
                    parent_child_pairs.append((parent_pathway, pathway_name))

            pathways[pathway_name] = pathway_genes
            if i < num_key_pathways:
                key_pathways.add(pathway_name)

    # Creating DataFrame from parent-child pairs
    parent_child_df = pd.DataFrame(parent_child_pairs, columns=['parent', 'child'])



    # Now, parent_child_df contains the required data
    # Add noise pathways
    num_noise_pathways = 100
    for i in range(num_noise_pathways):
        pathways[f'Noise_Pathway_{i}'] = np.random.choice(num_genes, size=np.random.randint(5, 15), replace=False)

    def nonlinear_transform(expression):
        # Combining multiple mathematical functions for complexity
        return np.sin(expression * np.pi) * np.tanh(expression) + np.cos(expression * np.pi / 2) * np.exp(expression)

    # Function to simulate disease status with additional complexity
    def simulate_disease_status(gene_expression, pathways, key_pathways, key_genes):
        disease_status = []
        for patient in gene_expression:
            status = 0
            for pathway, genes in pathways.items():
                if 'Noise' not in pathway:
                    mean_expression = nonlinear_transform(np.mean(patient[genes]))
                    #print(mean_expression)
                    pathway_influence = 2 if pathway in key_pathways else 1
                    status += pathway_influence if mean_expression > 1 else 0

            key_gene_expression = nonlinear_transform(np.mean(patient[key_genes]))
            key_gene_influence = 50 if key_gene_expression > 1 else -1
            # Adjust overall threshold for disease status
            disease_threshold = len(pathways) / 3  # Adjusted to a fixed value
            # Influence of key genes
            
            status += key_gene_influence
            #print(status)
            print(status)
            disease_status.append(1 if status > disease_threshold else 0)
            
        return disease_status

    # Generate disease status
    disease_status = simulate_disease_status(gene_expression_noisy, pathways, key_pathways, key_genes)


    ######################## All good below ########################
    disease_status = [status + 1 for status in disease_status]
    # Create DataFrame
    df = pd.DataFrame(gene_expression_noisy, columns=[f'Gene_{i}' for i in range(num_genes)])
    df['group'] = disease_status

    qm_matrix_synthetic = df.iloc[:, :-1]
    design_matrix_synthetic = df.iloc[:, [-1]]
    translations_synthetic = pd.DataFrame([(number, pathway) for pathway, numbers in pathways.items() for number in numbers], columns=['input', 'translation'])
    
    # transform qm matrix
    qm_matrix_synthetic = qm_matrix_synthetic.T
    qm_matrix_synthetic.reset_index(inplace=True)
    qm_matrix_synthetic.rename(columns={'index': 'Genes'}, inplace=True)
    qm_matrix_synthetic.columns = ['Protein'] + [f'Patient{i-1}' for i in range(1, len(qm_matrix_synthetic.columns))]
    # transform design_matrix
    design_matrix_synthetic = design_matrix_synthetic.reset_index()
    design_matrix_synthetic.rename(columns={'index': 'sample'}, inplace=True)
    design_matrix_synthetic['sample'] = design_matrix_synthetic['sample'].apply(lambda x: f'Patient{x}')
    # Transorm translation
    #translations_synthetic.rename(columns={'Number': 'Gene'}, inplace=True)
    translations_synthetic["input"] = translations_synthetic["input"].apply(lambda x: f'Gene_{x}')
    return qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df,key_genes,key_pathways
    


qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df,key_genes,key_pathways = create_synthetic_data()




62
108
84
56
66
101
41
151
87
79
114
57
134
120
103
28
108
140
122
80
78
165
132
73
106
83
9
17
127
112
12
94
43
79
49
117
20
117
111
73
121
133
178
95
8
109
210
14
62
164
49
144
88
146
8
49
162
85
15
16
115
101
166
134
137
136
78
4
104
171
133
159
134
29
140
47
100
74
42
94
8
91
94
145
116
7
117
46
17
119
91
28
71
85
62
75
123
108
153
130
58
122
172
139
89
34
119
114
32
138
160
86
159
72
129
41
123
135
33
32
98
66
85
45
25
99
171
50
158
108
79
145
40
20
142
85
136
113
10
42
119
92
84
70
116
128
90
138
112
144
75
77
152
93
138
136
98
140
12
124
55
79
119
169
123
91
161
192
87
118
117
101
83
12
86
71
51
94
67
133
61
90
59
122
155
95
128
119
70
112
198
104
91
116
108
4
182
18
110
107
161
37
87
221
125
8
61
83
117
132
78
81
6
139
110
13
16
92
142
83
119
114
94
61
155
112
104
63
105
92
142
135
39
35
80
28
23
81
61
68
124
123
71
117
101
130
22
24
31
130
57
47
22
72
91
14
77
70
35
55
54
204
76
72
124
105
102
126
100
18
55
92
30
120
80
77
83
59
52
89
26
27
79
138
45
59
156
111
120
81
77
68
10

In [107]:
design_matrix_synthetic.value_counts("group")

group
1    322
2    178
Name: count, dtype: int64

In [80]:
gene_expression = np.random.rand(200, 1)
gene_expression/3

array([[0.30826135],
       [0.15346898],
       [0.20668869],
       [0.10059676],
       [0.29446391],
       [0.16127561],
       [0.04158132],
       [0.14254473],
       [0.29797364],
       [0.30030395],
       [0.03908261],
       [0.12594429],
       [0.19658462],
       [0.1427752 ],
       [0.18303369],
       [0.12083638],
       [0.29238913],
       [0.05785393],
       [0.2544203 ],
       [0.26665771],
       [0.17444251],
       [0.04475809],
       [0.28025968],
       [0.01305873],
       [0.29801504],
       [0.18702642],
       [0.11312986],
       [0.07629123],
       [0.11254936],
       [0.10679086],
       [0.15023623],
       [0.04885244],
       [0.32065511],
       [0.314664  ],
       [0.06094014],
       [0.28945995],
       [0.2514855 ],
       [0.12864801],
       [0.21782928],
       [0.01898965],
       [0.03122482],
       [0.2906372 ],
       [0.04303685],
       [0.22508135],
       [0.22701986],
       [0.17849904],
       [0.26939584],
       [0.068

In [93]:
import numpy as np
import pandas as pd

def create_synthetic_data():
    """
    return qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df
    """
    # Parameters
    num_patients = 500
    num_genes = 300
    num_key_genes = 20  # Number of genes particularly relevant to the disease
    num_pathways_per_level = [150, 50, 20, 10]
    num_key_pathways = 20  # Number of pathways particularly relevant to the disease

    # Simulate gene expression data
    gene_expression = np.random.rand(num_patients, num_genes)
    #gene_expression_noisy = gene_expression

    # Introduce noise
    gene_expression_noise = np.random.normal(0, 2, gene_expression.shape)
    gene_expression_noisy = gene_expression + gene_expression_noise
    
    
    # Introduce correlated noise for a subset of genes
    correlated_genes_indices = np.random.choice(num_genes, 200, replace=False)
    correlated_noise = np.random.normal(0, 2, (num_patients, 1))
    gene_expression_noisy[:, correlated_genes_indices] += correlated_noise
    gene_expression_noisy = np.where(gene_expression_noisy < 0, 0, gene_expression_noisy)

    # Select key genes
    key_genes = np.random.choice(num_genes, num_key_genes, replace=False)
    
    for key_gene in key_genes:
        gene_expression_noisy[:, key_gene] *= 2

    pathways = {}
    key_pathways = set()
    parent_child_pairs = []

    for level in range(4):
        for i in range(num_pathways_per_level[level]):
            pathway_name = f'Level_{level}Pathway{i}'
            if level == 0:
                pathway_genes = np.random.choice(num_genes, size=np.random.randint(5, 10), replace=False)
                pathways[pathway_name] = pathway_genes
            else:
                # Choose a random number of parents (for example, between 1 and 3)
                num_parents = np.random.randint(1, 3)
                for _ in range(num_parents):
                    parent_pathway = f'Level_{level-1}Pathway{np.random.randint(num_pathways_per_level[level-1])}'
                    parent_genes = pathways[parent_pathway]
                    num_genes_to_choose = min(len(parent_genes), np.random.randint(3, 10))
                    pathway_genes = np.random.choice(parent_genes, size=num_genes_to_choose, replace=False)

                    # Record each parent-child pair
                    parent_child_pairs.append((parent_pathway, pathway_name))

            pathways[pathway_name] = pathway_genes
            if i < num_key_pathways:
                key_pathways.add(pathway_name)

    # Creating DataFrame from parent-child pairs
    parent_child_df = pd.DataFrame(parent_child_pairs, columns=['parent', 'child'])



    # Now, parent_child_df contains the required data
    # Add noise pathways
    num_noise_pathways = 100
    for i in range(num_noise_pathways):
        pathways[f'Noise_Pathway_{i}'] = np.random.choice(num_genes, size=np.random.randint(5, 15), replace=False)

    def nonlinear_transform(expression):
        # Combining multiple mathematical functions for complexity
        return np.sin(expression * np.pi) * np.tanh(expression) + np.cos(expression * np.pi / 2) * np.exp(expression)

    # Function to simulate disease status with additional complexity
    def simulate_disease_status(gene_expression, pathways, key_pathways, key_genes):
        disease_status = []
        for patient in gene_expression:
            status = 0
            for pathway, genes in pathways.items():
                if 'Noise' not in pathway:
                    mean_expression = nonlinear_transform(np.mean(patient[genes]))
                    #print(mean_expression)
                    pathway_influence = 2 if pathway in key_pathways else 1
                    status += pathway_influence if mean_expression > 1 else 0

            key_gene_expression = nonlinear_transform(np.mean(patient[key_genes]))
            key_gene_influence = 30 if key_gene_expression > 1 else -1
            # Adjust overall threshold for disease status
            disease_threshold = len(pathways) / 3  # Adjusted to a fixed value
            # Influence of key genes
            
            status += key_gene_influence
            #print(status)
            disease_status.append(1 if status > disease_threshold else 0)
            
        return disease_status

    # Generate disease status
    disease_status = simulate_disease_status(gene_expression_noisy, pathways, key_pathways, key_genes)


    ######################## All good below ########################
    disease_status = [status + 1 for status in disease_status]
    # Create DataFrame
    df = pd.DataFrame(gene_expression_noisy, columns=[f'Gene_{i}' for i in range(num_genes)])
    df['group'] = disease_status

    qm_matrix_synthetic = df.iloc[:, :-1]
    design_matrix_synthetic = df.iloc[:, [-1]]
    translations_synthetic = pd.DataFrame([(number, pathway) for pathway, numbers in pathways.items() for number in numbers], columns=['input', 'translation'])
    
    # transform qm matrix
    qm_matrix_synthetic = qm_matrix_synthetic.T
    qm_matrix_synthetic.reset_index(inplace=True)
    qm_matrix_synthetic.rename(columns={'index': 'Genes'}, inplace=True)
    qm_matrix_synthetic.columns = ['Protein'] + [f'Patient{i-1}' for i in range(1, len(qm_matrix_synthetic.columns))]
    # transform design_matrix
    design_matrix_synthetic = design_matrix_synthetic.reset_index()
    design_matrix_synthetic.rename(columns={'index': 'sample'}, inplace=True)
    design_matrix_synthetic['sample'] = design_matrix_synthetic['sample'].apply(lambda x: f'Patient{x}')
    # Transorm translation
    #translations_synthetic.rename(columns={'Number': 'Gene'}, inplace=True)
    translations_synthetic["input"] = translations_synthetic["input"].apply(lambda x: f'Gene_{x}')
    return qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df,key_genes,key_pathways
    


qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df,key_genes,key_pathways = create_synthetic_data()


In [94]:
design_matrix_synthetic.value_counts("group")

group
2    274
1    226
Name: count, dtype: int64

In [68]:
np.sin(3 * np.pi) * np.tanh(4) #+ np.cos(expression * np.pi / 2) * np.exp(expression)

3.671476284658843e-16

In [19]:
key_genes = np.random.choice(200, 15, replace=False)
key_genes

array([ 27, 133,  48,  56,   0,  17,  52,  18, 121,  25, 171, 150, 127,
        66,  16])

In [14]:
import numpy as np

# Counting the number of zeros in each column of qm_matrix_synthetic
zero_counts = (qm_matrix_synthetic == 0).sum()

# If you want the total number of zeros in the entire DataFrame
total_zeros = np.sum(zero_counts)

# Printing the result
print("Total number of zeros in qm_matrix_synthetic:", total_zeros)
300 * 501 

Total number of zeros in qm_matrix_synthetic: 62196


150300

In [None]:
import numpy as np
import pandas as pd
def create_synthetic_data():
    """
    return qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df
    """
    # Parameters
    num_patients = 500
    num_genes = 300
    num_key_genes = 10  # Number of genes particularly relevant to the disease
    num_pathways_per_level = [150, 50, 20, 10]
    num_key_pathways = 15  # Number of pathways particularly relevant to the disease

    # Simulate gene expression data
    gene_expression = np.random.rand(num_patients, num_genes)

    # Introduce noise
    gene_expression_noise = np.random.normal(0, 2, gene_expression.shape)
    gene_expression_noisy = gene_expression + gene_expression_noise

    # Introduce correlated noise for a subset of genes
    correlated_genes_indices = np.random.choice(num_genes, 200, replace=False)
    correlated_noise = np.random.normal(0, 2, (num_patients, 1))
    gene_expression_noisy[:, correlated_genes_indices] += correlated_noise

    # Select key genes
    key_genes = np.random.choice(num_genes, num_key_genes, replace=False)

    pathways = {}
    key_pathways = set()
    parent_child_pairs = []

    for level in range(4):
        for i in range(num_pathways_per_level[level]):
            pathway_name = f'Level_{level}Pathway{i}'
            if level == 0:
                pathway_genes = np.random.choice(num_genes, size=np.random.randint(5, 10), replace=False)
            else:
                # Choose a random number of parents (for example, between 1 and 3)
                num_parents = np.random.randint(1, 3)
                for _ in range(num_parents):
                    parent_pathway = f'Level_{level-1}Pathway{np.random.randint(num_pathways_per_level[level-1])}'
                    parent_genes = pathways[parent_pathway]
                    num_genes_to_choose = min(len(parent_genes), np.random.randint(3, 10))
                    pathway_genes = np.random.choice(parent_genes, size=num_genes_to_choose, replace=False)

                    # Record each parent-child pair
                    parent_child_pairs.append((parent_pathway, pathway_name))

            pathways[pathway_name] = pathway_genes
            if i < num_key_pathways:
                key_pathways.add(pathway_name)

    # Creating DataFrame from parent-child pairs
    parent_child_df = pd.DataFrame(parent_child_pairs, columns=['parent', 'child'])



    # Now, parent_child_df contains the required data
    # Add noise pathways
    num_noise_pathways = 100
    for i in range(num_noise_pathways):
        pathways[f'Noise_Pathway_{i}'] = np.random.choice(num_genes, size=np.random.randint(5, 15), replace=False)

    def nonlinear_transform(expression):
        # Combining multiple mathematical functions for complexity
        return np.sin(expression * np.pi) * np.tanh(expression) + np.cos(expression * np.pi / 2) * np.exp(expression)

    # Function to simulate disease status with additional complexity
    def simulate_disease_status(gene_expression, pathways, key_pathways):
        disease_status = []
        for patient in gene_expression:
            status = 0
            for pathway, genes in pathways.items():
                if 'Noise' not in pathway:
                    mean_expression = nonlinear_transform(np.mean(patient[genes]))
                    pathway_influence = 4 if pathway in key_pathways else 1
                    status += pathway_influence if mean_expression > 1 else 0

            # Adjust overall threshold for disease status
            disease_threshold = len(pathways) / 3  # Adjusted to a fixed value
            disease_status.append(1 if status > disease_threshold else 0)
        return disease_status


    # Generate disease status
    disease_status = simulate_disease_status(gene_expression_noisy, pathways, key_pathways)

    # Create DataFrame
    df = pd.DataFrame(gene_expression_noisy, columns=[f'Gene_{i}' for i in range(num_genes)])
    df['Disease_Status'] = disease_status

    qm_matrix_synthetic = df.iloc[:, :-1]
    design_matrix_synthetic = df.iloc[:, [-1]]
    translations_synthetic = pd.DataFrame([(number, pathway) for pathway, numbers in pathways.items() for number in numbers], columns=['Number', 'Pathway'])
    
    # transform qm matrix
    qm_matrix_synthetic = qm_matrix_synthetic.T
    qm_matrix_synthetic.reset_index(inplace=True)
    qm_matrix_synthetic.rename(columns={'index': 'Genes'}, inplace=True)
    qm_matrix_synthetic.columns = ['Genes'] + [f'Patient{i-1}' for i in range(1, len(qm_matrix_synthetic.columns))]
    # transform design_matrix
    design_matrix_synthetic = design_matrix_synthetic.reset_index()
    design_matrix_synthetic.rename(columns={'index': 'Patient'}, inplace=True)
    design_matrix_synthetic['Patient'] = design_matrix_synthetic['Patient'].apply(lambda x: f'Patient{x}')
    # Transorm translation
    translations_synthetic.rename(columns={'Number': 'Gene'}, inplace=True)
    translations_synthetic["Gene"] = translations_synthetic["Gene"].apply(lambda x: f'Gene_{x}')
    return qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df


"""
Old code
# Create hierarchical pathways
    pathways = {}
    key_pathways = set()
    for level in range(4):
        for i in range(num_pathways_per_level[level]):
            pathway_name = f'Level_{level}Pathway{i}'
            if level == 0:
                pathway_genes = np.random.choice(num_genes, size=np.random.randint(5, 10), replace=False)
            else:
                parent_pathway = f'Level_{level-1}Pathway{np.random.randint(num_pathways_per_level[level-1])}'
                parent_genes = pathways[parent_pathway]
                num_genes_to_choose = min(len(parent_genes), np.random.randint(3, 10))
                pathway_genes = np.random.choice(parent_genes, size=num_genes_to_choose, replace=False)

            pathways[pathway_name] = pathway_genes
            if i < num_key_pathways:
                key_pathways.add(pathway_name)
            """

    """pathways = {}
    key_pathways = set()
    parent_child_pairs = []

    for level in range(4):
        for i in range(num_pathways_per_level[level]):
            pathway_name = f'Level_{level}Pathway{i}'
            if level == 0:
                pathway_genes = np.random.choice(num_genes, size=np.random.randint(5, 10), replace=False)
            else:
                parent_pathway = f'Level_{level-1}Pathway{np.random.randint(num_pathways_per_level[level-1])}'
                parent_genes = pathways[parent_pathway]
                num_genes_to_choose = min(len(parent_genes), np.random.randint(3, 10))
                pathway_genes = np.random.choice(parent_genes, size=num_genes_to_choose, replace=False)

                # Record the parent-child pair
                parent_child_pairs.append((parent_pathway, pathway_name))

            pathways[pathway_name] = pathway_genes
            if i < num_key_pathways:
                key_pathways.add(pathway_name)



    # Creating DataFrame from parent-child pairs
    parent_child_df = pd.DataFrame(parent_child_pairs, columns=['Parent', 'Child'])"""

In [None]:
# Parameters
num_patients = 500
num_genes = 300
num_key_genes = 10  # Number of genes particularly relevant to the disease
num_pathways_per_level = [150, 50, 20, 10]
num_key_pathways = 15  # Number of pathways particularly relevant to the disease

# Simulate gene expression data
gene_expression = np.random.rand(num_patients, num_genes)

# Introduce noise
gene_expression_noise = np.random.normal(0, 2, gene_expression.shape)
gene_expression_noisy = gene_expression + gene_expression_noise

# Introduce correlated noise for a subset of genes
correlated_genes_indices = np.random.choice(num_genes, 200, replace=False)
correlated_noise = np.random.normal(0, 2, (num_patients, 1))
gene_expression_noisy[:, correlated_genes_indices] += correlated_noise

# Select key genes
key_genes = np.random.choice(num_genes, num_key_genes, replace=False)

# Create hierarchical pathways
pathways = {}
key_pathways = set()
for level in range(4):
    for i in range(num_pathways_per_level[level]):
        pathway_name = f'Level_{level}Pathway{i}'
        if level == 0:
            pathway_genes = np.random.choice(num_genes, size=np.random.randint(5, 10), replace=False)
        else:
            parent_pathway = f'Level_{level-1}Pathway{np.random.randint(num_pathways_per_level[level-1])}'
            parent_genes = pathways[parent_pathway]
            num_genes_to_choose = min(len(parent_genes), np.random.randint(3, 10))
            pathway_genes = np.random.choice(parent_genes, size=num_genes_to_choose, replace=False)

        pathways[pathway_name] = pathway_genes
        if i < num_key_pathways:
            key_pathways.add(pathway_name)

# Add noise pathways
num_noise_pathways = 100
for i in range(num_noise_pathways):
    pathways[f'Noise_Pathway_{i}'] = np.random.choice(num_genes, size=np.random.randint(5, 15), replace=False)

def nonlinear_transform(expression):
    # Combining multiple mathematical functions for complexity
    return np.sin(expression * np.pi) * np.tanh(expression) + np.cos(expression * np.pi / 2) * np.exp(expression)

# Function to simulate disease status with additional complexity
def simulate_disease_status(gene_expression, pathways, key_pathways):
    disease_status = []
    for patient in gene_expression:
        status = 0
        for pathway, genes in pathways.items():
            if 'Noise' not in pathway:
                mean_expression = nonlinear_transform(np.mean(patient[genes]))
                pathway_influence = 4 if pathway in key_pathways else 1
                status += pathway_influence if mean_expression > 1 else 0

        # Adjust overall threshold for disease status
        disease_threshold = len(pathways) / 3  # Adjusted to a fixed value
        disease_status.append(1 if status > disease_threshold else 0)
    return disease_status


# Generate disease status
disease_status = simulate_disease_status(gene_expression_noisy, pathways, key_pathways)

# Create DataFrame
df = pd.DataFrame(gene_expression_noisy, columns=[f'Gene_{i}' for i in range(num_genes)])
df['Disease_Status'] = disease_status

In [None]:
def create_synthetic_data():
    """
    return qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df
    """
    # Parameters
    num_patients = 500
    num_genes = 300
    num_key_genes = 30  # Number of genes particularly relevant to the disease
    num_pathways_per_level = [150, 50, 20, 10]
    num_key_pathways = 15  # Number of pathways particularly relevant to the disease

    # Simulate gene expression data
    gene_expression = np.random.rand(num_patients, num_genes)

    # Introduce noise
    gene_expression_noise = np.random.normal(0, 2, gene_expression.shape)
    gene_expression_noisy = gene_expression + gene_expression_noise
    
    
    # Introduce correlated noise for a subset of genes
    correlated_genes_indices = np.random.choice(num_genes, 200, replace=False)
    correlated_noise = np.random.normal(0, 2, (num_patients, 1))
    gene_expression_noisy[:, correlated_genes_indices] += correlated_noise
    gene_expression_noisy = np.where(gene_expression_noisy < 0, 0, gene_expression_noisy)
    # Select key genes
    key_genes = np.random.choice(num_genes, num_key_genes, replace=False)
    
    for key_gene in key_genes:
        gene_expression_noisy[:, key_gene] *= 2

    pathways = {}
    key_pathways = set()
    parent_child_pairs = []

    for level in range(4):
        for i in range(num_pathways_per_level[level]):
            pathway_name = f'Level_{level}Pathway{i}'
            if level == 0:
                pathway_genes = np.random.choice(num_genes, size=np.random.randint(5, 10), replace=False)
            else:
                # Choose a random number of parents (for example, between 1 and 3)
                num_parents = np.random.randint(1, 3)
                for _ in range(num_parents):
                    parent_pathway = f'Level_{level-1}Pathway{np.random.randint(num_pathways_per_level[level-1])}'
                    parent_genes = pathways[parent_pathway]
                    num_genes_to_choose = min(len(parent_genes), np.random.randint(3, 10))
                    pathway_genes = np.random.choice(parent_genes, size=num_genes_to_choose, replace=False)

                    # Record each parent-child pair
                    parent_child_pairs.append((parent_pathway, pathway_name))

            pathways[pathway_name] = pathway_genes
            if i < num_key_pathways:
                key_pathways.add(pathway_name)

    # Creating DataFrame from parent-child pairs
    parent_child_df = pd.DataFrame(parent_child_pairs, columns=['parent', 'child'])



    # Now, parent_child_df contains the required data
    # Add noise pathways
    num_noise_pathways = 100
    for i in range(num_noise_pathways):
        pathways[f'Noise_Pathway_{i}'] = np.random.choice(num_genes, size=np.random.randint(5, 15), replace=False)

    def nonlinear_transform(expression):
        # Combining multiple mathematical functions for complexity
        return np.sin(expression * np.pi) * np.tanh(expression) + np.cos(expression * np.pi / 2) * np.exp(expression)

    # Function to simulate disease status with additional complexity
    """def simulate_disease_status(gene_expression, pathways, key_pathways):
        disease_status = []
        for patient in gene_expression:
            status = 0
            for pathway, genes in pathways.items():
                if 'Noise' not in pathway:
                    mean_expression = nonlinear_transform(np.mean(patient[genes]))
                    pathway_influence = 4 if pathway in key_pathways else 1
                    status += pathway_influence if mean_expression > 1 else 0

            # Adjust overall threshold for disease status
            disease_threshold = len(pathways) / 3  # Adjusted to a fixed value
            disease_status.append(1 if status > disease_threshold else 0)
            
        return disease_status"""
    def simulate_disease_status(gene_expression, pathways, key_pathways, key_genes):
        disease_status = []
        for patient in gene_expression:
            status = 0
            key_gene_expression = np.mean(patient[key_genes])
            
            for pathway, genes in pathways.items():
                if 'Noise' not in pathway:
                    mean_expression = nonlinear_transform(np.mean(patient[genes]))
                    pathway_influence = 4 if pathway in key_pathways else 1
                    status += pathway_influence if mean_expression > 1 else 0

            # Adjust overall threshold for disease status
            disease_threshold = len(pathways) / 3  # Adjusted to a fixed value

            # Influence of key genes
            key_gene_influence = 2 if key_gene_expression > 1 else -1
            status += key_gene_influence

            disease_status.append(1 if status > disease_threshold else 0)
            
        return disease_status

    # Generate disease status
    disease_status = simulate_disease_status(gene_expression_noisy, pathways, key_pathways, key_genes)
    disease_status = [status + 1 for status in disease_status]
    # Create DataFrame
    df = pd.DataFrame(gene_expression_noisy, columns=[f'Gene_{i}' for i in range(num_genes)])
    df['group'] = disease_status

    qm_matrix_synthetic = df.iloc[:, :-1]
    design_matrix_synthetic = df.iloc[:, [-1]]
    translations_synthetic = pd.DataFrame([(number, pathway) for pathway, numbers in pathways.items() for number in numbers], columns=['input', 'translation'])
    
    # transform qm matrix
    qm_matrix_synthetic = qm_matrix_synthetic.T
    qm_matrix_synthetic.reset_index(inplace=True)
    qm_matrix_synthetic.rename(columns={'index': 'Genes'}, inplace=True)
    qm_matrix_synthetic.columns = ['Protein'] + [f'Patient{i-1}' for i in range(1, len(qm_matrix_synthetic.columns))]
    # transform design_matrix
    design_matrix_synthetic = design_matrix_synthetic.reset_index()
    design_matrix_synthetic.rename(columns={'index': 'sample'}, inplace=True)
    design_matrix_synthetic['sample'] = design_matrix_synthetic['sample'].apply(lambda x: f'Patient{x}')
    # Transorm translation
    #translations_synthetic.rename(columns={'Number': 'Gene'}, inplace=True)
    translations_synthetic["input"] = translations_synthetic["input"].apply(lambda x: f'Gene_{x}')
    return qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df,key_genes,key_pathways
    
