# 1. Imports

In [1]:
import sys
import os
import pandas as pd
from IPython.display import display


sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..", "..")))

from Seq_Sim.utils.seq_sim_utils import (
    load_config,
    generate_and_save_features
)

# 2. Specify Number of Samples and Fold Change

In [2]:
# specify the number of samples to generate
num_samples = 10

# specify the fold change between the two classes
fold_change = 0.1

# 3. Specify Configuration File Parameters

In [3]:
# config_file = "../config.yml"
# config = load_config(config_file)

# or

config = {
    "log_file": "error.log",
    "data_file_path": "./data/",
    "file_path_to_simulation": "Seq_Sim/seq_sim.py",
    "functions_script_path": "Seq_Sim/utils/seq_sim_utils.py",
    "file_prefix": "sim_data",
    "num_samples": [10, 20, 30],
    "fold_changes": [0.1, 0.75, 1.5, 3],
    "n_threads": 4,
    "dummy_dataset_params": {
        "n_cells": 100,
        "sd_celltypes": 0.1,
        "n_major_cell_types": 7,
        "n_minor_cell_types": 3,
        "relative_abundance": 0.4,
        "n_major_diff_celltypes": 1,
        "n_minor_diff_celltypes": 1,
        "n_batchs": 4,
        "prop_sex": 0.5,
        "prop_disease": 0.5,
        "seed": 1234,
        "n_features": 1000,
    },
    "variance_attributes": {"cluster_ratio": 0.7},
    "ratio_variance": 0.1,
    "column_information": {
        "cluster_col": "cell_type",
        "disease_col": "disease",
        "individual_col": "subject_id",
    },
    "files_to_save": {"feature_matrix": True, "latent_factors": True},
}

# 4. Generate and Save Sequencing Data

In [4]:
try:
    # run the simulation
    generate_and_save_features(num_samples, fold_change, config)

except Exception as e:
    # log the error
    print(e)
    sys.exit(1)

# 5. Ensure files were saved properly

In [5]:
# List all files in the directory specified in the configuration
files = os.listdir(config["data_file_path"])

# Loop through the files and display their content
for file in files:
    file_path = os.path.join(config["data_file_path"], file)

    # Check if the file is a CSV (to avoid errors)
    if file.endswith(".csv"):
        df = pd.read_csv(file_path)

        # Display the first few rows of the DataFrame
        print(f"Preview of {file}:")
        display(df.head())  # Prettier display in Jupyter Notebook
        print("\n")  # Add some spacing between tables

Preview of sim_data_pseudo_feature_num_samples_10_fc_0.1.csv:


Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,Feature10,...,Feature991,Feature992,Feature993,Feature994,Feature995,Feature996,Feature997,Feature998,Feature999,Feature1000
0,-11942.239893,12.943553,-318.847654,2695.776328,-2139.814545,-717.146778,1539.996661,-4706.707206,23.007149,1162.154929,...,-2224.586927,1495.867963,-2690.879528,37.646723,1435.085427,13271.111625,659.564345,-5542.467489,-3158.84285,4542.571977
1,2439.293515,-10658.214127,4404.059579,1752.411046,-1133.866572,-3194.13965,4731.903754,-5268.52499,7.737832,-5783.376093,...,8555.393323,-5691.212264,4472.74203,30.222789,5634.529341,-3354.225469,-4704.306326,-1173.377176,11995.770742,-1147.780931
2,4031.830996,-1820.018802,-10320.317301,-11446.265464,3671.921902,11080.449494,9256.739749,-5847.214536,-2693.503273,4293.121097,...,-5144.520559,3127.773453,-122.378148,11404.34233,3453.546819,1726.92409,3215.562899,3018.529486,-3643.919678,-3268.666481
3,2237.486889,25.665411,-3193.970434,6601.299042,11282.278936,2098.872337,684.774436,-2661.093197,4053.4368,-2418.361719,...,-3363.379895,-271.982509,-2515.471402,30.658347,-3878.675356,5441.705662,-15459.669432,-3635.07248,-223.881934,-8847.167633
4,21.487903,4117.289188,2853.703536,-14819.088168,3315.687257,1110.804968,5206.448494,1028.313768,-4602.918356,-4420.918656,...,-3415.907445,2830.855292,1928.415578,37.842845,1249.763068,64.976906,-12450.703767,-3026.076038,-345.078986,-9825.684247




Preview of sim_data_latent_data_num_samples_10_fc_0.1.csv:


Unnamed: 0,subject_id,sex,disease,age,batch,bmi,cell_type
0,SUB_3,0,1,18,3,34,E
1,SUB_3,0,1,18,3,34,G
2,SUB_5,1,0,29,1,32,F
3,SUB_7,0,1,21,3,34,G
4,SUB_1,0,0,57,1,16,E




