Tools
- [Copulas](https://sdv.dev/Copulas/index.html)
- NumPy


In [2]:
%pip install copulas numpy pandas matplotlib psycopg

Collecting copulas
  Using cached copulas-0.12.1-py3-none-any.whl.metadata (9.4 kB)
Collecting plotly>=5.10.0 (from copulas)
  Downloading plotly-6.0.0-py3-none-any.whl.metadata (5.6 kB)
Collecting scipy>=1.9.2 (from copulas)
  Downloading scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting narwhals>=1.15.1 (from plotly>=5.10.0->copulas)
  Downloading narwhals-1.27.1-py3-none-any.whl.metadata (10 kB)
Downloading copulas-0.12.1-py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.5/52.5 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading plotly-6.0.0-py3-none-any.whl (14.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_6

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io


### Template: generating data from a normal distribution

In [3]:
mean = 0          # Mean (μ)
std_dev = 1       # Standard deviation (σ)
sample_size = 1000

# A dedicated, seeded Generator makes this cell print the same numbers after every
# kernel restart; the legacy np.random.normal call drew from unseeded global state.
rng = np.random.default_rng(42)
data = rng.normal(loc=mean, scale=std_dev, size=sample_size)

# Show only the first few draws rather than the whole array.
print(data[:10])

[125.13896329  83.33549345  88.45064363  91.34173322 112.79441006
  89.05217455  73.71674475  62.18200157  82.53847126  93.27803989]


In [None]:
mean = 100
std_dev = 15
num_samples = 1000

# One timestamp per day starting 2025-01-01, rendered as 'YYYY-MM-DD HH:MM:SS' strings.
time = pd.date_range(start='2025-01-01', periods=num_samples, freq='D')
time = time.strftime('%Y-%m-%d %H:%M:%S')

# Seeded Generator so the synthetic readings are reproducible across kernel restarts.
rng = np.random.default_rng(42)
synthetic_glucose = rng.normal(loc=mean, scale=std_dev, size=num_samples)

# subject_id and data_type are scalars, so pandas broadcasts them to every row.
df = pd.DataFrame({'subject_id': 1, 'data_type': 1,'datetime': time, 'reading': synthetic_glucose})

# Optional time-series plot, left disabled. Uncomment to draw; it uses the
# 'datetime' and 'reading' columns built above.
# plt.figure(figsize=(10, 6))
# plt.plot(pd.to_datetime(df['datetime']), df['reading'], label='Glucose Levels')
# plt.title('Glucose Level Time Series')
# plt.xlabel('Date')
# plt.ylabel('Glucose Level (mg/dL)')
# plt.grid(True)
# plt.xticks(rotation=45)
# plt.legend()
# plt.show()

print(df.head(10))

   subject_id             datetime     reading
0           1  2025-01-01 00:00:00  101.709086
1           1  2025-01-02 00:00:00   83.901030
2           1  2025-01-03 00:00:00   96.980613
3           1  2025-01-04 00:00:00  103.326948
4           1  2025-01-05 00:00:00   94.593801
5           1  2025-01-06 00:00:00  109.057325
6           1  2025-01-07 00:00:00  100.794202
7           1  2025-01-08 00:00:00   89.871521
8           1  2025-01-09 00:00:00   88.848135
9           1  2025-01-10 00:00:00   97.995166


In [30]:
# Small example frame; the integer column labels (2, 4, 6) are kept as-is.
data = pd.DataFrame({
    2: [57, 59, 57, 59, 57, 59, 57, 59],
    4: [54, 87, 54, 67, 54, 77, 54, 97],
    6: [150, 152, 150, 152, 150, 152, 150, 152]
})

print(data.head())

num_samples = 1000

# Seeded Generator so the synthetic frame is identical on every run.
rng = np.random.default_rng(42)

# Each column is sampled independently from a normal distribution parameterised by
# that column's sample mean and sample standard deviation. Because the draws are
# independent, any correlation between the original columns is not reproduced.
synthetic_data = pd.DataFrame({
    column: rng.normal(data[column].mean(), data[column].std(), num_samples)
    for column in data.columns
})

# Per-column arrays are still exposed under their original names.
synthetic_two = synthetic_data[2].to_numpy()
synthetic_four = synthetic_data[4].to_numpy()
synthetic_six = synthetic_data[6].to_numpy()

print(synthetic_data.head())


    2   4    6
0  57  54  150
1  59  87  152
2  57  54  150
3  59  67  152
4  57  54  150
           2          4           6
0  58.195546  70.587374  150.163420
1  57.627387  97.562190  150.944499
2  57.605791  66.619120  149.768284
3  59.045829  76.483731  152.818268
4  61.497426  74.915604  150.184029


### Data points
- Glucose: µ = 90 mg/dL, σ = 15 mg/dL
- Daily step count: µ = 8000 steps/day, σ = 3000 steps/day

In [None]:
-- Table for readings: each row holds a subject, a data-type code, a timestamp
-- and a numeric reading.
CREATE TABLE IF NOT EXISTS readings (
    id SERIAL PRIMARY KEY,      -- auto-assigned surrogate key
    subject_id INT,
    data_type INT,              -- NOTE(review): presumably a code for the kind of measurement — confirm
    datetime TIMESTAMP,
    reading DOUBLE PRECISION    -- last column: no trailing comma, PostgreSQL rejects one before ")"
);

In [None]:
import os

import psycopg
from psycopg import sql

# Connection settings are read from the standard libpq environment variables so that
# no credentials are stored in the notebook. If PGPASSWORD is unset the key is dropped
# entirely, leaving libpq to apply its own fallbacks.
db_params = {
    'dbname': os.environ.get('PGDATABASE', 'test'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': int(os.environ.get('PGPORT', '5432')),
}
db_params = {key: value for key, value in db_params.items() if value is not None}

create_table_query = '''
CREATE TABLE IF NOT EXISTS employees (
    id SERIAL PRIMARY KEY,
    name VARCHAR(100),
    position VARCHAR(100),
    hire_date DATE
);
'''

# Data to insert
data = [
    (1, 'Alice', 'Manager'),
    (2, 'Bob', 'Developer'),
    (3, 'Charlie', 'Designer'),
]

# ON CONFLICT makes the cell safe to re-run: the rows carry explicit primary keys,
# so a second plain INSERT would fail with a unique-violation error.
insert_query = (
    "INSERT INTO employees (id, name, position) VALUES (%s, %s, %s) "
    "ON CONFLICT (id) DO NOTHING"
)

try:
    # The connection context manager commits on a clean exit, rolls back if the body
    # raises, and closes the connection either way; the cursor is closed likewise.
    with psycopg.connect(**db_params) as conn, conn.cursor() as cursor:
        # The table must exist before the bulk insert runs.
        cursor.execute(create_table_query)
        # Using executemany for bulk insert
        cursor.executemany(insert_query, data)
except psycopg.Error as e:
    # Report the database error, then re-raise so the failure is not silently hidden.
    print(f"An error occurred: {e}")
    raise

In [None]:
# Bulk-load `df` into PostgreSQL with COPY.
# psycopg 3 has no cursor.copy_from() (that is the psycopg2 API); its equivalent is
# the cursor.copy() context manager used below.
# NOTE(review): 'readings' is assumed to be the target because df's columns match
# that table's non-key columns — confirm.
table_name = 'readings'

# Serialise the frame as header-less CSV in memory.
buffer = io.StringIO()
df.to_csv(buffer, index=False, header=False)
buffer.seek(0)

# The columns are listed explicitly so that the CSV fields map onto the frame's
# columns and the table's SERIAL `id` is filled in by the database. CSV format uses
# "," as delimiter and an empty unquoted field as NULL by default.
copy_query = sql.SQL("COPY {table} ({columns}) FROM STDIN WITH (FORMAT csv)").format(
    table=sql.Identifier(table_name),
    columns=sql.SQL(', ').join(sql.Identifier(str(name)) for name in df.columns),
)

# A fresh connection is opened here; leaving the `with` block commits the load and
# closes the connection.
with psycopg.connect(**db_params) as conn, conn.cursor() as cursor:
    with cursor.copy(copy_query) as copy:
        copy.write(buffer.read())

In [None]:
# NOTE(review): these are shell commands, not Python; presumably they are meant to be
# run from a terminal (or via `!` / %%bash) — confirm.
# Run script_to_run.py in a throwaway python:3.11 container named init-script, with the
# current directory mounted at /usr/src/app and used as the working directory.
docker run -it --rm --name init-script -v "$PWD":/usr/src/app -w /usr/src/app python:3.11 python script_to_run.py
# Same mount and image, but attached to the superset_default Docker network; installs
# numpy, pandas and psycopg[binary] inside the container before running myscript.py.
docker run -it --rm --network superset_default -v "$PWD":/usr/src/app -w /usr/src/app python:3.11 /bin/bash -c "pip install numpy pandas psycopg[binary] && python myscript.py"

### Generate Synthetic Data using copulas

In [29]:
from copulas.multivariate import GaussianMultivariate

# Same small example frame as used for the independent-normal approach.
data = pd.DataFrame({
    2: [57, 59, 57, 59, 57, 59, 57, 59],
    4: [54, 87, 54, 67, 54, 77, 54, 97],
    6: [150, 152, 150, 152, 150, 152, 150, 152]
})

print(data.head())

# Fit a Gaussian copula to the frame. random_state pins the sampler so that the
# synthetic rows are the same on every run.
copula = GaussianMultivariate(random_state=42)
copula.fit(data)

num_samples = 1000

# Draw num_samples synthetic rows from the fitted model and preview the first few.
synthetic_data = copula.sample(num_samples)
synthetic_data.head()

    2   4    6
0  57  54  150
1  59  87  152
2  57  54  150
3  59  67  152
4  57  54  150


  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
 improvement from the last ten iterations.
  a, b = optimize.fsolve(func, (1.0, 1.0))
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
 improvement from the last ten iterations.
  a, b = optimize.fsolve(func, (1.0, 1.0))


Unnamed: 0,2,4,6
0,58.791235,62.072673,151.791201
1,58.198486,70.432915,151.199428
2,56.952624,52.647679,149.953113
3,57.442451,61.062875,150.4434
4,59.143426,77.779518,152.143467
