# Generate some example rare variant data
We also generate phenotype data that is correlated with the age and sex covariates.
```
Requires:
- numpy
- pandas
- scipy
- pathlib (part of the Python standard library)
```

In [1]:
import pandas as pd
import numpy as np
from scipy.special import expit
import pathlib

In [2]:
# Dimensions of the simulated data: X has n rows and p columns.
n, p = 5000, 1000

### Generate rare variant data

In [3]:
# Fix the seed so the simulated matrix is reproducible.
np.random.seed(1)

# maf is the probability that any single entry of X equals 1.
maf = 0.01

# An entry is 1 when its uniform draw is at or below maf, otherwise 0.
carrier_mask = np.random.rand(n, p) <= maf
X = 1 * carrier_mask
X

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(5000, 1000))

### Generate age and sex covariates

In [4]:
# NOTE(review): this reseeds with the same value used before drawing X, so the
# draws below restart the same random stream instead of continuing it.
np.random.seed(1)

# Per-row scores derived from X: a 0.01-weighted sum and a plain sum of the row.
unit_weights = np.ones(p)
sex_score = X.dot(0.01 * unit_weights)
variant_count = X.dot(unit_weights)

# sex is 1 when a uniform draw is at or below expit(sex_score), otherwise 0.
sex_draw = np.random.rand(n)
sex = 1 * (sex_draw <= expit(sex_score))

# age = |30 + 5 * variant_count + Gaussian noise with standard deviation 2.5|.
age_noise = np.random.randn(n)
age = np.abs(30 + 5 * variant_count + 2.5 * age_noise)

### Generate phenotype

In [5]:
# Number of leading columns of X that contribute to the phenotype score
# (used in the next cell as X_n[:, :p0_rare_variant]).
p0_rare_variant = 5

In [6]:
# NOTE(review): same seed as the covariate cell, so np.random.rand(n) below
# returns the same uniforms that were compared against when drawing sex.
np.random.seed(1)

# Covariate matrix with one column per covariate, in the order [age, sex].
age_col = np.asarray(age)[:, None]
sex_col = np.asarray(sex)[:, None]
cov_arr = np.hstack([age_col, sex_col])

# Standardise each column to zero mean and unit (population) standard deviation.
cov_arr_n = (cov_arr - cov_arr.mean(axis=0)) / cov_arr.std(axis=0)
X_n = (X - X.mean(axis=0)) / X.std(axis=0)

# Linear predictor: the first p0_rare_variant standardised columns of X plus
# both standardised covariates, every term with unit weight.
variant_effect = X_n[:, :p0_rare_variant].sum(axis=1)
covariate_effect = cov_arr_n.sum(axis=1)
f_i = variant_effect + covariate_effect

# The negative offset lowers the logit so that only a small fraction of y is 1.
offset = -10
case_prob = expit(f_i + offset)
y = 1 * (np.random.rand(n) <= case_prob)
y.sum(), y.mean()

(np.int64(119), np.float64(0.0238))

In [7]:
# Wrap the simulated arrays in labelled pandas objects before writing them out.
y_df = pd.Series(y).rename('pheno')
cov_df = pd.DataFrame({'age':age,'sex':sex})
X_df = pd.DataFrame(X, columns=[f'gene_{i}' for i in range(p)])

output_dir = pathlib.Path('synthetic_data')
print('making', output_dir)
output_dir.mkdir(exist_ok=True)

# One shared gzip configuration for every output file: a fixed mtime keeps the
# gzip header (and therefore the file bytes) identical across runs, and
# compresslevel 1 favours write speed over file size.
compression_settings = {'method': 'gzip', 'compresslevel': 1, 'mtime': 1}

# Written in the order y, X, cov; each file keeps the default index as its
# first column.
for frame, filename in ((y_df, 'y.csv.gz'), (X_df, 'X.csv.gz'), (cov_df, 'cov.csv.gz')):
    frame.to_csv(output_dir / filename, compression=compression_settings)
    print('written to', output_dir / filename)


making synthetic_data
written to synthetic_data/y.csv.gz
written to synthetic_data/X.csv.gz
written to synthetic_data/cov.csv.gz
