In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st

In [2]:
def create_bin_data(size, p=0.5, random_state=None):
    """Draw Bernoulli-distributed 0/1 samples.

    Parameters
    ----------
    size : int or tuple of ints
        Shape of the sample to draw.
    p : float, default 0.5
        Probability of drawing a 1.
    random_state : optional
        Forwarded unchanged to scipy's ``rvs``.

    Returns
    -------
    numpy.ndarray
        Array of zeros and ones with the requested shape.
    """
    samples = st.bernoulli.rvs(p, size=size, random_state=random_state)
    return samples

def create_cat_data(size, n_cat, p=None, random_state=None):
    """Draw categorical samples with integer labels ``1 .. n_cat``.

    Parameters
    ----------
    size : int or tuple of ints
        Shape of the sample to draw.
    n_cat : int
        Number of categories; labels are ``1, 2, ..., n_cat``.
    p : array-like of float, optional
        Probability of each category (length ``n_cat``). When omitted,
        all categories are equally likely.
    random_state : None, int, or RNG object, optional
        ``None`` draws from NumPy's global RNG (controlled by
        ``np.random.seed``). An object exposing ``choice`` (for example
        ``np.random.RandomState``) is used directly. Anything else is
        treated as a seed for a fresh ``np.random.RandomState``.

    Returns
    -------
    numpy.ndarray
        Array of category labels with the requested shape.
    """
    if p is None:
        p = np.ones(n_cat) / n_cat
    cat_list = np.arange(1, n_cat + 1)

    # Previously random_state was accepted but ignored; resolve it to an
    # object with a ``choice`` method, keeping the global RNG as the default
    # so existing seeded calls draw the same stream.
    if random_state is None:
        rng = np.random
    elif hasattr(random_state, "choice"):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    return rng.choice(cat_list, p=p, size=size)

In [3]:
# Sample size: number of rows generated for the synthetic dataset.
data_size = 10 ** 6

# Seed handed to np.random.seed so the generated data is reproducible.
random_state = 432

In [4]:
# Build the synthetic dataset: hidden generating features -> "sparsed" meta
# features -> panel (time-series) columns -> target.
# NOTE: the order of the random draws below determines the generated values
# for a given seed; reordering statements changes the dataset.

# Seed NumPy's global RNG. The create_* helpers are called without an explicit
# random_state, presumably so that they draw from this global state as well.
np.random.seed(random_state)
df = pd.DataFrame()

# Binary (Bernoulli) features with success probabilities 0.3, 0.5, 0.8, 0.5.
df['bin1'] = create_bin_data(data_size, p=0.3)
df['bin2'] = create_bin_data(data_size, p=0.5)
df['bin3'] = create_bin_data(data_size, p=0.8)
df['bin4'] = create_bin_data(data_size, p=0.5)

# Categorical features with labels 1..n_cat: rank1 is uniform over 10
# categories, rank2 uses the explicit probabilities for its 4 categories.
df['rank1'] = create_cat_data(data_size, n_cat=10, p=None)
df['rank2'] = create_cat_data(data_size, n_cat=4, p=[0.1, 0.2, 0.3, 0.4])

# Columns that are linearly mixed into the meta features below.
sparsed_columns = ['bin2', 'bin3', 'rank2']


# Mix the 3 source columns into 8 meta features ('sparsed1'..'sparsed8')
# through a random (3 x 8) matrix with entries drawn uniformly from [0, 1).
meta_features_shape = 8
transform_matrix = np.random.random([len(sparsed_columns), meta_features_shape])
meta_col_names = ['sparsed' + str(i + 1) for i in range(meta_features_shape)]
df[meta_col_names] = (df[sparsed_columns].values @ transform_matrix)

# Number of time steps, i.e. panel columns generated per row.
date_len = 20

# Time index 1..date_len and two sine waves (periods of 12 and 3 steps),
# shaped (1, date_len) so they broadcast across rows.
date_range = np.arange(1, date_len + 1)
seasonal1 = np.sin(date_range * 2 * np.pi / 12).reshape(1, -1)
seasonal2 = np.sin(date_range * 2 * np.pi / 3).reshape(1, -1)
# Per-row random slope (normal, std = sparsed1) multiplied by
# (time index + sparsed2 / 40); broadcasts to shape (data_size, date_len).
trend = (
    np.random.normal(scale=df['sparsed1'], size=data_size).reshape(-1, 1)
    * (date_range.reshape(1, -1) + df['sparsed2'].values.reshape(-1, 1) / 40)
)

# Gaussian noise per row and time step: mean = sparsed3, std = rank1.
noise = np.random.normal(
    loc=df['sparsed3'].values.reshape(-1, 1),
    scale=df['rank1'].values.reshape(-1, 1),
    size=(data_size, date_len)
)

panel_col_names = ['panel' + str(i + 1) for i in range(date_len)]

# Panel value = seasonal1 scaled by sparsed4
#             + seasonal2 scaled by sparsed5 (only for rows where bin1 == 1)
#             + trend + noise.
df[panel_col_names] = (
    seasonal1 * df['sparsed4'].values.reshape(-1, 1)
    +
    seasonal2 * df['bin1'].values.reshape(-1, 1) * df['sparsed5'].values.reshape(-1, 1)
    +
    trend
    +
    noise
)

# Exponentially weighted sum of the panel columns: panel{k} gets weight
# decay_coef ** (date_len - k + 1), so later columns weigh more
# (panel{date_len} -> decay_coef ** 1, panel1 -> decay_coef ** date_len).
decay_coef = 0.8
weighted_panel_treat = sum([
    df['panel' + str(i + 1)] * decay_coef ** (date_len - i)
    for i in range(date_len)
])

# Target = weighted panel sum
#        + Gaussian noise (mean = sparsed6, std = 2, or 6 when bin2 == 1)
#        + log-normal term (log-mean = sparsed7, sigma = 2)
#        - log-normal term (log-mean = 0.6 * sparsed8, sigma = 2), applied
#          only for rows where bin4 == 1.
df['target'] = (
    weighted_panel_treat
    +
    np.random.normal(
        loc=df['sparsed6'],
        scale=2 + df['bin2'] * 4,
        size=data_size
    )
    + 
    np.random.lognormal(mean=df['sparsed7'], sigma=2, size=data_size)
    - 
    df['bin4'] * np.random.lognormal(mean=df['sparsed8'] * 0.6, sigma=2, size=data_size)
)

# Drop the raw generating features; only the sparsed*, panel* and target
# columns remain in df.
df = df.drop(columns=['bin1', 'bin2', 'bin3', 'bin4', 'rank1', 'rank2'])