In [1]:
import numpy as np
import pickle
import pandas as pd

In [2]:
seed = 121315
np.random.seed(seed)

### Defining a function for generating regression data 

In [3]:
def generate_dummy_data(n_samples, n_features, pos_important_coefficients, neg_important_coefficients, not_important_coefficients, noise=None, random_state=None):
    """
    Generate synthetic regression data with custom coefficients for features, divided into three categories: 
    - Positive important coefficients (features that contribute positively to the target).
    - Negative important coefficients (features that contribute negatively to the target).
    - Not important coefficients (features that have no effect on the target).
    
    Parameters:
    -----------
    n_samples : int
        The number of data samples to generate (i.e., rows in the feature matrix `X`).
    
    n_features : int
        The total number of features to generate for each sample (i.e., columns in `X`).
    
    pos_important_coefficients : array-like
        An array containing the coefficients for features that positively contribute to the target variable `y`. 
        These coefficients should be positive values.
    
    neg_important_coefficients : array-like
        An array containing the coefficients for features that negatively contribute to the target variable `y`. 
        These coefficients should be negative values.
    
    not_important_coefficients : array-like
        An array of zeros representing features that do not contribute to the target variable `y` (i.e., coefficients with zero value).
    
    noise : float or None, optional (default=None)
        The level of noise to add to the generated target `y`. If specified, random noise scaled by this factor is added to the target variable. If `None`, no noise is added.
    
    random_state : int or None, optional (default=None)
        Random seed used to control reproducibility. If `None`, the random state is not fixed.
    
    Returns:
    --------
    X : numpy.ndarray of shape (n_samples, n_features)
        The generated feature matrix where each row represents a sample and each column represents a feature.
    
    y : numpy.ndarray of shape (n_samples,)
        The target variable calculated as a weighted sum of `X` and the coefficients, with optional noise added.
    
    coefficients : numpy.ndarray of shape (n_features,)
        The combined and shuffled array of coefficients used to generate `y`. 
        This includes the positive, negative, and non-important coefficients.
    
    informative_indices_pos : list
        A list of indices where the corresponding coefficient was positively important (i.e., in the range (0, 1]).
    
    informative_indices_neg : list
        A list of indices where the corresponding coefficient was negatively important (i.e., in the range [-1, 0)).
    
    not_informative_indices : list
        A list of indices where the corresponding coefficient was not important (i.e., exactly 0).
    
    Example:
    --------
    # Generating data with 100 samples and 10 features. 
    # There are 3 positive coefficients, 2 negative, and 5 zero-coefficients.
    
    X, y, coefficients, pos_idx, neg_idx, zero_idx = generate_dummy_data(
        n_samples=100,
        n_features=10,
        pos_important_coefficients=[0.8, 0.6, 0.9],
        neg_important_coefficients=[-0.5, -0.7],
        not_important_coefficients=[0, 0, 0, 0, 0],
        noise=0.1,
        random_state=42
    )
    """
    rng = np.random.RandomState(random_state)
    
    # Generate features
    X = rng.uniform(0, 1, size=(n_samples, n_features))
    
    # Combine coefficients for all groups
    coefficients = np.concatenate((pos_important_coefficients, neg_important_coefficients, not_important_coefficients))
    np.random.shuffle(coefficients)
    
    # Generate target variable
    y = np.dot(X, coefficients) + noise * rng.uniform(0,1,n_samples)
    
    # Initialize arrays to store indices
    informative_indices_pos = []
    informative_indices_neg = []
    not_informative_indices = []
    
    # Loop through coefficients and store indices based on value ranges
    for idx, coef in enumerate(coefficients):
        if 0 < coef <= 1:
            informative_indices_pos.append(idx)
        elif -1 <= coef < 0:
            informative_indices_neg.append(idx)
        elif coef == 0:
            not_informative_indices.append(idx)
    
    return X, y, coefficients, informative_indices_pos, informative_indices_neg, not_informative_indices


### Call the function to generate dummy data

In [4]:
n_samples = 27857
n_features = 86

pos_important_coefficients = np.random.uniform(0.1, 1, 29)
neg_important_coefficients = np.random.uniform(-1, -0.1, 29)
not_important_coefficients = np.zeros(28)

X_custom, y_custom, coef_custom, pos_important_indices, neg_important_indices, not_important_indices = generate_dummy_data(n_samples=n_samples, n_features=n_features, pos_important_coefficients=pos_important_coefficients, neg_important_coefficients=neg_important_coefficients, not_important_coefficients=not_important_coefficients, noise=0.00)

print("Coefficients:", coef_custom)
print("Indices of positive important coefficients:", pos_important_indices)
print("Indices of negative important coefficients:", neg_important_indices)
print("Indices of not important coefficients:", not_important_indices)

Coefficients: [-0.63594305  0.          0.          0.17525386 -0.7779082   0.
 -0.98291919  0.          0.48170357  0.7238502   0.         -0.21442003
 -0.87306332 -0.15549965  0.32973782 -0.22460473 -0.97047831  0.37892692
  0.81156204  0.63564602  0.         -0.16655291 -0.77061304 -0.87683527
  0.17308415  0.          0.          0.         -0.30745003  0.
  0.93122506  0.17187668  0.          0.90455584  0.30156601 -0.55837463
  0.25841627 -0.12286427 -0.50009029  0.          0.92548    -0.44311534
 -0.78587031  0.         -0.37503585  0.          0.          0.
  0.13982232  0.          0.15928465  0.          0.          0.
  0.51518631  0.72449458  0.          0.36992233  0.          0.66507627
  0.         -0.19668351  0.60565399  0.21906226 -0.1481139   0.63921096
 -0.15240682  0.          0.306101   -0.70343462  0.87187095  0.
 -0.37274027  0.86212381 -0.3783923  -0.27720116  0.          0.15689642
 -0.54133773 -0.25543164 -0.29481967  0.          0.18949333 -0.19345734
  0.

### Save dummy data and corresponding coefficients of the linear model

In [5]:
df_coef = pd.DataFrame({'Coefficient': coef_custom, 'Index': range(len(coef_custom))})
df_coef.to_pickle('df_coef.pkl')

In [6]:
import pickle 
with open('coef_array.pkl', 'wb') as f:
    pickle.dump(coef_custom, f)

In [7]:
df = pd.DataFrame(X_custom, columns=[f"feature_{i+1}" for i in range(n_features)])
df['target'] = y_custom
df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_78,feature_79,feature_80,feature_81,feature_82,feature_83,feature_84,feature_85,feature_86,target
0,0.211067,0.856305,0.423385,0.933193,0.376651,0.688933,0.778908,0.477010,0.700582,0.096945,...,0.617012,0.353717,0.690478,0.077228,0.548231,0.045505,0.974648,0.263900,0.148638,-0.713859
1,0.108817,0.158638,0.763131,0.366142,0.588922,0.798592,0.056710,0.360544,0.612287,0.837975,...,0.551043,0.338840,0.401196,0.265168,0.102674,0.621692,0.087930,0.846777,0.606630,1.518505
2,0.388005,0.049193,0.225762,0.077234,0.921318,0.945317,0.041298,0.157639,0.582715,0.779889,...,0.180831,0.163115,0.543139,0.802616,0.468819,0.894124,0.550438,0.897475,0.417030,0.629713
3,0.302404,0.115896,0.788347,0.700772,0.651147,0.209887,0.405758,0.193929,0.873774,0.724749,...,0.216003,0.807926,0.423629,0.726326,0.810544,0.736725,0.453691,0.992162,0.980885,0.187191
4,0.471601,0.084490,0.241151,0.787130,0.994776,0.620574,0.883916,0.297099,0.147879,0.756186,...,0.864682,0.427565,0.936642,0.105763,0.917179,0.092923,0.611967,0.062273,0.287476,-0.493460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27852,0.541447,0.076694,0.191292,0.957523,0.655240,0.750578,0.835717,0.801304,0.939088,0.787465,...,0.878235,0.580967,0.749924,0.909567,0.272238,0.988751,0.909523,0.414757,0.888203,1.132753
27853,0.050826,0.658888,0.660791,0.121189,0.589735,0.527047,0.262764,0.334694,0.044825,0.278862,...,0.455876,0.472661,0.948820,0.787206,0.138194,0.797808,0.290750,0.059970,0.060240,-1.876087
27854,0.286700,0.468533,0.089225,0.181216,0.694029,0.274736,0.531953,0.827450,0.603869,0.459552,...,0.695792,0.893947,0.561364,0.710640,0.759805,0.456218,0.099488,0.666659,0.536682,1.340582
27855,0.748162,0.899440,0.600394,0.868040,0.680574,0.130774,0.223466,0.137248,0.012350,0.033816,...,0.188691,0.049275,0.480876,0.841159,0.813381,0.479295,0.362349,0.654077,0.477326,0.047151


In [8]:
df.to_pickle('df_data.pkl')

In [9]:
filename = "positive_correlated_features_idx"
with open(filename, 'wb') as f:
    pickle.dump(pos_important_indices, f)

In [10]:
filename = "negative_correlated_features_idx"
with open(filename, 'wb') as f:
    pickle.dump(neg_important_indices, f)

In [11]:
filename = "not_correlated_features_idx"
with open(filename, 'wb') as f:
    pickle.dump(not_important_indices, f)