In [6]:
import pandas as pd
import numpy as np

class Dataset:
    """Generator of a random tabular dataset returned as a pandas DataFrame.

    Parameters
    ----------
    d_size : int
        Number of observations; passed as the size of every generated column.
    d_numvars : int
        Number of numeric columns to create (named ``Num_1`` .. ``Num_k``).
    d_numcats : int
        Number of categorical columns to create (named ``Cat_1`` .. ``Cat_k``).
    d_dates : int
        Number of date columns to create (named ``Date_1`` .. ``Date_k``).
    depvar : str
        ``'NUM'`` for a numeric ``TARGET`` column, ``'CHAR'`` for a two-level
        categorical ``TARGET`` column. Any other value produces no ``TARGET``.
    """

    def __init__ (self, d_size, d_numvars, d_numcats, d_dates, depvar):
        self.d_size = d_size
        self.d_numvars = d_numvars
        self.d_numcats = d_numcats
        self.d_dates = d_dates
        self.depvar = depvar

    def gen_dataset(self):
        """Build the dataset and return it as a ``pandas.DataFrame``.

        Column order is: ``TARGET`` (if requested), numeric columns,
        categorical columns, date columns. The characteristics of each column
        are drawn at random from the candidate lists below.

        Returns
        -------
        pandas.DataFrame
            One column per generated variable. An empty frame is returned when
            no variable at all was requested.
        """
        # Candidate characteristics for numeric columns.
        num_param1 = ['INTEGER', 'FLOAT']
        num_param2 = ['POSITIVE', 'NEGATIVE', 'MIXED']
        num_param3 = ['LT1', 'LOW', 'MEDIUM', 'HIGH', 'HUGE']

        # Candidate level counts and level encodings for categorical columns.
        cat_param1 = list(range(1, 21))
        cat_param2 = ['NUM', 'CHAR']

        # Candidate start years and frequencies for date columns.
        dt_param1 = list(range(1947, 2021))
        dt_param2 = ['M', 'D', 'Y']

        # Each entry is a single-key dict {column_name: values}.
        var_list = []

        ## ********************************** ##
        ## **** TARGET VARIABLE CREATION **** ##
        ## ********************************** ##

        if self.depvar == 'NUM':
            tp1 = np.random.choice(num_param1)
            tp3 = np.random.choice(num_param3)
            # A numeric target is always generated with a positive sign.
            target = Num_var(tp1, 'POSITIVE', tp3, 'TARGET', self.d_size)
            var_list.append(target.generate())
        elif self.depvar == 'CHAR':
            tp2 = np.random.choice(cat_param2)
            # A categorical target always has exactly two levels.
            target = Cat_var(2, tp2, self.d_size, 'TARGET')
            var_list.append(target.cat_generate())

        ## ********************************** ##
        ## *** NUMERIC VARIABLE CREATION **** ##
        ## ********************************** ##

        for i in range(self.d_numvars):
            np1 = np.random.choice(num_param1)
            np2 = np.random.choice(num_param2)
            np3 = np.random.choice(num_param3)
            ng = Num_var(np1, np2, np3, 'Num_'+str(i+1), self.d_size)
            var_list.append(ng.generate())

        ## *************************************** ##
        ## **** CATEGORICAL VARIABLE CREATION **** ##
        ## *************************************** ##

        for i in range(self.d_numcats):
            cp1 = np.random.choice(cat_param1)
            cp2 = np.random.choice(cat_param2)
            cg = Cat_var(cp1, cp2, self.d_size, 'Cat_'+str(i+1))
            var_list.append(cg.cat_generate())

        ## ******************************** ##
        ## **** DATE VARIABLE CREATION **** ##
        ## ******************************** ##

        for i in range(self.d_dates):
            dp1 = np.random.choice(dt_param1)
            dp2 = np.random.choice(dt_param2)
            dg = Date_var(dp1, dp2, 'Date_'+str(i+1), self.d_size)
            var_list.append(dg.date_generate())

        # Merge every single-column dict in generation order. Handling any
        # number of columns (including zero or one) avoids the unbound `df`
        # the previous two-or-more special-casing produced.
        main_dict = {}
        for var in var_list:
            main_dict.update(var)

        return pd.DataFrame(main_dict)
    
    ################################################################################################
    ################################################################################################
    ### >>>                          GENERATE CATEGORICAL VARIABLES                          <<< ###
    ################################################################################################
    ################################################################################################
    
class Cat_var(Dataset):
    """Generator of one random categorical column.

    Note: ``__init__`` does not call the parent constructor, so the attributes
    that ``Dataset.gen_dataset`` reads are not set on instances of this class.

    Parameters
    ----------
    n_levels : int
        Number of distinct levels the column may take.
    lev_type : str
        ``'CHAR'`` for string levels ``'LEVEL1'`` .. ``'LEVELn'``;
        ``'NUM'`` for integer levels ``1`` .. ``n``.
    c_size : int
        Number of values to draw.
    c_name : str
        Column name used as the key of the returned dict.
    """

    def __init__(self, n_levels, lev_type, c_size, c_name):
        self.n_levels = n_levels
        self.lev_type = lev_type
        self.c_size = c_size
        self.c_name = c_name

    def cat_generate(self):
        """Draw the column and return it as ``{c_name: ndarray}``.

        The levels are sampled with random, non-uniform probabilities:
        one weight per level is drawn from U(0, 100) and the weights are
        normalised to sum to one.

        Raises
        ------
        ValueError
            If ``lev_type`` is neither ``'CHAR'`` nor ``'NUM'``.
        """
        if self.lev_type == 'CHAR':
            # ***** CATEGORICAL LEVEL ENCODED ****** #
            lev_list = ['LEVEL'+str(i+1) for i in range(self.n_levels)]
        elif self.lev_type == 'NUM':
            # ******* NUMERIC LEVEL ENCODED ******* #
            lev_list = [i+1 for i in range(self.n_levels)]
        else:
            raise ValueError(
                "lev_type must be 'CHAR' or 'NUM', got {!r}".format(self.lev_type)
            )

        # Random weights, normalised into a probability vector over the levels.
        r = np.random.uniform(0, 100, size=self.n_levels)
        s = sum(r)
        probs = [w/s for w in r]
        cvar = np.random.choice(lev_list, size=self.c_size, p=probs)
        return {self.c_name: cvar}
    
    ################################################################################################
    ################################################################################################
    ### >>>                          GENERATE NUMERICAL VARIABLES                            <<< ###
    ################################################################################################
    ################################################################################################

class Num_var(Dataset):
    """Generator of one random numeric column.

    Note: ``__init__`` does not call the parent constructor, so the attributes
    that ``Dataset.gen_dataset`` reads are not set on instances of this class.

    Parameters
    ----------
    v_type : str
        ``'INTEGER'`` (drawn with ``np.random.randint``) or ``'FLOAT'``
        (drawn with ``np.random.uniform``).
    v_sign : str
        ``'POSITIVE'``, ``'NEGATIVE'`` or ``'MIXED'``.
    v_range : str
        Magnitude band: ``'LT1'``, ``'LOW'``, ``'MEDIUM'``, ``'HIGH'`` or ``'HUGE'``.
    v_name : str
        Column name used as the key of the returned dict.
    n_size : int
        Number of values to draw.
    """

    # Magnitude band -> (low, high) bounds of the absolute value.
    _MAGNITUDE_BOUNDS = {
        'LT1': (0, 1),
        'LOW': (0, 100),
        'MEDIUM': (100, 5000),
        'HIGH': (5000, 100000),
        'HUGE': (100000, 10000000),
    }

    def __init__ (self, v_type, v_sign, v_range, v_name, n_size):
        self.v_type = v_type
        self.v_sign = v_sign
        self.v_range = v_range
        self.v_name = v_name
        self.n_size = n_size

    def generate(self):
        """Draw the column and return it as ``{v_name: ndarray}``.

        With ``(low, high)`` the bounds of the magnitude band, values are
        drawn from ``[low, high)`` for POSITIVE, ``[-high, -low)`` for
        NEGATIVE and ``[-high, high)`` for MIXED. For INTEGER columns the
        upper bound is exclusive, so e.g. INTEGER/POSITIVE/LT1 is always 0.

        Raises
        ------
        ValueError
            If ``v_type``, ``v_sign`` or ``v_range`` is not a supported value.
        """
        if self.v_range not in self._MAGNITUDE_BOUNDS:
            raise ValueError(
                "v_range must be one of {}, got {!r}".format(
                    list(self._MAGNITUDE_BOUNDS), self.v_range
                )
            )
        low, high = self._MAGNITUDE_BOUNDS[self.v_range]

        # Turn the magnitude band into a signed interval.
        if self.v_sign == 'POSITIVE':
            lower, upper = low, high
        elif self.v_sign == 'NEGATIVE':
            lower, upper = -high, -low
        elif self.v_sign == 'MIXED':
            lower, upper = -high, high
        else:
            raise ValueError(
                "v_sign must be 'POSITIVE', 'NEGATIVE' or 'MIXED', got {!r}".format(self.v_sign)
            )

        if self.v_type == 'INTEGER':
            nvar = np.random.randint(lower, upper, size=self.n_size, dtype=int)
        elif self.v_type == 'FLOAT':
            nvar = np.random.uniform(lower, upper, size=self.n_size)
        else:
            raise ValueError(
                "v_type must be 'INTEGER' or 'FLOAT', got {!r}".format(self.v_type)
            )

        return {self.v_name: nvar}
    
class Date_var(Dataset):
    """Generator of one date column built with ``pandas.date_range``.

    Parameters
    ----------
    start_year : int
        Year of the 1 January date the range starts from.
    frequency : str
        Frequency alias forwarded unchanged as ``freq`` to ``pandas.date_range``.
    dt_name : str
        Column name used as the key of the returned dict.
    dt_size : int
        Number of periods (dates) to generate.
    """

    def __init__(self, start_year, frequency, dt_name, dt_size):
        self.start_year, self.frequency = start_year, frequency
        self.dt_name, self.dt_size = dt_name, dt_size

    def date_generate(self):
        """Return ``{dt_name: DatetimeIndex}`` holding ``dt_size`` dates."""
        # NOTE(review): recent pandas releases deprecate the 'M' and 'Y'
        # aliases in favour of 'ME' and 'YE' - confirm against the installed
        # version before changing what callers pass in.
        first_day = f'01-01-{self.start_year}'
        stamps = pd.date_range(
            start=first_day,
            periods=self.dt_size,
            freq=self.frequency,
        )
        return {self.dt_name: stamps}
    
def _prompt(question):
    """Read one answer from stdin, then print the separator shown after every prompt."""
    answer = input(question)
    print()
    print("*****************************************")
    print()
    return answer


def create_dataset():
    """Interactively ask for the dataset specification and build the dataset.

    Prompts, in order, for the number of observations, the number of numeric,
    character and date variables, and the kind of dependent variable.

    Returns
    -------
    The value returned by ``Dataset.gen_dataset()`` for the entered
    specification.

    Raises
    ------
    ValueError
        If one of the four count answers cannot be parsed by ``int``.
    """
    print("*****************************************")
    print("**      CREATING YOUR OWN DATASET      **")
    print("*****************************************")
    print()

    dsize = int(_prompt("=> WHAT SHOULD BE THE SIZE OF YOUR DATASET (HOW MANY OBSERVATIONS) ? "))
    nnum = int(_prompt("=> HOW MANY NUMERIC TYPE VARIABLES DO YOU WANT IN THE DATASET ? "))
    cnum = int(_prompt("=> HOW MANY CHARACTER TYPE VARIABLES DO YOU WANT IN THE DATASET ? "))
    dnum = int(_prompt("=> HOW MANY DATE TYPE VARIABLES DO YOU WANT IN THE DATASET ? "))
    targ = _prompt("=> WHAT KIND OF DEPENDENT VARIABLE DO YOU WANT (type NUM for NUMERIC and CHAR for CHARACTER) ? ")

    # The target kind is matched by exact string comparison downstream, so
    # normalise stray whitespace and letter case ("char ", "Num") here;
    # otherwise the TARGET column would be silently omitted.
    targ = targ.strip().upper()

    a = Dataset(dsize, nnum, cnum, dnum, targ)
    return a.gen_dataset()

In [7]:
# Run the interactive builder; the value it returns is the last expression of
# the cell and is therefore rendered as the cell's output.
create_dataset()

*****************************************
**      CREATING YOUR OWN DATASET      **
*****************************************



=> WHAT SHOULD BE THE SIZE OF YOUR DATASET (HOW MANY OBSERVATIONS) ?  30



*****************************************



=> HOW MANY NUMERIC TYPE VARIABLES DO YOU WANT IN THE DATASET ?  4



*****************************************



=> HOW MANY CHARACTER TYPE VARIABLES DO YOU WANT IN THE DATASET ?  2



*****************************************



=> HOW MANY DATE TYPE VARIABLES DO YOU WANT IN THE DATASET ?  1



*****************************************



=> WHAT KIND OF DEPENDENT VARIABLE DO YOU WANT (type NUM for NUMERIC and CHAR for CHARACTER) ?  CHAR



*****************************************



Unnamed: 0,TARGET,Num_1,Num_2,Num_3,Num_4,Cat_1,Cat_2,Date_1
0,2,66,-2388447,8271963.0,199,LEVEL9,6,2016-12-31
1,2,11,-7967239,-1184945.0,-4790,LEVEL10,2,2017-12-31
2,1,15,-2607985,-5923686.0,-3297,LEVEL10,8,2018-12-31
3,2,10,-5887554,-368752.5,-4808,LEVEL2,4,2019-12-31
4,2,94,-669603,-5373524.0,1705,LEVEL2,11,2020-12-31
5,1,98,-3213682,-9859446.0,-1860,LEVEL9,4,2021-12-31
6,1,67,-6190838,-1806631.0,2527,LEVEL12,8,2022-12-31
7,2,7,-2477937,-7635454.0,-700,LEVEL2,9,2023-12-31
8,2,67,-1002385,3152057.0,-4695,LEVEL9,6,2024-12-31
9,2,5,-2655168,-4366594.0,1048,LEVEL3,11,2025-12-31
