In [1]:
import numpy as np
import pandas as pd

from collections import OrderedDict
from datetime import datetime, timedelta
from typing import Any, Iterable

In [2]:
def random_date_str(N: int=1, datefmt: str='%Y-%m-%d') -> np.array:
    """ Return N random date strings.
    """
    first = datetime.strptime('2000-01-01', '%Y-%m-%d')
    last = datetime.strptime('2020-01-01', '%Y-%m-%d')
    
    # Datetime diff
    datediff = last - first
    
    # Diff in seconds
    secdiff = datediff.days*24*3600 + datediff.seconds
    
    # Generate random seconds in range
    randsecs = np.random.randint(low=0, high=secdiff+1, size=N)
    
    # Generate date strings
    resdates = []
    for idx, seconds in enumerate(randsecs):
        # New date
        rand_date = first + timedelta(seconds=int(seconds))
        resdates.append(rand_date.strftime(datefmt))
    
    return np.array(resdates)

In [3]:
def random_category(categories: Iterable[Any], N: int=1) -> np.array:
    """ Pick random elements from the supplied list
    """
    choices = np.random.randint(low=0, high=len(categories), size=N)
    chosen = []
    for choice in choices:
        chosen.append(categories[choice])
    
    return np.array(chosen)

In [4]:
random_category(['Low', 'Medium', 'High', ''], N=5)

array(['Low', 'High', '', 'High', ''], dtype='<U4')

In [5]:
def random_floats(size, nan_frac=0.2, low=0, high=100):
    """ Random floats with nan_frac nans
    """
    n_nans = int(nan_frac*size)
    replace_nan = np.sort(np.random.choice(np.arange(size), size=n_nans))
    
    v = np.random.randint(low=low, high=high, size=size).astype(np.float64)
    v[replace_nan] = np.nan
    
    return v

In [6]:
random_floats(10)

array([97., 14., 39., 96., 11., 60., nan, nan,  1., 19.])

In [7]:
# Construct test data
data_len = 100

levels = ['Low', 'Medium', 'High', 'Critical', '']
settings = ['stun', 'heat', 'disintegrate', 'field burst', 'luvetric pulse', 'expanding energy pulse', 'proximity blast', '']

data = OrderedDict()
data['date'] = random_date_str(data_len)
data['level'] = random_category(levels, data_len)
data['setting'] = random_category(settings, data_len)
data['health'] = random_floats(data_len)

In [8]:
df = pd.DataFrame(data)
df

Unnamed: 0,date,level,setting,health
0,2019-10-28,Medium,,53.0
1,2007-04-24,High,proximity blast,27.0
2,2001-03-21,Critical,expanding energy pulse,55.0
3,2019-08-03,,field burst,
4,2013-11-13,High,proximity blast,27.0
...,...,...,...,...
95,2012-12-26,Low,proximity blast,78.0
96,2008-03-31,Medium,proximity blast,20.0
97,2004-07-05,,stun,
98,2000-01-18,Low,stun,93.0


In [9]:
df.isnull().sum()

date        0
level       0
setting     0
health     19
dtype: int64

In [10]:
#df.to_csv('../../dataworks/tests/testdata/testdata.csv')