In [1]:
import numpy as np
import pandas as pd

from collections import OrderedDict
from datetime import datetime, timedelta
from typing import Any, Iterable

In [2]:
def random_date_str(N: int=1, datefmt: str='%Y-%m-%d') -> np.array:
    """ Return N random date strings.
    """
    first = datetime.strptime('2000-01-01', '%Y-%m-%d')
    last = datetime.strptime('2020-01-01', '%Y-%m-%d')
    
    # Datetime diff
    datediff = last - first
    
    # Diff in seconds
    secdiff = datediff.days*24*3600 + datediff.seconds
    
    # Generate random seconds in range
    randsecs = np.random.randint(low=0, high=secdiff+1, size=N)
    
    # Generate date strings
    resdates = []
    for idx, seconds in enumerate(randsecs):
        # New date
        rand_date = first + timedelta(seconds=int(seconds))
        resdates.append(rand_date.strftime(datefmt))
    
    return np.array(resdates)

In [3]:
def random_category(categories: Iterable[Any], N: int=1) -> np.array:
    """ Pick random elements from the supplied list
    """
    choices = np.random.randint(low=0, high=len(categories), size=N)
    chosen = []
    for choice in choices:
        chosen.append(categories[choice])
    
    return np.array(chosen)

In [4]:
random_category(['Low', 'Medium', 'High', ''], N=5)

array(['High', 'Medium', '', '', 'Medium'], dtype='<U6')

In [5]:
def random_floats(size, nan_frac=0.2, low=0, high=100):
    """ Random floats with nan_frac nans
    """
    n_nans = int(nan_frac*size)
    replace_nan = np.sort(np.random.choice(np.arange(size), size=n_nans))
    
    v = np.random.randint(low=low, high=high, size=size).astype(np.float64)
    v[replace_nan] = np.nan
    
    return v

In [6]:
random_floats(10)

array([29., nan,  3., 71., 61., 26., nan,  3., 53., 39.])

In [7]:
# Construct test data
data_len = 100

levels = ['Low', 'Medium', 'High', 'Critical', '']
settings = ['stun', 'heat', 'disintegrate', 'field burst', 'luvetric pulse', 'expanding energy pulse', 'proximity blast', '']

data = OrderedDict()
data['date'] = random_date_str(data_len)
data['level'] = random_category(levels, data_len)
data['setting'] = random_category(settings, data_len)
data['health'] = random_floats(data_len)
data['xp'] = random_floats(data_len, low=100, high=1000)

In [8]:
df = pd.DataFrame(data)
df

Unnamed: 0,date,level,setting,health,xp
0,2002-05-23,Critical,stun,12.0,755.0
1,2016-03-05,Critical,,,620.0
2,2004-01-11,Critical,proximity blast,61.0,470.0
3,2011-01-02,Medium,proximity blast,30.0,133.0
4,2008-08-08,Critical,expanding energy pulse,42.0,564.0
5,2001-01-25,High,luvetric pulse,20.0,169.0
6,2015-06-12,,stun,20.0,637.0
7,2015-10-23,Medium,,27.0,
8,2016-12-22,Critical,field burst,,852.0
9,2012-12-28,High,luvetric pulse,25.0,878.0


In [9]:
df.isnull().sum()

date        0
level       0
setting     0
health     18
xp         19
dtype: int64

In [10]:
df.to_csv('../../dataworks/tests/testdata/testdata.csv')