# Data Generator

Generates a synthetic dataset of people for use in the system and saves it as a CSV file (`people.csv`).

This notebook is meant to be run only once: no random seed is set, so re-running it produces a different dataset.

In [1]:
import pandas as pd
import random
import numpy as np

In [2]:
# How many rows (people) the generated table contains.
N_ROWS = 1000

# Mean and standard deviation of the normal distributions from which the
# height and weight columns are drawn.
AVG_HEIGHT, STD_HEIGHT = 170, 20
AVG_WEIGHT, STD_WEIGHT = 70, 15

# OP2 carries no real meaning; it is dummy data drawn from a normal
# distribution with this mean and standard deviation.
AVG_OP2, STD_OP2 = 0, 1

# Column names of the dataset, in output order. The first entry is the
# identifier column.
col_names = [
    "id", "first_name", "last_name", "age", "height",
    "weight", "p_type", "status", "op2_level", "inclusion_date",
]

# Pool of first names to sample from.
first_names = [
    "Alice", "Bob", "Charlie", "David", "Emily",
    "Frank", "Grace", "Henry", "Isabella", "Jack",
    "Kate", "Liam", "Mia", "Noah", "Olivia",
    "Peter", "Quinn", "Riley", "Sophia", "Thomas",
]

# Pool of last names to sample from.
last_names = [
    "Smith", "Johnson", "Williams", "Brown", "Jones",
    "Miller", "Davis", "Garcia", "Rodriguez", "Wilson",
    "Martinez", "Anderson", "Taylor", "Thomas", "Harris",
    "Moore", "Jackson", "White", "Thompson", "Martin",
]

# Dummy person types. None stands for a missing value.
types = ["ECR", "ECC", "LAM", "AFK", None]

# Possible status values for a person. None stands for a missing value.
# NOTE: 'Transfering' (sic) is kept as spelled because it is emitted verbatim
# into the generated data.
status = ["Active", "Not Active", "Transfering", None]

# Sampling weight of each entry of `status`, position by position.
status_proba = [0.7, 0.2, 0.05, 0.05]

# Candidate inclusion dates (ISO formatted strings). None stands for a
# missing value.
inclusion_dates = [
    "2024-07-22", "2024-07-23", "2024-07-24",
    "2024-07-25", "2024-07-26", "2024-07-27",
    None,
]

In [3]:
# For each column, draw N_ROWS random values from the lists and distributions
# defined above. The draws are made in the same order as the columns appear
# in col_names.

# Uniform picks from the name pools.
first_names_values = [random.choice(first_names) for _ in range(N_ROWS)]
last_names_values = [random.choice(last_names) for _ in range(N_ROWS)]

# randint is inclusive on both ends, so ages range from 20 to 80.
age_values = [random.randint(20, 80) for _ in range(N_ROWS)]

# Normally distributed measurements, rounded to two decimals in one
# vectorized call; .tolist() turns the array into a list of plain floats.
random_heights = np.random.normal(AVG_HEIGHT, STD_HEIGHT, N_ROWS)
height_values = np.round(random_heights, 2).tolist()
random_weights = np.random.normal(AVG_WEIGHT, STD_WEIGHT, N_ROWS)
weight_values = np.round(random_weights, 2).tolist()

# Uniform pick from the type list (which includes None).
type_values = [random.choice(types) for _ in range(N_ROWS)]

# Weighted pick: status_proba[i] is the weight of status[i].
status_values = random.choices(population=status, weights=status_proba, k=N_ROWS)

random_op2_levels = np.random.normal(AVG_OP2, STD_OP2, N_ROWS)
op2_level_values = np.round(random_op2_levels, 2).tolist()

# Uniform pick from the date list (which includes None).
inclusion_date_values = [random.choice(inclusion_dates) for _ in range(N_ROWS)]

# All generated columns gathered in one list. The order must match
# col_names[1:], because the columns are later paired with those names by
# position.
col_values = [
    first_names_values,
    last_names_values,
    age_values,
    height_values,
    weight_values,
    type_values,
    status_values,
    op2_level_values,
    inclusion_date_values,
]

In [4]:
def get_dataframe(col_name: str, n_rows: int) -> pd.DataFrame:
    '''
    Returns an initialized data frame with a single identifier column and n_rows rows.

    The identifiers are the consecutive integers 0, 1, ..., n_rows - 1.

    Args:
        col_name (str): name of the identifier column
        n_rows (int): number of rows in the data frame

    Returns:
        pd.DataFrame: a data frame with one column where each value is a unique number

    '''

    # list(range(...)) materializes the identifiers directly; the mapping
    # {column name: values} is all the DataFrame constructor needs.
    return pd.DataFrame({col_name: list(range(n_rows))})

In [6]:
# Start the table with only the identifier column, named after the first
# entry of col_names, holding N_ROWS rows.
df = get_dataframe(col_name=col_names[0], n_rows=N_ROWS)
df

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4
...,...
995,995
996,996
997,997
998,998


In [7]:
# Attach every generated list of values to the dataframe as a new column.
# col_names[0] (the identifier) is already in df, so only the remaining names
# are paired, by position, with the lists in col_values.
remaining_names = col_names[1:]

# zip() stops at the shorter input, so a mismatch would silently drop
# columns from the dataset; fail loudly instead.
if len(remaining_names) != len(col_values):
    raise ValueError(
        f"Expected {len(remaining_names)} lists of column values, got {len(col_values)}"
    )

for name, values in zip(remaining_names, col_values):
    df[name] = values

df

Unnamed: 0,id,first_name,last_name,age,height,weight,p_type,status,op2_level,inclusion_date
0,0,Mia,Thomas,37,175.58,56.09,,Active,-0.77,2024-07-24
1,1,Alice,Williams,74,148.65,80.53,ECR,Active,0.88,2024-07-24
2,2,Olivia,Wilson,45,147.71,43.52,LAM,Active,-1.22,2024-07-25
3,3,Noah,Taylor,40,160.54,75.39,AFK,Active,0.71,2024-07-23
4,4,Mia,White,67,195.16,98.76,,Active,-0.03,2024-07-25
...,...,...,...,...,...,...,...,...,...,...
995,995,Kate,Martinez,67,178.33,71.91,,Not Active,-0.61,2024-07-24
996,996,Mia,Taylor,46,132.16,71.79,,Active,-1.66,2024-07-25
997,997,Jack,Moore,66,179.06,83.48,ECR,Active,1.15,2024-07-25
998,998,David,White,59,174.38,70.28,ECR,Not Active,-1.65,2024-07-27


In [7]:
# Preview the first rows of the table (head() shows five rows by default).
df.head()

Unnamed: 0,id,first_name,last_name,age,height,weight,p_type,status,op2_level,inclusion_date
0,0,Henry,White,45,167.92,58.01,,Active,1.49,2024-07-27
1,1,Bob,Jackson,52,192.63,54.43,AFK,Active,0.05,2024-07-23
2,2,Jack,Williams,49,179.27,91.2,,Active,-0.46,2024-07-24
3,3,Jack,Harris,36,164.18,67.7,ECC,Active,-0.77,
4,4,Noah,Miller,52,151.59,86.56,,,-0.28,2024-07-26


In [8]:
# Preview the last rows of the table (tail() shows five rows by default).
df.tail()

Unnamed: 0,id,first_name,last_name,age,height,weight,p_type,status,op2_level,inclusion_date
995,995,Emily,Wilson,25,171.35,70.33,AFK,Transfering,-0.38,2024-07-27
996,996,Sophia,Garcia,80,159.44,82.64,,Not Active,-0.25,2024-07-26
997,997,Sophia,Martinez,25,154.95,62.57,AFK,Active,0.08,2024-07-24
998,998,Riley,Thompson,80,156.15,75.8,AFK,Not Active,0.1,2024-07-25
999,999,Emily,Garcia,35,155.33,60.87,,Active,0.19,2024-07-25


In [9]:
# Display-only view of the table indexed by 'id'. set_index() returns a new
# DataFrame and the result is not assigned, so df itself is unchanged and
# 'id' remains a regular column.
df.set_index('id')

Unnamed: 0_level_0,first_name,last_name,age,height,weight,p_type,status,op2_level,inclusion_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Henry,White,45,167.92,58.01,,Active,1.49,2024-07-27
1,Bob,Jackson,52,192.63,54.43,AFK,Active,0.05,2024-07-23
2,Jack,Williams,49,179.27,91.20,,Active,-0.46,2024-07-24
3,Jack,Harris,36,164.18,67.70,ECC,Active,-0.77,
4,Noah,Miller,52,151.59,86.56,,,-0.28,2024-07-26
...,...,...,...,...,...,...,...,...,...
995,Emily,Wilson,25,171.35,70.33,AFK,Transfering,-0.38,2024-07-27
996,Sophia,Garcia,80,159.44,82.64,,Not Active,-0.25,2024-07-26
997,Sophia,Martinez,25,154.95,62.57,AFK,Active,0.08,2024-07-24
998,Riley,Thompson,80,156.15,75.80,AFK,Not Active,0.10,2024-07-25


In [10]:
# Save the dataset as people.csv (path is relative to the working directory).
# index=False leaves out the DataFrame's row index; 'id' is a regular column
# of df and is therefore still written.
df.to_csv('people.csv', index=False)