In [12]:
import numpy as np
import pandas as pd
from dataclasses import dataclass
from probability import Generator, Dist

## Define features
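The `Generator` class imported above is used later in this notebook to draw samples from the `normal` (Gaussian), `binomial`, and `uniform` distributions. First, list the features that will be generated.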

In [2]:
# Names of the per-dog features to simulate. The order matters: columns are
# added to each generated DataFrame in the order they are listed here.
FEATURES = [
    'height',
    'weight',
    'bark_days',
    'ear_head_ratio',
]

Since the features follow different distributions and each one of these has different parameters, we will create a `dataclass` for each one so that we have an easy way of saving parameters.

```python
@dataclass
class my_data_class:
    my_var: str
        
foo = my_data_class(my_var="Hello World")
```

To read the value of `my_var` from `foo`, use attribute access: `foo.my_var`, which evaluates to `"Hello World"` in this example.

In [3]:
@dataclass
class params_gaussian:
    """Parameter container for a feature sampled with the Gaussian generator."""

    # Location parameter (conventionally the mean).
    mu: float
    # Scale parameter (presumably the standard deviation; confirm against
    # the sampler that consumes it).
    sigma: float

    def __repr__(self):
        """Return a compact text form with both parameters shown to three decimals."""
        mu, sigma = self.mu, self.sigma
        return f"params_gaussian(mu={mu:.3f}, sigma={sigma:.3f})"
    
@dataclass
class params_binomial:
    """Parameter container for a feature sampled with the binomial generator."""

    # Count parameter (conventionally the number of trials).
    n: int
    # Probability parameter (conventionally the per-trial success probability).
    p: float

    def __repr__(self):
        """Return a compact text form; `n` is shown as-is, `p` to three decimals."""
        n, p = self.n, self.p
        return f"params_binomial(n={n}, p={p:.3f})"
    
@dataclass
class params_uniform:
    """Parameter container for a feature sampled with the uniform generator."""

    # First endpoint of the interval (conventionally the lower bound).
    a: float
    # Second endpoint of the interval (conventionally the upper bound).
    b: float

    def __repr__(self):
        """Return a compact text form; both endpoints are shown unrounded."""
        a, b = self.a, self.b
        return f"params_uniform(a={a}, b={b})"

Let's define a dictionary that stores, for each dog breed, the distribution parameters of every feature.

In [4]:
# Distribution parameters for every feature, keyed first by breed id (0, 1, 2)
# and then by feature name. height/weight use Gaussian parameters, bark_days
# uses binomial parameters and ear_head_ratio uses uniform parameters.
breed_params = {
    0: {
        'height': params_gaussian(mu=35, sigma=1.5),
        'weight': params_gaussian(mu=20, sigma=1),
        'bark_days': params_binomial(n=30, p=0.8),
        # Bounds are given as (lower, upper), like the other breeds; they were
        # previously swapped (a=0.6, b=0.1), which describes an inverted interval.
        'ear_head_ratio': params_uniform(a=0.1, b=0.6)
    },
    1: {
        'height': params_gaussian(mu=30, sigma=2),
        'weight': params_gaussian(mu=25, sigma=5),
        'bark_days': params_binomial(n=30, p=0.5),
        'ear_head_ratio': params_uniform(a=0.2, b=0.5)
    },
    2: {
        'height': params_gaussian(mu=40, sigma=3.5),
        'weight': params_gaussian(mu=32, sigma=3),
        'bark_days': params_binomial(n=30, p=0.3),
        'ear_head_ratio': params_uniform(a=0.1, b=0.3)
    }
}

## Generating the dataset
With the parameters and distributions for each breed of dogs, let's generate a dataset.

In [5]:
# Single shared sampler instance; generate_data_for_breed() reads it as a
# module-level global rather than receiving it as an argument.
# NOTE(review): no seed is passed here - confirm whether `Generator` can be
# seeded if the generated dataset needs to be reproducible.
generator = Generator()

In [6]:
def generate_data_for_breed(breed, features, samples, params):
    """
    Generate synthetic data for a specific breed of dogs based on given features and parameters.

    Parameters:
        - breed (str): The breed of the dog for which data is generated.
        - features (list[str]): List of features to generate data for.
        - samples (int): Number of samples to generate for each feature.
        - params (dict): Dictionary containing parameters for each breed and its features.

    Returns:
        - df (pandas.DataFrame): A DataFrame containing the generated synthetic data.
            The DataFrame will have columns for each feature and an additional column for the breed.
    """
    
    df = pd.DataFrame()
    
    for feature in features:
        match feature:
            case "height" | "weight":
                df[feature] = generator.gaussian_generator(params[breed][feature].mu, params[breed][feature].sigma, samples)
                
            case "bark_days":
                df[feature] = generator.binomial_generator(params[breed][feature].n, params[breed][feature].p, samples)
                                       
            case "ear_head_ratio":
                df[feature] = generator.uniform_generator(params[breed][feature].a, params[breed][feature].b, samples)    
    
    df["breed"] = breed
    
    return df

### Generate data for each breed

In [7]:
# Seed for the row shuffle below. Without it the row order, and therefore the
# later train/test split, changes on every run even for identical generated data.
SHUFFLE_SEED = 42

# Simulate each breed separately; the sample counts differ per breed.
df_0 = generate_data_for_breed(breed=0, features=FEATURES, samples=1200, params=breed_params)
df_1 = generate_data_for_breed(breed=1, features=FEATURES, samples=1350, params=breed_params)
df_2 = generate_data_for_breed(breed=2, features=FEATURES, samples=900, params=breed_params)

# Concatenate all breeds into a single dataframe with a fresh 0..N-1 index
df_all_breeds = pd.concat([df_0, df_1, df_2], ignore_index=True)

# Shuffle the data (frac=1 returns every row, in a random but seeded order).
# The index is intentionally left as-is so each row keeps its pre-shuffle label.
df_all_breeds = df_all_breeds.sample(frac=1, random_state=SHUFFLE_SEED)

# Print the dataframe
df_all_breeds.head(10)

Unnamed: 0,height,weight,bark_days,ear_head_ratio,breed
2836,39.69781,31.74098,9.0,0.19312,2
1002,36.710641,21.140427,26.0,0.163527,0
1075,34.72693,19.817954,24.0,0.386113,0
1583,32.324884,30.81221,18.0,0.463242,1
248,37.691499,21.794333,28.0,0.11819,0
814,36.688852,21.125901,26.0,0.165052,0
1407,30.844078,27.110196,16.0,0.399051,1
3376,38.616784,30.814387,8.0,0.169269,2
2700,44.655532,35.990456,12.0,0.281653,2
533,35.209095,20.139397,24.0,0.322284,0


## Testing and Training Data
Use 70% of the generated data for training and the remaining 30% for testing.

In [8]:
# Define a 70/30 training/testing split
# Row position where the test set starts: the first 70% of rows are for training
split = int(len(df_all_breeds)*0.7)

# Slice the frame by position at that boundary and renumber each part from 0
df_train, df_test = (
    part.reset_index(drop=True)
    for part in (df_all_breeds.iloc[:split], df_all_breeds.iloc[split:])
)