In [2]:
import numpy as np
import pandas as pd
from dataclasses import dataclass
from sklearn.metrics import accuracy_score
from probability import Generator, Dist

import pprint
# Shared pretty-printer, used later to display the nested parameter dictionaries
pp = pprint.PrettyPrinter()

## Define features
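The generator functions for the `normal` (Gaussian), `binomial`, and `uniform` distributions are imported from the `probability` module in the first cell. Each dog is described by the four features listed below.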

In [7]:
# Names of the feature columns; each name also selects the distribution used for that feature
FEATURES = ['height', 'weight', 'bark_days', 'ear_head_ratio']

Since the features follow different distributions and each one of these has different parameters, we will create a `dataclass` for each one so that we have an easy way of saving parameters.

```python
@dataclass
class my_data_class:
    my_var: str
        
foo = my_data_class(my_var="Hello World")
```

To access the information of `my_var` from `foo`, the syntax `foo.my_var` is used, which should be equal to "Hello World" in this example.

In [8]:
@dataclass
class params_gaussian:
    """Parameters of a Gaussian distribution: mean `mu` and standard deviation `sigma`."""
    mu: float
    sigma: float

    def __repr__(self):
        # Both parameters are shown with three decimals
        template = "params_gaussian(mu={:.3f}, sigma={:.3f})"
        return template.format(self.mu, self.sigma)
    
@dataclass
class params_binomial:
    """Parameters of a binomial distribution: number of trials `n` and success probability `p`."""
    n: int
    p: float

    def __repr__(self):
        # n is a count, so it is printed as-is; only p is rounded to three decimals
        return f"params_binomial(n={self.n}, p={self.p:.3f})"
    
@dataclass
class params_uniform:
    """Parameters of a uniform distribution: the two interval bounds `a` and `b`."""
    a: float
    b: float

    def __repr__(self):
        # Both bounds are shown with three decimals
        template = "params_uniform(a={:.3f}, b={:.3f})"
        return template.format(self.a, self.b)

Let's define a dictionary that holds the distribution parameters of every feature for each dog breed. The breeds are encoded as the integers 0, 1, and 2.

In [9]:
# Generating parameters for every breed, keyed by the integer breed label.
# height and weight are Gaussian, bark_days is binomial, ear_head_ratio is uniform.
breed_params = {
    0: {
        'height': params_gaussian(mu=35, sigma=1.5),
        'weight': params_gaussian(mu=20, sigma=1),
        'bark_days': params_binomial(n=30, p=0.8),
        # Bounds are given as (lower, upper), like the other breeds
        'ear_head_ratio': params_uniform(a=0.1, b=0.6)
    },
    1: {
        'height': params_gaussian(mu=30, sigma=2),
        'weight': params_gaussian(mu=25, sigma=5),
        'bark_days': params_binomial(n=30, p=0.5),
        'ear_head_ratio': params_uniform(a=0.2, b=0.5)
    },
    2: {
        'height': params_gaussian(mu=40, sigma=3.5),
        'weight': params_gaussian(mu=32, sigma=3),
        'bark_days': params_binomial(n=30, p=0.3),
        'ear_head_ratio': params_uniform(a=0.1, b=0.3)
    }
}

## Generating the dataset
With the parameters and distributions for each breed of dogs, let's generate a dataset.

In [10]:
# Sampler instance from the local `probability` module; generate_data_for_breed draws from it
generator = Generator()

In [11]:
def generate_data_for_breed(breed, features, samples, params):
    """
    Build a DataFrame of synthetic samples for a single breed.

    Args:
        breed: Key of the breed in `params` (an integer label in this notebook).
            The same value is written to the "breed" column.
        features (list[str]): Feature names to generate. Only "height", "weight",
            "bark_days" and "ear_head_ratio" are recognised; other names are skipped.
        samples (int): Number of samples requested from the generator for each feature.
        params (dict): Nested dictionary `params[breed][feature]` holding the
            parameter dataclass of every feature.

    Returns:
        pandas.DataFrame: One column per generated feature plus a "breed" column.
    """
    data = pd.DataFrame()

    for name in features:
        if name in ("height", "weight"):
            # Gaussian features
            gauss = params[breed][name]
            data[name] = generator.gaussian_generator(gauss.mu, gauss.sigma, samples)
        elif name == "bark_days":
            # Binomial feature
            binom = params[breed][name]
            data[name] = generator.binomial_generator(binom.n, binom.p, samples)
        elif name == "ear_head_ratio":
            # Uniform feature
            unif = params[breed][name]
            data[name] = generator.uniform_generator(unif.a, unif.b, samples)

    # Label every generated row with the breed it was drawn for
    data["breed"] = breed

    return data

In [12]:
# Generate data for each dog breed
df_0 = generate_data_for_breed(breed=0, features=FEATURES, samples=1200, params=breed_params)
df_1 = generate_data_for_breed(breed=1, features=FEATURES, samples=1350, params=breed_params)
df_2 = generate_data_for_breed(breed=2, features=FEATURES, samples=900, params=breed_params)

# Concatenate all breeds into a single dataframe
df_all_breeds = pd.concat([df_0, df_1, df_2]).reset_index(drop=True)

# Shuffle the rows with a fixed seed so that the row order (and any split taken from it) is repeatable.
# NOTE(review): the samples themselves come from `generator`; whether it is seeded is not visible here.
df_all_breeds = df_all_breeds.sample(frac=1, random_state=42)

# Show the first rows of the shuffled dataframe
df_all_breeds.head(10)

Unnamed: 0,height,weight,bark_days,ear_head_ratio,breed
2836,39.69781,31.74098,9.0,0.19312,2
1002,36.710641,21.140427,26.0,0.163527,0
1075,34.72693,19.817954,24.0,0.386113,0
1583,32.324884,30.81221,18.0,0.463242,1
248,37.691499,21.794333,28.0,0.11819,0
814,36.688852,21.125901,26.0,0.165052,0
1407,30.844078,27.110196,16.0,0.399051,1
3376,38.616784,30.814387,8.0,0.169269,2
2700,44.655532,35.990456,12.0,0.281653,2
533,35.209095,20.139397,24.0,0.322284,0


## Testing and Training Data
Use 70% of the generated data for training and the remaining 30% for testing.

In [13]:
# Row position where the training portion ends: the first 70% of the shuffled rows
split = int(0.7 * len(df_all_breeds))

# Rows before the cut are used for training, the remaining rows for testing
df_train = df_all_breeds.iloc[:split].reset_index(drop=True)
df_test = df_all_breeds.iloc[split:].reset_index(drop=True)

## Computing parameters out of the training data
1. Compute the estimated parameters of each feature for every breed. The breeds are encoded as integers.
2. Compute the proportion of data belonging to each breed in the training dataset.

In [14]:
def compute_training_params(df, features):
    """
    Computes the estimated parameters for training a model based on the provided dataframe and features.

    Args:
        df (pandas.DataFrame): The dataframe containing the training data.
        features (list): A list of feature names to consider.

    Returns:
        tuple: A tuple containing two dictionaries:
            - params_dict (dict): A dictionary that contains the estimated parameters for each breed and feature.
            - probs_dict (dict): A dictionary that contains the proportion of data belonging to each breed.
    """
    
    # Dict that should contain the estimated parameters
    params_dict = {}
    
    # Dict that should contain the proportion of data belonging to each class
    probs_dict = {}
        
    # Loop over the breeds
    for breed in df["breed"]:
        
        # Slice the original df to only include data for the current breed and the feature columns
        df_breed = df[df["breed"] == breed][features]
        
        # Save the probability of each class (breed) in the probabilities dict
        probs_dict[breed] = round(len(df_breed) / len(df), 3)
        
        # Initialize the inner dict
        inner_dict = {}
        
        # Loop over the columns of the sliced dataframe
        for feature in df_breed.columns:
            match feature:
                case "height" | "weight": 
                    mu = df_breed[feature].mean()
                    sigma = df_breed[feature].std()
                    params = params_gaussian(mu, sigma)
                    
                case "bark_days":
                    n = int(df_breed[feature].max())
                    p = df_breed[feature].mean() / n
                    params = params_binomial(n, p)

                case "ear_head_ratio":
                    a = df_breed[feature].min()
                    b = df_breed[feature].max()
                    params = params_uniform(a, b)
            
            # Save the dataclass object within the inner dict
            inner_dict[feature] = params
        
        # Save inner dict within outer dict
        params_dict[breed] = inner_dict
    
    return params_dict, probs_dict

In [15]:
# Estimate the per-breed feature parameters and the class proportions from the training split
train_params, train_class_probs = compute_training_params(df_train, FEATURES)

# Display both dictionaries
print("Distribution parameters for training split:\n")
pp.pprint(train_params)
print("\nProbability of each class for training split:\n")
pp.pprint(train_class_probs)

Distribution parameters for training split:

{0: {'bark_days': params_binomial(n=30.000, p=0.801),
     'ear_head_ratio': params_uniform(a=0.100, b=0.597),
     'height': params_gaussian(mu=35.030, sigma=1.519),
     'weight': params_gaussian(mu=20.020, sigma=1.013)},
 1: {'bark_days': params_binomial(n=24.000, p=0.622),
     'ear_head_ratio': params_uniform(a=0.201, b=0.500),
     'height': params_gaussian(mu=29.971, sigma=2.011),
     'weight': params_gaussian(mu=24.927, sigma=5.028)},
 2: {'bark_days': params_binomial(n=18.000, p=0.493),
     'ear_head_ratio': params_uniform(a=0.101, b=0.300),
     'height': params_gaussian(mu=39.814, sigma=3.575),
     'weight': params_gaussian(mu=31.841, sigma=3.064)}}

Probability of each class for training split:

{0: 0.346, 1: 0.393, 2: 0.26}


## Compute the probability of X given the breed - $P(x \mid C_i)$

To code a Naive Bayes classifier, you will assume **class-conditional independence** for a given $\boldsymbol x = (x_1, \ldots, x_n)$ in $\boldsymbol X$. With this assumption, you can compute the probability of $x$ given the class using the following expression:

$$\mathbf P(\boldsymbol x \mid C_{i}) = \mathbf P(x_1 \mid C_i) \cdot \mathbf P(x_2 \mid C_i) \cdot \ldots \cdot \mathbf P(x_n \mid C_i) = \prod_{k = 1}^{n} \mathbf P(x_k \mid C_i).$$

In [16]:
# Instance from the local `probability` module; prob_of_X_given_C calls its pdf_* methods
dist = Dist()

In [17]:
def prob_of_X_given_C(X, features, breed, params_dict):
    """
    Calculate the conditional probability of X given a specific breed, using the given features and parameters.

    Args:
        X (list): List of feature values for which the probability needs to be calculated.
        features (list): List of feature names corresponding to the feature values in X.
        breed (int): The breed for which the probability is calculated.
        params_dict (dict): Dictionary containing the parameters for different breeds and features.

    Returns:
        float: The conditional probability of X given the specified breed.
    """
    
    if len(X) != len(features):
        print("X and list of features should have the same length")
        return 0
    
    probability = 1.0
    
    for x, feature in zip(X, features):
        
        # Get the relevant parameters from params_dict 
        params = params_dict[breed][feature]

        match feature:
            case "height" | "weight": 
                # Compute the relevant pdf given the distribution and the estimated parameters
                probability_f = dist.pdf_gaussian(x, params.mu, params.sigma)
                
            case "bark_days": 
                # Compute the relevant pdf given the distribution and the estimated parameters
                probability_f = dist.pdf_binomial(x, params.n, params.p)

            case "ear_head_ratio": 
                # Compute the relevant pdf given the distribution and the estimated parameters
                probability_f = dist.pdf_uniform(x, params.a, params.b)
        
        # Multiply by probability of current feature
        probability *= probability_f
    
    return probability

In [18]:
# Use the first row of the test split as a worked example
example_dog = df_test[FEATURES].loc[0]
example_breed = df_test.loc[0, "breed"]
print(f"Example dog has breed {example_breed} and features: height = {example_dog['height']:.2f}, weight = {example_dog['weight']:.2f}, bark_days = {example_dog['bark_days']:.2f}, ear_head_ratio = {example_dog['ear_head_ratio']:.2f}\n")

# Probability of the example's features under the estimated parameters of each breed
for candidate_breed in (0, 1, 2):
    likelihood = prob_of_X_given_C(list(example_dog), FEATURES, candidate_breed, train_params)
    print(f"Probability of these features if dog is classified as breed {candidate_breed}: {likelihood}")

Example dog has breed 1 and features: height = 28.63, weight = 21.56, bark_days = 13.00, ear_head_ratio = 0.27

Probability of these features if dog is classified as breed 0: 7.065771582111443e-11
Probability of these features if dog is classified as breed 1: 0.003942085319420158
Probability of these features if dog is classified as breed 2: 5.770264459662163e-08


## Predict the breed

In [19]:
def predict_breed(X, features, params_dict, probs_dict):
    """
    Predicts the breed whose (unnormalised) posterior is largest for the given input.

    For every breed in `probs_dict`, the probability of X given that breed is multiplied
    by the breed's prior probability; the breed with the largest product is returned.

    Args:
        X (array-like): The feature values of the sample to classify.
        features (array-like): The feature names corresponding to the values in X.
        params_dict (dict): A dictionary containing parameters for different breeds,
            indexed as `params_dict[breed][feature]`.
        probs_dict (dict): A dictionary mapping each breed to its prior probability.
            Its keys define the set of candidate breeds.

    Returns:
        int: The predicted breed label (a key of `probs_dict`).
    """

    # Candidate labels in ascending order, so that ties go to the smallest label
    # (np.argmax returns the first maximum)
    breeds = sorted(probs_dict)

    # Probability of X given the breed times the prior, for every candidate breed
    posteriors = [
        prob_of_X_given_C(X, features, breed, params_dict) * probs_dict[breed]
        for breed in breeds
    ]

    # Map the winning position back to its label instead of assuming labels are 0..k-1
    prediction = breeds[int(np.argmax(posteriors))]

    return prediction

In [20]:
# Classify the example dog and compare the prediction with its true breed
example_pred = predict_breed([*example_dog], FEATURES, train_params, train_class_probs)
print(f"Example dog has breed {example_breed} and Naive Bayes classified it as {example_pred}")

Example dog has breed 1 and Naive Bayes classified it as 1


### Test the classifier on the test data

In [21]:
# Classify every row of the test split, then compare the predictions with the true labels
preds = df_test.apply(
    lambda row: predict_breed(list(row[FEATURES]), FEATURES, train_params, train_class_probs),
    axis=1,
)
test_acc = accuracy_score(df_test["breed"], preds)
print(f"Accuracy score for the test split: {test_acc:.2f}")

Accuracy score for the test split: 1.00


The Naive Bayes classifier achieved an accuracy of 100% on the test data. Nice job! 

You might think that something is wrong when reaching such a high accuracy, but in this case it makes sense: the data is generated, and you know the true distribution of each feature. Real-life data won't have this nice behavior!

For instance, look at the `Email spam detector` example.