In [33]:
import seaborn as sns
import pandas as pd
import numpy as np

# get data
peng = sns.load_dataset('penguins', cache=True, data_home=None)
print(peng.shape)

# clean data to make continous variables categorical
cont_cols = list( peng.select_dtypes('float64').columns )

# number of levels
levels = 3

for col in cont_cols:
    peng[col] = pd.cut(  peng[col], levels )

(344, 7)


In [34]:
def joint_probs(DF, index, cols ):
    all_cols = index + cols
    N = DF.shape[0]
    
    joint_counts = pd.pivot_table( DF[all_cols] , index = index , columns= cols , aggfunc= 'size' ).replace(np.nan,0)
    
    joint_prob = np.round( joint_counts / N, 3)
    
    return joint_prob

In [35]:
JP = joint_probs(peng, ['bill_length_mm'], ['sex'] )
print(JP,'\n')

sex               Female   Male
bill_length_mm                 
(32.072, 41.267]   0.215  0.142
(41.267, 50.433]   0.253  0.230
(50.433, 59.6]     0.012  0.116 



In [36]:
def cond_prob_dist(joint_probs):
    # P(A | B) = P( A and B ) / P(B)
    ## https://en.wikipedia.org/wiki/Conditional_probability
    
    """
    calculates the conditions prob. distribution where:
    joint_probs: is a joint prob distribution as pandas dataframe
    A = {index   of joint_probs} = {a1, a2, .. an }
    B = {columns of joint_probs} = {b1, b2, .. bn }
    
    
    returns:
    CPD = the conditional probability dist P(A|B) as a pandas dataframe
    """
    
    CPD = joint_probs.copy()

    # column sum
    col_totals = joint_probs.sum(axis=0)
    
    for col in col_totals.index:
        CPD[col] =   CPD[col] / col_totals.loc[col]
        
    # rename columns
    CPD.columns = [ f'b{i+1} = {x}' for i,x in enumerate(CPD.columns) ]
    CPD.index   = [ f'a{i+1} = {x}' for i,x in enumerate(CPD.index) ]
        
    return CPD.round(3)

In [38]:

cd=cond_prob_dist(JP) 
print( cd , '\n')

                       b1 = Female  b2 = Male
a1 = (32.072, 41.267]        0.448      0.291
a2 = (41.267, 50.433]        0.527      0.471
a3 = (50.433, 59.6]          0.025      0.238 



In [52]:
avg=np.mean(cd.iloc[:,1])