In [6]:
# Install/upgrade the packages this notebook needs via the %pip magic.
# NOTE(review): neither install is version-pinned, so a re-run may pull a
# different ucimlrepo release than the one these outputs were produced with.
%pip install --upgrade pip
%pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [16]:
from ucimlrepo import fetch_ucirepo 
from pprint import pp
import pandas

# Show every row when a DataFrame is displayed (a kernel-wide pandas option).
pandas.set_option('display.max_rows', None)

# Fetch dataset id 73 from the UCI ML repository.
mushroom = fetch_ucirepo(id=73)

# Row count per class label in the 'poisonous' column; later cells index it
# with 'e' and 'p'.
edibilityCount = mushroom.data.original.groupby('poisonous').size()

featureNames = mushroom.data.features.columns.values
featureCount = len(featureNames)

# feature name -> array of the distinct values observed for that feature
featureValues = {
    name: mushroom.data.features[name].unique()
    for name in featureNames
}

# Every unordered pair of distinct features, in column order:
# [featureNames[a], featureNames[b]] for all a < b.
featurePairs = [
    [featureNames[first], featureNames[second]]
    for first in range(featureCount - 1)
    for second in range(first + 1, featureCount)
]

print('features')
print('********************************************')
print(mushroom.metadata['additional_info']['variable_info'])

print()

print('feature pairs')
print('********************************************')
print('\n'.join(
    f'{number}. {left}, {right}'
    for number, (left, right) in enumerate(featurePairs, start=1)
))

features
********************************************
     1. cap-shape:                bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
     2. cap-surface:              fibrous=f,grooves=g,scaly=y,smooth=s
     3. cap-color:                brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y
     4. bruises?:                 bruises=t,no=f
     5. odor:                     almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s
     6. gill-attachment:          attached=a,descending=d,free=f,notched=n
     7. gill-spacing:             close=c,crowded=w,distant=d
     8. gill-size:                broad=b,narrow=n
     9. gill-color:               black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y
    10. stalk-shape:              enlarging=e,tapering=t
    11. stalk-root:               bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,missing=?
    12. stalk-surface-above-ring: f

In [8]:
def countFeaturePairs(mushroomData, pairs=None, valuesByFeature=None):
    """Tally edible/poisonous rows for every value combination of every feature pair.

    Parameters
    ----------
    mushroomData : pandas.DataFrame
        Rows to tally. Must contain a 'poisonous' column (labels 'e' / 'p')
        and every feature column named in `pairs`.
    pairs : iterable of (feature1, feature2), optional
        Feature pairs to tally. Defaults to the notebook-level `featurePairs`.
    valuesByFeature : mapping of feature name -> iterable of values, optional
        Values to enumerate per feature. Defaults to the notebook-level
        `featureValues`.

    Returns
    -------
    pandas.DataFrame
        One row per (pair, feature1 value, feature2 value) with columns
        feature1_name, feature1_value, feature2_name, feature2_value,
        edible_count, poisonous_count. Combinations that never occur in
        `mushroomData` are still listed, with both counts equal to 0.
    """
    if pairs is None:
        pairs = featurePairs
    if valuesByFeature is None:
        valuesByFeature = featureValues

    columns = [
        'feature1_name',
        'feature1_value',
        'feature2_name',
        'feature2_value',
        'edible_count',
        'poisonous_count',
    ]
    records = []

    for feature1, feature2 in pairs:
        # groupby(...).size() is a Series indexed by (value1, value2, label).
        # As a dict it is keyed by those tuples, e.g. for cap-shape/cap-surface:
        #     {('x', 'y', 'e'): 52, ('x', 'y', 'p'): 378, ...}
        # Combinations with no rows are simply absent, hence .get(..., 0).
        counts = (mushroomData
            .groupby([feature1, feature2, 'poisonous'])
            .size()
            .to_dict())

        for value1 in valuesByFeature[feature1]:
            for value2 in valuesByFeature[feature2]:
                records.append({
                    'feature1_name': feature1,
                    'feature1_value': value1,
                    'feature2_name': feature2,
                    'feature2_value': value2,
                    'edible_count': counts.get((value1, value2, 'e'), 0),
                    'poisonous_count': counts.get((value1, value2, 'p'), 0)
                })

    # Passing the columns explicitly keeps the schema even when there are no records.
    return pandas.DataFrame(records, columns=columns)


def findMinimalSetOfIdentifyingFeaturePairs(edibility):
    """Greedily collect feature-value pairs that occur in only one class.

    Each round tallies the remaining rows with countFeaturePairs, keeps the
    pairs that match at least one row of the requested class and no row of the
    other class, picks the one covering the most rows, records it, and removes
    the rows it matches. The search stops once the recorded counts reach
    edibilityCount[edibility], or when no single-class pair is left.

    Because each round just takes the largest remaining pair, the result is a
    greedy cover: it is not guaranteed to be the smallest possible set.

    Parameters
    ----------
    edibility : str
        'e' to look for edible-only pairs, 'p' for poisonous-only pairs.

    Returns
    -------
    pandas.DataFrame
        One row per selected pair, in selection order, with columns
        feature1_name, feature1_value, feature2_name, feature2_value,
        edible_count, poisonous_count (counts are those seen in the round the
        pair was selected). If the search ran out of single-class pairs, the
        counts sum to less than edibilityCount[edibility].

    Raises
    ------
    ValueError
        If `edibility` is neither 'e' nor 'p'.
    """
    if edibility not in ('e', 'p'):
        raise ValueError("edibility must be 'e' or 'p'")

    columns = [
        'feature1_name',
        'feature1_value',
        'feature2_name',
        'feature2_value',
        'edible_count',
        'poisonous_count',
    ]

    desiredCount = 'edible_count' if edibility == 'e' else 'poisonous_count'
    undesiredCount = 'poisonous_count' if edibility == 'e' else 'edible_count'

    remaining = mushroom.data.original
    selected = []
    totalAccountedFor = 0

    while True:
        candidates = (countFeaturePairs(remaining)
            .query(f'{desiredCount} > 0 and {undesiredCount} == 0')
            .sort_values(by=[desiredCount], ascending=False))
        if candidates.empty:
            # No pair separates the classes any more; return what was found
            # instead of failing on .iloc[0].
            break

        topFeaturePair = candidates.iloc[0]
        selected.append(topFeaturePair.to_dict())
        totalAccountedFor += topFeaturePair[desiredCount]

        # Remove the rows this pair already explains so the next round only
        # scores rows that are still unaccounted for.
        matchesFeature1 = remaining[topFeaturePair['feature1_name']] == topFeaturePair['feature1_value']
        matchesFeature2 = remaining[topFeaturePair['feature2_name']] == topFeaturePair['feature2_value']
        remaining = remaining[~(matchesFeature1 & matchesFeature2)]

        if totalAccountedFor >= edibilityCount[edibility]:
            break

    # Build the frame once from the collected records so the count columns
    # keep an integer dtype.
    return pandas.DataFrame(selected, columns=columns)


def _reportMinimalSet(heading, expectedLabel, actualLabel, expectedTotal, countColumn, pairs):
    """Print a heading, the expected and actually covered totals, then display the table."""
    print(heading)
    print('**************************************************************')
    print(expectedLabel, expectedTotal)
    print(actualLabel, pairs[countColumn].sum())
    display(pairs)


# Run both searches first, then report them in the same order.
edibleOnly = findMinimalSetOfIdentifyingFeaturePairs('e')
poisonousOnly = findMinimalSetOfIdentifyingFeaturePairs('p')

_reportMinimalSet(
    'Edible-only feature pairs (minimal set)',
    'number of edible mushrooms expected to account for',
    'number of edible mushrooms actually accounted for',
    edibilityCount['e'],
    'edible_count',
    edibleOnly,
)

print()

_reportMinimalSet(
    'Poisonous-only feature pairs (minimal set)',
    'number of poisonous mushrooms expected to account for',
    'number of poisonous mushrooms actually accounted for',
    edibilityCount['p'],
    'poisonous_count',
    poisonousOnly,
)

Edible-only feature pairs (minimal set)
**************************************************************
number of edible mushrooms expected to account for 4208
number of edible mushrooms actually accounted for 4208


Unnamed: 0,feature1_name,feature1_value,feature2_name,feature2_value,edible_count,poisonous_count
0,odor,n,stalk-shape,t,2496,0
1,ring-number,t,spore-print-color,w,528,0
2,stalk-root,c,stalk-surface-below-ring,s,512,0
3,stalk-shape,e,stalk-color-above-ring,o,192,0
4,stalk-root,r,ring-type,p,192,0
5,bruises,t,habitat,d,96,0
6,odor,n,stalk-root,e,96,0
7,odor,n,stalk-surface-below-ring,f,72,0
8,stalk-color-below-ring,n,habitat,l,24,0



Poisonous-only feature pairs (minimal set)
**************************************************************
number of poisonous mushrooms expected to account for 3916
number of poisonous mushrooms actually accounted for 3916


Unnamed: 0,feature1_name,feature1_value,feature2_name,feature2_value,edible_count,poisonous_count
0,gill-spacing,c,stalk-surface-above-ring,k,0,2228
1,gill-attachment,f,gill-color,b,0,864
2,odor,f,veil-type,p,0,288
3,odor,p,stalk-root,e,0,256
4,odor,c,stalk-color-below-ring,w,0,192
5,gill-attachment,f,spore-print-color,r,0,72
6,gill-size,n,population,c,0,16
