In [71]:
# Upgrade pip, then install ucimlrepo, the package the next cell imports
# fetch_ucirepo from.
# NOTE(review): neither install is version-pinned — confirm whether a pin is
# wanted so a fresh-kernel run resolves the same package versions.
%pip install --upgrade pip
%pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [72]:
from ucimlrepo import fetch_ucirepo 
from pprint import pp
import pandas

# Download the dataset (UCI repository id 73) and count rows per 'poisonous' label;
# edibilityCount is reused by later cells as the coverage target.
mushroom = fetch_ucirepo(id=73)
edibilityCount = mushroom.data.original.groupby('poisonous').size()

# Separator printed under every section title.
sectionBanner = '******************************************************************'

# Column names of the three data frames exposed by the fetched dataset.
columnSections = [
    ('original', mushroom.data.original),
    ('targets', mushroom.data.targets),
    ('features', mushroom.data.features),
]
for sectionTitle, sectionFrame in columnSections:
    print(sectionTitle, sectionBanner, sep='\n')
    print(sectionFrame.columns.values)
    print('\n\n')

# Dataset-level metadata, pretty-printed.
print('metadata', sectionBanner, sep='\n')
pp(mushroom.metadata)
print('\n\n')

# Per-variable description table; last section, so no trailing spacer.
print('variables', sectionBanner, sep='\n')
pp(mushroom.variables)

original
******************************************************************
['cap-shape' 'cap-surface' 'cap-color' 'bruises' 'odor' 'gill-attachment'
 'gill-spacing' 'gill-size' 'gill-color' 'stalk-shape' 'stalk-root'
 'stalk-surface-above-ring' 'stalk-surface-below-ring'
 'stalk-color-above-ring' 'stalk-color-below-ring' 'veil-type'
 'veil-color' 'ring-number' 'ring-type' 'spore-print-color' 'population'
 'habitat' 'poisonous']



targets
******************************************************************
['poisonous']



features
******************************************************************
['cap-shape' 'cap-surface' 'cap-color' 'bruises' 'odor' 'gill-attachment'
 'gill-spacing' 'gill-size' 'gill-color' 'stalk-shape' 'stalk-root'
 'stalk-surface-above-ring' 'stalk-surface-below-ring'
 'stalk-color-above-ring' 'stalk-color-below-ring' 'veil-type'
 'veil-color' 'ring-number' 'ring-type' 'spore-print-color' 'population'
 'habitat']



metadata
***************************************

In [73]:
# Names of all feature columns, in dataset order.
features = mushroom.data.features.columns.values
featureCount = len(features)

# For every feature, the distinct values observed in the data (in order of first appearance).
featureValues = {
    name: mushroom.data.features[name].unique()
    for name in features
}

# Every unordered pair of distinct features, as [earlier, later] in column order.
featurePairs = [
    [features[first], features[second]]
    for first in range(featureCount - 1)
    for second in range(first + 1, featureCount)
]

print('feature values')
print('***************************************')
pp(featureValues)
print('\n\n')

print('feature pairs: ', len(featurePairs))
print('********************************************')
pp(featurePairs)

feature values
***************************************
{'cap-shape': array(['x', 'b', 's', 'f', 'k', 'c'], dtype=object),
 'cap-surface': array(['s', 'y', 'f', 'g'], dtype=object),
 'cap-color': array(['n', 'y', 'w', 'g', 'e', 'p', 'b', 'u', 'c', 'r'], dtype=object),
 'bruises': array(['t', 'f'], dtype=object),
 'odor': array(['p', 'a', 'l', 'n', 'f', 'c', 'y', 's', 'm'], dtype=object),
 'gill-attachment': array(['f', 'a'], dtype=object),
 'gill-spacing': array(['c', 'w'], dtype=object),
 'gill-size': array(['n', 'b'], dtype=object),
 'gill-color': array(['k', 'n', 'g', 'p', 'w', 'h', 'u', 'e', 'b', 'r', 'y', 'o'],
      dtype=object),
 'stalk-shape': array(['e', 't'], dtype=object),
 'stalk-root': array(['e', 'c', 'b', 'r', nan], dtype=object),
 'stalk-surface-above-ring': array(['s', 'f', 'k', 'y'], dtype=object),
 'stalk-surface-below-ring': array(['s', 'f', 'y', 'k'], dtype=object),
 'stalk-color-above-ring': array(['w', 'g', 'p', 'n', 'b', 'e', 'o', 'c', 'y'], dtype=object),
 'sta

In [74]:
def countFeaturePairs(mushroomData, pairs=None, values=None, target='poisonous'):
    """Count edible and poisonous rows for every value combination of every feature pair.

    Parameters
    ----------
    mushroomData : pandas.DataFrame
        Rows to count. Must contain the feature columns named in ``pairs`` and
        the ``target`` column, whose labels are 'e' (edible) and 'p' (poisonous).
    pairs : iterable of (feature1, feature2), optional
        Feature pairs to evaluate. Defaults to the notebook-level ``featurePairs``.
    values : mapping of feature name -> iterable of values, optional
        Candidate values per feature. Defaults to the notebook-level ``featureValues``.
    target : str, optional
        Name of the label column. Defaults to 'poisonous'.

    Returns
    -------
    pandas.DataFrame
        One row per (pair, value1, value2) combination with the columns
        feature1_name, feature1_value, feature2_name, feature2_value,
        edible_count and poisonous_count. Combinations that do not occur in
        ``mushroomData`` get zero counts. The columns are present even when
        there are no pairs to evaluate.
    """
    # Fall back to the notebook-level tables so existing one-argument calls keep working.
    if pairs is None:
        pairs = featurePairs
    if values is None:
        values = featureValues

    columns = [
        'feature1_name',
        'feature1_value',
        'feature2_name',
        'feature2_value',
        'edible_count',
        'poisonous_count',
    ]
    result = list()

    for feature1, feature2 in pairs:
        # Group once per pair and flatten to {(value1, value2, label): row count}
        # so each combination below is a single dict lookup instead of repeated
        # slicing of the grouped Series. groupby leaves out NaN keys by default,
        # so a NaN candidate value is simply never found and counts as zero.
        counts = mushroomData.groupby([feature1, feature2, target]).size().to_dict()

        for value1 in values[feature1]:
            for value2 in values[feature2]:
                result.append({
                    'feature1_name': feature1,
                    'feature1_value': value1,
                    'feature2_name': feature2,
                    'feature2_value': value2,
                    'edible_count': counts.get((value1, value2, 'e'), 0),
                    'poisonous_count': counts.get((value1, value2, 'p'), 0)
                })

    # Passing the column list keeps the schema stable even when result is empty.
    return pandas.DataFrame(result, columns=columns)

In [75]:
# Greedy cover of the edible mushrooms: repeatedly take the feature-value pair
# that matches the most remaining edible rows and no poisonous rows, record it,
# then remove the rows it matches before searching again.
mushroomData = mushroom.data.original
pairColumns = [
    'feature1_name',
    'feature1_value',
    'feature2_name',
    'feature2_value',
    'edible_count',
    'poisonous_count']

# Selected pairs are collected as plain records and turned into a DataFrame
# once, instead of enlarging a DataFrame one row at a time.
mostEdibleRows = list()
mostEdibleSum = 0

while mostEdibleSum < edibilityCount['e']:
    edibleFeaturePairsSorted = (countFeaturePairs(mushroomData)
        .query('edible_count > 0 and poisonous_count == 0')
        .sort_values(by=['edible_count'], ascending=False))
    if edibleFeaturePairsSorted.empty:
        # No remaining pair is purely edible, so the cover cannot be completed;
        # stop here instead of failing on iloc[0] below.
        print('no purely edible feature pair left; stopping early')
        break
    nextMostEdible = edibleFeaturePairsSorted.iloc[0]
    mostEdibleRows.append(nextMostEdible.to_dict())
    mostEdibleSum += nextMostEdible['edible_count']
    condition1 = mushroomData[nextMostEdible['feature1_name']] == nextMostEdible['feature1_value']
    condition2 = mushroomData[nextMostEdible['feature2_name']] == nextMostEdible['feature2_value']
    # Keep only rows not matched by the chosen pair. A boolean mask removes
    # exactly the matched rows even if index labels are not unique.
    mushroomData = mushroomData[~(condition1 & condition2)]

mostEdibleFeaturePairs = pandas.DataFrame(mostEdibleRows, columns=pairColumns)

display(mostEdibleFeaturePairs)
print('number of edible mushrooms', edibilityCount['e'])
print('sum of most edible features', mostEdibleFeaturePairs['edible_count'].sum())


Unnamed: 0,feature1_name,feature1_value,feature2_name,feature2_value,edible_count,poisonous_count
0,odor,n,stalk-shape,t,2496,0
1,ring-number,t,spore-print-color,w,528,0
2,stalk-root,c,stalk-surface-below-ring,s,512,0
3,stalk-shape,e,stalk-color-above-ring,o,192,0
4,stalk-root,r,ring-type,p,192,0
5,bruises,t,habitat,d,96,0
6,odor,n,stalk-root,e,96,0
7,odor,n,stalk-surface-below-ring,f,72,0
8,stalk-color-below-ring,n,habitat,l,24,0


number of edible mushrooms 4208
sum of most edible features 4208


In [76]:
# Greedy cover of the poisonous mushrooms: repeatedly take the feature-value
# pair that matches the most remaining poisonous rows and no edible rows,
# record it, then remove the rows it matches before searching again.
mushroomData = mushroom.data.original
pairColumns = [
    'feature1_name',
    'feature1_value',
    'feature2_name',
    'feature2_value',
    'edible_count',
    'poisonous_count']

# Selected pairs are collected as plain records and turned into a DataFrame
# once, instead of enlarging a DataFrame one row at a time.
mostPoisonousRows = list()
mostPoisonousSum = 0

while mostPoisonousSum < edibilityCount['p']:
    poisonousFeaturePairsSorted = (countFeaturePairs(mushroomData)
        .query('edible_count == 0 and poisonous_count > 0')
        .sort_values(by=['poisonous_count'], ascending=False))
    if poisonousFeaturePairsSorted.empty:
        # No remaining pair is purely poisonous, so the cover cannot be
        # completed; stop here instead of failing on iloc[0] below.
        print('no purely poisonous feature pair left; stopping early')
        break
    nextMostPoisonous = poisonousFeaturePairsSorted.iloc[0]
    mostPoisonousRows.append(nextMostPoisonous.to_dict())
    mostPoisonousSum += nextMostPoisonous['poisonous_count']
    condition1 = mushroomData[nextMostPoisonous['feature1_name']] == nextMostPoisonous['feature1_value']
    condition2 = mushroomData[nextMostPoisonous['feature2_name']] == nextMostPoisonous['feature2_value']
    # Keep only rows not matched by the chosen pair. A boolean mask removes
    # exactly the matched rows even if index labels are not unique.
    mushroomData = mushroomData[~(condition1 & condition2)]

mostPoisonousFeaturePairs = pandas.DataFrame(mostPoisonousRows, columns=pairColumns)

display(mostPoisonousFeaturePairs)
print('number of poisonous mushrooms', edibilityCount['p'])
print('sum of most poisonous features', mostPoisonousFeaturePairs['poisonous_count'].sum())

poisonous
e    4208
p    3916
dtype: int64


Unnamed: 0,feature1_name,feature1_value,feature2_name,feature2_value,edible_count,poisonous_count
0,gill-spacing,c,stalk-surface-above-ring,k,0,2228
1,gill-attachment,f,gill-color,b,0,864
2,odor,f,veil-type,p,0,288
3,odor,p,stalk-root,e,0,256
4,odor,c,stalk-color-below-ring,w,0,192
5,gill-attachment,f,spore-print-color,r,0,72
6,gill-size,n,population,c,0,16


number of poisonous mushrooms 3916
sum of most poisonous features 3916
