<a href="https://colab.research.google.com/github/bringol/IA_TP_GRUPO_2/blob/main/Dataset_no_me_comas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset: Yellow Submarine
----

Cada fila representa un hongo, cada columna contiene atributos de este.

La columna *classes* indica si el hongo es venenoso o no.

hongos | atributos
:----:   | :----:
8124     | 23


# Variables

Nombre   |  Posibles valores
:------- |  :---
cap-shape   | (bell, conical, convex, flat, knobbed, sunken)
cap-surface | (fibrous, grooves, scaly, smooth)
cap-color | (brown, buff, cinnamon, gray, green, pink, purple, red, white, yellow)
bruises | (bruises, no)
odor | (almond, anise, creosote, fishy, foul, musty, none, pungent, spicy)
gill-attachment | (attached, descending, free, notched)
gill-spacing | (close, crowded, distant)
gill-size | (broad, narrow)
gill-color | (black, brown, buff, chocolate, gray, green, orange, pink, purple, red, white, yellow)
stalk-shape | (enlarging, tapering)
stalk-root | (bulbous, club, cup, equal, rhizomorphs, rooted, missing)
stalk-surface-above-ring | (fibrous, scaly, silky, smooth)
stalk-surface-below-ring | (fibrous, scaly, silky, smooth)
stalk-color-above-ring | (brown, buff, cinnamon, gray, orange, pink, red, white, yellow)
stalk-color-below-ring | (brown, buff, cinnamon, gray, orange, pink, red, white, yellow)
veil-type | (partial, universal)
veil-color | (brown, orange, white, yellow)
ring-number | (none, one, two)
ring-type | (cobwebby, evanescent, flaring, large, none, pendant, sheathing, zone)
spore-print-color | (black, brown, buff, chocolate, green, orange, purple, white, yellow)
population | (abundant, clustered, numerous, scattered ,several, solitary)
habitat | (grasses, leaves, meadows, paths, urban, waste, woods)
classes   |  (edible, poisonous)

#Cambiar "poisson" por "poisonous"

In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Cargar los datos
mushroom_data = pd.read_csv('Yellow_Submarine.csv')

In [7]:
# Verificar valores faltantes
missing_values = mushroom_data.isnull().sum()
print("Valores faltantes en cada columna:")
print(missing_values)

Valores faltantes en cada columna:
class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64


In [8]:
# Convertir todas las categorías a minúsculas
for col in mushroom_data.select_dtypes(include=['object']).columns:
    mushroom_data[col] = mushroom_data[col].str.lower()

In [9]:
# Original
print(mushroom_data['class'].value_counts())

# Cambiar el nombre 'poisson' a 'poisonous' en la columna 'class'
mushroom_data['class'] = mushroom_data['class'].replace({'poisson': 'poisonous'})

# Verificar que el cambio se haya realizado
print(mushroom_data['class'].value_counts())

class
edible     4208
poisson    3916
Name: count, dtype: int64
class
edible       4208
poisonous    3916
Name: count, dtype: int64


In [10]:
# Lista de columnas a verificar
columns_to_check = [
    'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-color', 'ring-number',
    'ring-type', 'spore-print-color', 'population',
    'habitat'
]

# Verificar la distribución de "edible" y "poisonous" para cada categoría en cada columna
for column in columns_to_check:
    print(f"Distribución de 'class' para cada valor en '{column}':")
    distribution = mushroom_data.groupby([column, 'class']).size().unstack(fill_value=0)
    print(distribution)
    print("\n")

Distribución de 'class' para cada valor en 'cap-shape':
class      edible  poisonous
cap-shape                   
bell          404         48
conical         0          4
convex       1948       1708
flat         1596       1556
knobbed       228        600
sunken         32          0


Distribución de 'class' para cada valor en 'cap-surface':
class        edible  poisonous
cap-surface                   
fibrous        1560        760
grooves           0          4
scaly          1504       1740
smooth         1144       1412


Distribución de 'class' para cada valor en 'cap-color':
class      edible  poisonous
cap-color                   
brown        1264       1020
buff           48        120
cinnamon       32         12
gray         1032        808
green          16          0
pink           56         88
purple         16          0
red           624        876
white         720        320
yellow        400        672


Distribución de 'class' para cada valor en 'bruises':
clas

In [11]:
# Función para consolidar categorías escasas
def consolidate_categories(df, column, min_count=50):
    value_counts = df[column].value_counts()
    to_consolidate = value_counts[value_counts < min_count].index
    df[column] = df[column].apply(lambda x: 'other' if x in to_consolidate else x)
    return df

# Especificar las columnas a consolidar
columns_to_consolidate = [
    'cap-shape', 'cap-surface', 'cap-color', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-color', 'ring-type', 'spore-print-color'
]

# Guardar la distribución antes de la consolidación
before_consolidation = {}
for column in columns_to_consolidate:
    before_consolidation[column] = mushroom_data[column].value_counts()

# Aplicar la consolidación a las columnas específicas
for column in columns_to_consolidate:
    mushroom_data = consolidate_categories(mushroom_data, column, min_count=50)

# Guardar la distribución después de la consolidación
after_consolidation = {}
for column in columns_to_consolidate:
    after_consolidation[column] = mushroom_data[column].value_counts()

# Mostrar antes y después de la consolidación
consolidation_comparison = {}
for column in columns_to_consolidate:
    consolidation_comparison[column] = pd.DataFrame({
        'Antes de consolidación': before_consolidation[column],
        'Después de consolidación': after_consolidation[column]
    }).fillna(0)

# Ejemplo de resultado para una columna
example_column = 'cap-shape'
print(consolidation_comparison[example_column])

           Antes de consolidación  Después de consolidación
cap-shape                                                  
bell                        452.0                     452.0
conical                       4.0                       0.0
convex                     3656.0                    3656.0
flat                       3152.0                    3152.0
knobbed                     828.0                     828.0
other                         0.0                      36.0
sunken                       32.0                       0.0


In [12]:
columns_to_check = [
    'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-type', 'veil-color',
    'ring-number', 'ring-type', 'spore-print-color', 'population',
    'habitat', 'class'
]

# Verificar y mostrar la distribución de valores para cada columna
for column in columns_to_check:
    print(f"Distribución de valores para '{column}':")
    print(mushroom_data[column].value_counts())
    print("\n")

Distribución de valores para 'cap-shape':
cap-shape
convex     3656
flat       3152
knobbed     828
bell        452
other        36
Name: count, dtype: int64


Distribución de valores para 'cap-surface':
cap-surface
scaly      3244
smooth     2556
fibrous    2320
other         4
Name: count, dtype: int64


Distribución de valores para 'cap-color':
cap-color
brown     2284
gray      1840
red       1500
yellow    1072
white     1040
buff       168
pink       144
other       76
Name: count, dtype: int64


Distribución de valores para 'bruises':
bruises
no         4748
bruises    3376
Name: count, dtype: int64


Distribución de valores para 'odor':
odor
none        3528
foul        2160
fishy        576
spicy        576
almond       400
anise        400
pungent      256
creosote     192
musty         36
Name: count, dtype: int64


Distribución de valores para 'gill-attachment':
gill-attachment
free        7914
attached     210
Name: count, dtype: int64


Distribución de valores para 'gill-

#Consolidar columnas agrupando categorías en "other"


In [5]:
# Función para consolidar categorías escasas
def consolidate_categories(df, column, categories_to_consolidate):
    df[column] = df[column].apply(lambda x: 'other' if x in categories_to_consolidate else x)
    return df

# Especificar las categorías a consolidar en cada columna
categories_to_consolidate = {
    'cap-shape': ['conical', 'sunken'],
    'cap-surface': ['grooves'],
    'cap-color': ['cinnamon', 'purple', 'green'],
    'gill-color': ['red', 'yellow', 'orange', 'green'],
    'stalk-root': ['rooted'],
    'stalk-surface-above-ring': ['scaly'],
    'stalk-surface-below-ring': ['scaly'],
    'stalk-color-above-ring': ['cinnamon', 'yellow'],
    'stalk-color-below-ring': ['cinnamon', 'yellow'],
    'veil-color': ['yellow'],
    'ring-number': ['none'],
    'ring-type': ['flaring', 'none'],
    'spore-print-color': ['green', 'purple', 'orange', 'yellow', 'buff']
}

# Aplicar la consolidación a las columnas específicas
for column, categories in categories_to_consolidate.items():
    mushroom_data = consolidate_categories(mushroom_data, column, categories)

# Eliminar características irrelevantes
mushroom_data.drop(['veil-type'], axis=1, inplace=True)

# Verificar la nueva distribución
for column in mushroom_data.columns:
    print(f"Distribución de valores para '{column}':")
    print(mushroom_data[column].value_counts())
    print("\n")

Distribución de valores para 'class':
class
edible     4208
poisson    3916
Name: count, dtype: int64


Distribución de valores para 'cap-shape':
cap-shape
convex     3656
flat       3152
knobbed     828
bell        452
other        36
Name: count, dtype: int64


Distribución de valores para 'cap-surface':
cap-surface
scaly      3244
smooth     2556
fibrous    2320
other         4
Name: count, dtype: int64


Distribución de valores para 'cap-color':
cap-color
brown     2284
gray      1840
red       1500
yellow    1072
white     1040
buff       168
pink       144
other       76
Name: count, dtype: int64


Distribución de valores para 'bruises':
bruises
no         4748
bruises    3376
Name: count, dtype: int64


Distribución de valores para 'odor':
odor
none        3528
foul        2160
fishy        576
spicy        576
almond       400
anise        400
pungent      256
creosote     192
musty         36
Name: count, dtype: int64


Distribución de valores para 'gill-attachment':
gill-atta

In [6]:
#quiero ver la dist de los veil color respecto a si son venenosos o no
# Filtrar y verificar la distribución de la clase para cada valor de 'veil-color' excepto 'white'
veil_colors_to_check = ['brown', 'orange', 'other']

for veil_color in veil_colors_to_check:
    print(f"Distribución de 'class' para 'veil-color' = {veil_color}:")
    subset = mushroom_data[mushroom_data['veil-color'] == veil_color]
    print(subset['class'].value_counts())
    print("\n")

Distribución de 'class' para 'veil-color' = brown:
class
edible    96
Name: count, dtype: int64


Distribución de 'class' para 'veil-color' = orange:
class
edible    96
Name: count, dtype: int64


Distribución de 'class' para 'veil-color' = other:
class
poisson    8
Name: count, dtype: int64




In [7]:
#quiero ver la dist de los x a si son venenosos o no
# Filtrar y verificar la distribución de la clase para cada valor de 'gill_color' excepto 'white'
gill_colors_to_check = ['red', 'yellow', 'orange', 'green']

for gill_color in gill_colors_to_check:
    print(f"Distribución de 'class' para 'veil-color' = {gill_color}:")
    subset = mushroom_data[mushroom_data['veil-color'] == gill_color]
    print(subset['class'].value_counts())
    print("\n")

Distribución de 'class' para 'veil-color' = red:
Series([], Name: count, dtype: int64)


Distribución de 'class' para 'veil-color' = yellow:
Series([], Name: count, dtype: int64)


Distribución de 'class' para 'veil-color' = orange:
class
edible    96
Name: count, dtype: int64


Distribución de 'class' para 'veil-color' = green:
Series([], Name: count, dtype: int64)




In [4]:
# Check the class distribution
class_distribution = mushroom_data['class'].value_counts()
print(class_distribution)

class
edible     4208
poisson    3916
Name: count, dtype: int64


In [6]:
# Verificar la distribución de valores en la columna "veil-type"
veil_type_distribution = mushroom_data['veil-type'].value_counts()
print(veil_type_distribution)

veil-type
partial    8124
Name: count, dtype: int64


In [7]:
  # Verificar si hay casos en los que "ring-number" tenga "none"
none_ring_number = mushroom_data[mushroom_data['ring-number'] == 'none']

# Mostrar los resultados
print(none_ring_number)

        class cap-shape cap-surface cap-color bruises   odor gill-attachment  \
6415  poisson    convex       scaly       red      no  musty            free   
6668  poisson   knobbed       scaly  cinnamon      no  musty        attached   
6855  poisson   knobbed       scaly     brown      no  musty            free   
6945  poisson      flat       scaly       red      no  musty        attached   
6991  poisson   knobbed       scaly       red      no  musty            free   
7034  poisson   knobbed       scaly  cinnamon      no  musty            free   
7065  poisson    convex       scaly     brown      no  musty        attached   
7091  poisson      flat       scaly     brown      no  musty            free   
7100  poisson    convex       scaly  cinnamon      no  musty        attached   
7111  poisson   knobbed       scaly     brown      no  musty        attached   
7146  poisson      flat       scaly  cinnamon      no  musty            free   
7166  poisson      flat       scaly     

In [3]:
# Function to check unique values in each categorical column
def check_inconsistencies(df):
    inconsistencies = {}
    for column in df.columns:
        unique_values = df[column].unique()
        if df[column].dtype == 'object':
            inconsistencies[column] = unique_values
    return inconsistencies

# Checking for inconsistencies
inconsistencies = check_inconsistencies(mushroom_data)
inconsistencies

{'class': array(['poisson', 'edible'], dtype=object),
 'cap-shape': array(['convex', 'bell', 'sunken', 'flat', 'knobbed', 'conical'],
       dtype=object),
 'cap-surface': array(['smooth', 'scaly', 'fibrous', 'grooves'], dtype=object),
 'cap-color': array(['brown', 'yellow', 'white', 'gray', 'red', 'pink', 'buff',
        'purple', 'cinnamon', 'green'], dtype=object),
 'bruises': array(['bruises', 'no'], dtype=object),
 'odor': array(['pungent', 'almond', 'anise', 'none', 'foul', 'creosote', 'fishy',
        'spicy', 'musty'], dtype=object),
 'gill-attachment': array(['free', 'attached'], dtype=object),
 'gill-spacing': array(['close', 'crowded'], dtype=object),
 'gill-size': array(['narrow', 'broad'], dtype=object),
 'gill-color': array(['black', 'brown', 'gray', 'pink', 'white', 'chocolate', 'purple',
        'red', 'buff', 'green', 'yellow', 'orange'], dtype=object),
 'stalk-shape': array(['enlarging', 'tapering'], dtype=object),
 'stalk-root': array(['equal', 'club', 'bulbous', 'ro

Existen valores de ring-number con "none" y a su vez son consistentes con el parecen ser consistentes con el hecho de que si de da tal caso, entonces en "ring-type" habrá tamb "none" porque no hay anillo.

In [6]:
# Verificar si hay casos en los que "gill-attachment" no sea "free"
non_free_gill_attachment = mushroom_data[mushroom_data['gill-attachment'] != 'free']

# Mostrar los resultados
print(non_free_gill_attachment)

       class cap-shape cap-surface cap-color bruises  odor gill-attachment  \
6038  edible      bell      smooth     brown      no  none        attached   
6040  edible    convex      smooth     brown      no  none        attached   
6375  edible      bell      smooth     brown      no  none        attached   
6424  edible    convex      smooth     brown      no  none        attached   
6434  edible    convex      smooth     brown      no  none        attached   
...      ...       ...         ...       ...     ...   ...             ...   
8115  edible    convex      smooth     brown      no  none        attached   
8119  edible   knobbed      smooth     brown      no  none        attached   
8120  edible    convex      smooth     brown      no  none        attached   
8121  edible      flat      smooth     brown      no  none        attached   
8123  edible    convex      smooth     brown      no  none        attached   

     gill-spacing gill-size gill-color  ... stalk-surface-below

In [2]:
# Codificación de Variables Categóricas (One-Hot Encoding)
data_encoded = pd.get_dummies(mushroom_data, columns=mushroom_data.select_dtypes(include=['object']).columns)

NameError: name 'pd' is not defined

In [None]:
# Separar características y etiqueta
X = data_encoded.drop(columns=['class_edible', 'class_poisson'])
y = data_encoded[['class_edible', 'class_poisson']]

In [None]:
# Mostrar la estructura del dataset preprocesado
print("Dataset preprocesado:")
print(X.head())
print(y.head())

Dataset preprocesado:
   cap-shape_bell  cap-shape_conical  cap-shape_convex  cap-shape_flat  \
0           False              False              True           False   
1           False              False              True           False   
2            True              False             False           False   
3           False              False              True           False   
4           False              False              True           False   

   cap-shape_knobbed  cap-shape_sunken  cap-surface_fibrous  \
0              False             False                False   
1              False             False                False   
2              False             False                False   
3              False             False                False   
4              False             False                False   

   cap-surface_grooves  cap-surface_scaly  cap-surface_smooth  ...  \
0                False              False                True  ...   
1             