In [1]:
import pandas as pd
import numpy as np
import re
import snappy
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_parquet("../data/train.snappy", engine="fastparquet")

In [4]:
def extraer_olor(texto):
    palabras = texto.split()
    if "olor" in palabras:
        index = palabras.index("olor")
        if index + 1 < len(palabras):
            return palabras[index + 1].strip(".")
    return None

def extraer_color_tallo_abajo(texto):
    partes = texto.split("y por debajo del anillo es")
    if len(partes) > 2:
        return partes[2].split()[0]
    elif len(partes) > 1:
        return partes[1].split()[0]
    return None 

def extraer_tipo_anillo(texto):
    match = re.search(r"anillo\(s\),\s+de tipo\s+(\w+)", texto)
    return match.group(1) if match else None

In [5]:
df["descripcion"] = df["descripcion"].str.replace(r"\*\*", "", regex=True).str.strip()

df_extracted = pd.DataFrame()

df_extracted["Observacion"] = df["descripcion"].str.extract(r"La observación (\d+)")
df_extracted["clase"] = df["descripcion"].str.extract(r"clase (\w+)")
df_extracted["cap-shape"] = df["descripcion"].str.extract(r"forma de sombrero (\w+)")
df_extracted["cap-surface"] = df["descripcion"].str.extract(r"superficie (\w+)")
df_extracted["cap-color"] = df["descripcion"].str.extract(r"color (\w+)")
df_extracted["odor"] = df["descripcion"].apply(extraer_olor)
df_extracted["gill-attachment"] = df["descripcion"].str.extract(r"branquias son de tipo (\w+)")
df_extracted["gill-spacing"] = df["descripcion"].str.extract(r"separación (\w+)")
df_extracted["gill-size"] = df["descripcion"].str.extract(r"tamaño (\w+)")
df_extracted["gill-color"] = df["descripcion"].str.extract(r"color de las branquias es (\w+)")
df_extracted["stalk-shape"] = df["descripcion"].str.extract(r"forma del tallo es (\w+)")
df_extracted["stalk-root"] = df["descripcion"].str.extract(r"raíz del tallo es (\w+)")
df_extracted["stalk-surface-above-ring"] = df["descripcion"].str.extract(r"superficie del tallo por encima del anillo es (\w+)")
df_extracted["stalk-surface-below-ring"] = df["descripcion"].str.extract(r"y por debajo del anillo es (\w+)")
df_extracted["stalk-color-above-ring"] = df["descripcion"].str.extract(r"color del tallo por encima del anillo es (\w+)")
df_extracted["stalk-color-below-ring"] = df["descripcion"].apply(extraer_color_tallo_abajo)
df_extracted["veil-type"] = df["descripcion"].str.extract(r"tipo de velo es (\w+)")
df_extracted["veil-color"] = df["descripcion"].str.extract(r"color de velo (\w+)")
df_extracted["ring-number"] = df["descripcion"].str.extract(r"Tiene (\w+) anillo")
df_extracted["ring-type"] = df["descripcion"].apply(extraer_tipo_anillo)
df_extracted["spore-print-color"] = df["descripcion"].str.extract(r"impresión de esporas es (\w+)")
df_extracted["population"] = df["descripcion"].str.extract(r"población (\w+)")
df_extracted["habitat"] = df["descripcion"].str.extract(r"hábitat es (\w+)")
df_extracted["bruises"] = df["descripcion"].str.extract(r"(?i)(Presenta|No presenta)\s+moretones")


df_extracted["Observacion"] = pd.to_numeric(df_extracted["Observacion"])
df_extracted = df_extracted.sort_values(by="Observacion").reset_index(drop=True)

In [6]:
df = df_extracted

In [7]:
df.isna().sum()

Observacion                    0
clase                          0
cap-shape                      0
cap-surface                    0
cap-color                      0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  1267
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
bruises                        0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4603 entries, 0 to 4602
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Observacion               4603 non-null   int64 
 1   clase                     4603 non-null   object
 2   cap-shape                 4603 non-null   object
 3   cap-surface               4603 non-null   object
 4   cap-color                 4603 non-null   object
 5   odor                      4603 non-null   object
 6   gill-attachment           4603 non-null   object
 7   gill-spacing              4603 non-null   object
 8   gill-size                 4603 non-null   object
 9   gill-color                4603 non-null   object
 10  stalk-shape               4603 non-null   object
 11  stalk-root                3336 non-null   object
 12  stalk-surface-above-ring  4603 non-null   object
 13  stalk-surface-below-ring  4603 non-null   object
 14  stalk-color-above-ring  

In [9]:
df_ob = df.select_dtypes('object')
for col in df_ob.columns:
    print(f'==========={col}===========')
    print(f'Numero de unicos: {df[col].nunique()}')
    print(f'Valores Unicos: {df[col].unique()}')
    print('\n')

Numero de unicos: 2
Valores Unicos: ['edible' 'poisonous']


Numero de unicos: 6
Valores Unicos: ['convex' 'bell' 'sunken' 'flat' 'knobbed' 'c']


Numero de unicos: 3
Valores Unicos: ['smooth' 'scaly' 'fibrous']


Numero de unicos: 9
Valores Unicos: ['yellow' 'white' 'green' 'brown' 'red' 'pink' 'buff' 'cinnamon' 'purple']


Numero de unicos: 9
Valores Unicos: ['almond' 'anise' 'null' 'pungent' 'foul' 'creosote' 'spicy' 'fishy'
 'musty']


Numero de unicos: 2
Valores Unicos: ['free' 'attached']


Numero de unicos: 2
Valores Unicos: ['close' 'crowded']


Numero de unicos: 2
Valores Unicos: ['broad' 'narrow']


Numero de unicos: 12
Valores Unicos: ['black' 'brown' 'gray' 'pink' 'white' 'chocolate' 'purple' 'red' 'buff'
 'green' 'yellow' 'orange']


Numero de unicos: 2
Valores Unicos: ['enlarging' 'tapering']


Numero de unicos: 4
Valores Unicos: ['club' 'equal' 'rooted' 'bulbous' nan]


Numero de unicos: 4
Valores Unicos: ['smooth' 'fibrous' 'silky' 'scaly']


Numero de unicos: 4
Valores

In [10]:
df['ring-type'] = df['ring-type'].replace('null',np.nan)
df['ring-number'] = df['ring-number'].replace('null',np.nan)
df['odor'] = df['odor'].replace('null',np.nan)
df['stalk-color-below-ring'] = df['stalk-color-below-ring'].replace('.','')

In [11]:
df.isna().sum()

Observacion                    0
clase                          0
cap-shape                      0
cap-surface                    0
cap-color                      0
odor                        2470
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  1267
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                   15
ring-type                     15
spore-print-color              0
population                     0
habitat                        0
bruises                        0
dtype: int64

In [12]:
df.head()

Unnamed: 0,Observacion,clase,cap-shape,cap-surface,cap-color,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,bruises
0,0,edible,convex,smooth,yellow,almond,free,close,broad,black,...,white,white.,partial,white,one,pendant,brown,numerous,grasses,Presenta
1,1,edible,bell,smooth,white,anise,free,close,broad,brown,...,white,white.,partial,white,one,pendant,brown,numerous,meadows,Presenta
2,3,edible,convex,smooth,green,,free,crowded,broad,black,...,white,white.,partial,white,one,evanescent,brown,abundant,grasses,No presenta
3,4,edible,convex,scaly,yellow,almond,free,close,broad,brown,...,white,white.,partial,white,one,pendant,black,numerous,grasses,Presenta
4,5,edible,bell,smooth,white,almond,free,close,broad,gray,...,white,white.,partial,white,one,pendant,black,numerous,meadows,Presenta
