In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e8/sample_submission.csv
/kaggle/input/playground-series-s4e8/train.csv
/kaggle/input/playground-series-s4e8/test.csv


# Data Load

In [1]:
import polars as pl

# Read the three competition files in one pass; the order of the paths
# matches the order of the names on the left-hand side.
train, test, sample_submission = (
    pl.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e8/train.csv",
        "/kaggle/input/playground-series-s4e8/test.csv",
        "/kaggle/input/playground-series-s4e8/sample_submission.csv",
    )
)

In [2]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [3]:
from ucimlrepo import fetch_ucirepo 
  
# Fetch UCI dataset 848:
# https://archive.ics.uci.edu/dataset/848/secondary+mushroom+dataset
uci_mushroom = fetch_ucirepo(id=848)

# Convert the fetched features and targets to Polars frames and name the
# single target column 'class'.
features = pl.DataFrame(uci_mushroom.data.features)
target = pl.DataFrame(uci_mushroom.data.targets)
target.columns = ['class']

# Put the target in front of the features, then stack the result underneath
# the competition rows (with their 'id' column removed).
# NOTE(review): the vertical concat presumably relies on both frames sharing
# the same column order and dtypes — confirm if either source changes.
uci_rows = pl.concat([target, features], how='horizontal')
train = pl.concat([train.drop('id'), uci_rows])

# Release the intermediates; only `train` is needed from here on.
del uci_mushroom, features, target, uci_rows

In [4]:
# Row count of the combined training frame.
train.height

3178014

# Data Cleaning

In [5]:
# Count the rows flagged by is_duplicated() and report the total.
duplicate_count = train.is_duplicated().sum()
print("There are {} duplicates.".format(duplicate_count))

There are 266 duplicates.


In [6]:
# Drop exact duplicate rows. maintain_order=True keeps the surviving rows in
# their original relative order; without it Polars gives no guarantee about
# the output row order, so anything downstream that depends on row position
# (splits, sampling, head/tail inspection) could differ between runs.
train = train.unique(maintain_order=True)

# Sanity check: this should now report zero duplicates.
print("There are {} duplicates.".format(train.is_duplicated().sum()))

There are 0 duplicates.


```
# Configure a different scaling method for each column
column_transformer = ColumnTransformer(
    transformers=[
        ('standard', StandardScaler(), ['A']),  # apply standardization to column A
        ('minmax', MinMaxScaler(), ['B']),      # apply min-max normalization to column B
        ('robust', RobustScaler(), ['C'])       # apply robust scaling to column C
    ]
)

# Fit the transformer on the training set and apply the transformation
# NOTE(review): reusing train_df.columns below presumably assumes train_df
# holds exactly the columns A, B, C in that order — confirm before copying.
train_scaled = column_transformer.fit_transform(train_df)
train_scaled_df = pd.DataFrame(train_scaled, columns=train_df.columns)

# Apply the already-fitted transformation to the test set (do not fit again)
test_scaled = column_transformer.transform(test_df)
test_scaled_df = pd.DataFrame(test_scaled, columns=test_df.columns)
```

In [36]:
# Per-column null count expressed as a percentage of all rows
# (a one-row Polars frame).
null_pct = train.null_count() / len(train) * 100

# Switch to pandas for the column-wise sort below.
train_null = null_pct.to_pandas()

# Reorder the columns so the highest null percentage comes first, using the
# values in the frame's first row as the sort key.
first_row_label = train_null.index[0]
train_null_sorted = train_null.sort_values(
    by=first_row_label,
    axis=1,
    ascending=False,
)

In [38]:
# Display the sorted null percentages as a Polars frame.
pl.from_pandas(train_null_sorted)

veil-type,spore-print-color,stem-root,veil-color,stem-surface,gill-spacing,cap-surface,gill-attachment,ring-type,gill-color,habitat,cap-shape,stem-color,has-ring,cap-color,does-bruise-or-bleed,cap-diameter,class,stem-width,stem-height,season
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
94.882449,91.3908,88.378718,87.934961,63.532626,40.388619,21.559832,16.797142,4.133306,0.001794,0.001416,0.001259,0.001196,0.000755,0.000378,0.000252,0.000126,0.0,0.0,0.0,0.0
