<a href="https://colab.research.google.com/github/busung/machine-learning-practice/blob/main/5_scikit_learn_%EC%8B%A4%EC%8A%B5_%EB%8D%B0%EC%9D%B4%ED%84%B0_%EC%A0%84%EC%B2%98%EB%A6%AC(%EC%86%8C%EC%9B%A8%EC%9C%B5_5%EC%A3%BC%EC%B0%A8).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# scikit-learn 템플릿

## 데이터 불러오기
1. scikit-learn에 있는 기초 데이터 활용
2. pandas를 이용한 데이터 불러오기
  * pd.read_csv()



In [1]:
from sklearn.datasets import load_iris

In [2]:
# Load the iris dataset object; later cells read its data, target and feature_names.
iris = load_iris()

In [4]:
# Show which top-level entries the loaded dataset object exposes.
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [3]:
# Print the description text shipped with the dataset (attributes, statistics, classes).
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

## 데이터 전처리
* NaN, Null, 데이터 없는 것 처리
* 레이블 인코딩
* 원-핫 인코딩
* StandardScaler
* MinMaxScaler

### 레이블 인코딩
* 머신러닝 알고리즘에는 '숫자' 데이터만 입력할 수 있음
* 또한 '공백'이나 'NaN' 같은 결측값도 허용되지 않음 (결측값은 별도의 전처리로 처리해야 함)
* 문자열 범주를 숫자로 바꿔 주는 것이 레이블 인코딩
  * 각 항목(범주)을 정수로 변환해 줌
* 다만 입력 피처에는 잘 쓰지 않음
  * 숫자로 변경하면 크기에 의미가 생김(위계가 생김)
  * 예: 모델이 '2 > 1' 같은 대소 관계나 '1이 더 좋은 값'이라는 식으로 잘못 해석할 수 있음

In [8]:
from sklearn.preprocessing import LabelEncoder

# Category names to be converted into integer codes.
items = ['아이언맨','캡틴아메리카','토르','헐크','블랙위도우']

# Learn the set of distinct names and encode the list in a single call.
encoder = LabelEncoder()
labels = encoder.fit_transform(items)

# Show, one per line: the integer codes, the learned classes (a code is the
# index into this array), and a round trip from codes back to names.
print(
    labels,
    encoder.classes_,
    encoder.inverse_transform([1,1,2,3,4,0,0]),
    sep='\n',
)

[1 2 3 4 0]
['블랙위도우' '아이언맨' '캡틴아메리카' '토르' '헐크']
['아이언맨' '아이언맨' '캡틴아메리카' '토르' '헐크' '블랙위도우' '블랙위도우']


### 원-핫 인코딩(one-hot)
* 레이블 인코딩의 문제점인 위계 발생을 해결해 줌

In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Same five category names as in the label-encoding example.
items = ['아이언맨','캡틴아메리카','토르','헐크','블랙위도우']

# Step 1: map each name to an integer code, then reshape the codes into a
# single-column 2-D array, which is the layout handed to OneHotEncoder below.
encoder = LabelEncoder()
labels = encoder.fit_transform(items).reshape(-1, 1)

# Step 2: expand every integer code into its own indicator column.
# NOTE(review): recent scikit-learn versions reportedly accept string
# categories directly, which would make step 1 unnecessary — confirm for the
# installed version before simplifying.
one_hot_encoder = OneHotEncoder()
one_hot_labels = one_hot_encoder.fit_transform(labels)

# toarray() turns the encoder output into a dense array so it prints as a matrix.
print(one_hot_labels.toarray())

[[0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]]


In [10]:
# One-hot encoding with pandas instead of scikit-learn.
import pandas as pd
df = pd.DataFrame({'items':['아이언맨', '캡틴아메리카', '토르', '헐크', '블랙위도우']})

# dtype=int pins the indicator columns to 0/1 integers. Without it, pandas 2.0+
# returns True/False columns, so the displayed table would depend on the
# installed pandas version.
items_one_hot = pd.get_dummies(df, dtype=int)
items_one_hot

Unnamed: 0,items_블랙위도우,items_아이언맨,items_캡틴아메리카,items_토르,items_헐크
0,0,1,0,0,0
1,0,0,1,0,0
2,0,0,0,1,0
3,0,0,0,0,1
4,1,0,0,0,0


### StandardScaler
* 피처마다 단위(스케일)가 다를 때 활용
* 각 피처의 평균을 0으로
* 각 피처의 분산을 1로 (표준화)

In [13]:
import pandas as pd
# Wrap the raw feature matrix in a DataFrame labelled with the dataset's
# feature names, then display it.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [18]:
# Per-feature mean and variance of the unscaled data, printed one after the other.
print(iris_df.mean(), iris_df.var(), sep='\n')

sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64
sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64


In [17]:
from sklearn.preprocessing import StandardScaler
# Standardize every feature column: learn the per-column statistics and apply
# them in a single call.
ss = StandardScaler()
iris_scaled = ss.fit_transform(iris_df)

# Put the scaled array back into a DataFrame with the original feature names
# and preview the first rows.
iris_scaled_df = pd.DataFrame(iris_scaled, columns=iris.feature_names)
iris_scaled_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


In [19]:
# Per-feature mean and variance after standardization.
# NOTE(review): the variance prints as 1.006711 (= 150/149) rather than 1,
# presumably because pandas .var() defaults to the sample variance (ddof=1)
# while the scaler divides by the population standard deviation — confirm.
print(iris_scaled_df.mean(), iris_scaled_df.var(), sep='\n')

sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64
sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
dtype: float64


### MinMaxScaler
* 각 피처를 최솟값 0, 최댓값 1 범위로 변환해 주는 Scaler

In [20]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature column with MinMaxScaler: learn the per-column range
# and apply it in a single call.
mms = MinMaxScaler()
iris_scaled = mms.fit_transform(iris_df)

# Put the scaled array back into a DataFrame with the original feature names
# and preview the first rows. This rebinds iris_scaled / iris_scaled_df.
iris_scaled_df = pd.DataFrame(iris_scaled, columns=iris.feature_names)
iris_scaled_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667


In [21]:
# Per-feature mean and variance of the current iris_scaled_df, printed one after the other.
print(iris_scaled_df.mean(), iris_scaled_df.var(), sep='\n')

sepal length (cm)    0.428704
sepal width (cm)     0.440556
petal length (cm)    0.467458
petal width (cm)     0.458056
dtype: float64
sepal length (cm)    0.052908
sepal width (cm)     0.032983
petal length (cm)    0.089522
petal width (cm)     0.100869
dtype: float64


In [22]:
# Per-feature minimum and maximum of the current iris_scaled_df, printed one after the other.
print(iris_scaled_df.min(), iris_scaled_df.max(), sep='\n')

sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
dtype: float64
sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
dtype: float64


### 데이터 분리

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split features and targets into train and test sets using the default split
# size (the recorded shapes are 112 training rows and 38 test rows).
# random_state pins the shuffle so a fresh Restart & Run All reproduces the
# same split; stratify keeps the class proportions of iris.target the same in
# both parts.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=42,
    stratify=iris.target,
)

In [None]:
# Show the (rows, columns) shape of the training and test feature matrices on one line.
print(*(split.shape for split in (x_train, x_test)))

(112, 4) (38, 4)
