# RandomizedSearch

총 조합수 1400개 중 `n_iter=60` 에 따라 60개 조합만 무작위로 뽑아 테스트한 결과이기 때문에 <u>매번 학습하고 확인할 때마다 최적의 파라미터와 정확도 값이 다릅니다.</u><br>따라서, 이를 실행했을 때 <u>나온 파라미터 값들을 그 주위의 값들과 함께 다시 넣어서</u><br> `GridSearchCV`를 통해 **더 세부적으로 탐색하면 조금 더 최적에 가까운 하이퍼 파라미터 값들을 찾을 수 있습니다.**

In [9]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset and pull out the feature matrix and labels.
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out a test set, stratified on the labels, with a fixed seed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    stratify=y,
)

In [10]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np

tree3 = DecisionTreeClassifier(random_state=0)

# Candidate values for each decision-tree hyperparameter.
params3 = {
    "max_depth": range(1, 6),
    "max_leaf_nodes": range(3, 31),
    # Fraction of the columns to use during training: 0.1, 0.2, ..., 1.0.
    # Built from an integer range and divided once, so there are exactly 10
    # cleanly rounded candidates (a float step in np.arange gives values such
    # as 0.30000000000000004).
    "max_features": np.arange(1, 11) / 10,
}

# Derive the grid size from the candidates themselves so the printed total
# cannot drift from the grid when one of the ranges is edited.
n_combinations = int(np.prod([len(candidates) for candidates in params3.values()]))
print("총 조합수:", n_combinations)

# No random_state is passed to the search itself, so the sampled combinations
# (and therefore the best parameters found) can differ between runs.
rs = RandomizedSearchCV(
    tree3,  # model to tune
    params3,  # hyperparameter candidates
    n_iter=60,  # number of combinations to try
    scoring="accuracy",
    cv=4,
    n_jobs=-1,
)
rs.fit(X_train, y_train)

총 조합수: 1400


In [11]:
# Report the best cross-validated score and the parameter combination that
# produced it, one labelled line each.
for label, value in (
    ("best score:", rs.best_score_),
    ("best parameter:", rs.best_params_),
):
    print(label, value)

best score: 0.9553650149885382
best parameter: {'max_leaf_nodes': 16, 'max_features': 0.4, 'max_depth': 3}


# TODO Adult dataset

- 전처리
    - 범주형
        - 결측치는 최빈값으로 대체한다.
        - 원핫인코딩 처리한다.
    - 연속형
        - 결측치는 중앙값으로 대체한다.
        - StandardScaling을 한다.
- Model: `sklearn.linear_model.LogisticRegression(max_iter=2000)` 를 사용
- Pipeline을 이용해 전처리와 모델을 묶어준다.

In [2]:
# Names for every column of the Adult data, in file order (the label is last).
cols = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

# Categorical feature columns.
category_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native-country',
]

# Continuous (numeric) feature columns.
continuous_columns = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Column to predict.
target = 'income'

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
import pandas as pd

# Read the raw Adult data file. It is read without a header row, so the
# column names come from `cols`; whitespace after each delimiter is skipped
# and "?" entries are loaded as missing values.
df = pd.read_csv(
    '../data/adult.data',
    names=cols,
    header=None,
    na_values="?",
    skipinitialspace=True,
)

# SVM

## Linear SVM 
- 하이퍼 파라미터 `C`: 값이 너무 크면(☝) overfitting, 너무 작으면(👇) underfitting

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [13]:
# Load the breast cancer data directly as a (features, labels) pair.
X, y = load_breast_cancer(return_X_y=True)

# Stratified train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    stratify=y,
)

## Kernel SVM (비선형(Non Linear) SVM)
- 하이퍼 파라미터 `C`, `gamma`: 값이 너무 크면(☝) overfitting, 너무 작으면(👇) underfitting


In [15]:
### gamma behaves the same way as C
# small value ============== moderate value ============== large value
# underfitting               generalization               overfitting
# <--- soft margin (weak constraint)          -----> hard margin (strong constraint)
# NOTE(review): in scikit-learn, C is the inverse of the regularization
# strength, so "weak/strong" here presumably refers to how strictly margin
# violations are penalized, not to the regularization strength — confirm.


### However, gamma and C are set independently of each other; changing one does not change the other!

# K-최근접 이웃 (K-Nearest Neighbors, KNN)

### 위스콘신 유방암 데이터를 이용한 암환자분류

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from sklearn.metrics import accuracy_score

In [18]:
# Wisconsin breast cancer data as a (features, labels) pair.
X, y = load_breast_cancer(return_X_y=True)

# Split into train and test sets, stratified on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    stratify=y,
)

### Boston Housing Dataset 집값 예측
- 회귀문제

In [21]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor

import pandas as pd

In [23]:
# NOTE(review): the file name is spelled 'boston_hosing.csv' — confirm it
# matches the file on disk before "fixing" the spelling.
boston = pd.read_csv('../data/boston_hosing.csv')

# MEDV is the house price, i.e. the regression target; every other column is
# used as a feature.
y = boston['MEDV']
X = boston.drop(columns='MEDV')

# Train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)