# ref

- 선형대수와 통계학으로 배우는 머신러닝 with 파이썬

- [github](https://github.com/bjpublic/MachineLearning)

# k-최근접 이웃 알고리즘

In [2]:

# Iris preprocessing: load the dataset, hold out a test split, and
# standardize the features using statistics from the training split only.
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data and unpack features / target.
raw_iris = datasets.load_iris()
X, y = raw_iris.data, raw_iris.target

# Train/test split (no test_size given, so the library default applies;
# the seed is fixed for reproducibility).
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# Standardize: fit the scaler on the training features, then apply the
# same transformation to both splits.
std_scale = StandardScaler().fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)

In [3]:
from sklearn.neighbors import KNeighborsClassifier

# Training: k-nearest-neighbours classifier with k = 2, fitted on the
# standardized training features.
clf_knn = KNeighborsClassifier(n_neighbors=2)
clf_knn.fit(X=X_tn_std, y=y_tn)

In [4]:
# Prediction: class labels for the standardized test features.
knn_pred = clf_knn.predict(X=X_te_std)
print(knn_pred)

[2 1 0 2 0 2 0 1 1 1 1 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]


In [8]:
from sklearn.metrics import accuracy_score

# Accuracy of the kNN predictions against the true test labels.
accuracy = accuracy_score(y_true=y_te, y_pred=knn_pred)
print(accuracy)

0.9473684210526315


In [9]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true test labels vs. kNN predictions.
conf_matrix = confusion_matrix(y_true=y_te, y_pred=knn_pred)
print(conf_matrix)

[[13  0  0]
 [ 0 15  1]
 [ 0  1  8]]


In [10]:
from sklearn.metrics import classification_report

# Classification report (per-class precision / recall / f1) for the kNN predictions.
class_report = classification_report(y_true=y_te, y_pred=knn_pred)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      0.94      0.94        16
           2       0.89      0.89      0.89         9

    accuracy                           0.95        38
   macro avg       0.94      0.94      0.94        38
weighted avg       0.95      0.95      0.95        38



# 선형 회귀 분석

```python
# Linear regression on the Boston housing data: ordinary least squares,
# ridge (L2), lasso (L1) and elastic net, compared by R^2 and MSE on a
# held-out test split.
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


# Load the data.
# `datasets.load_boston` was removed in scikit-learn 1.2, so on newer
# versions the lookup itself fails. Fall back to rebuilding the same
# arrays from the original source, following the recipe given in
# scikit-learn's removal notice (needs network access and pandas).
try:
    raw_boston = datasets.load_boston()
except (ImportError, AttributeError):
    import numpy as np
    import pandas as pd
    from sklearn.utils import Bunch

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
    # Even rows and the first two values of odd rows form the features;
    # the third value of each odd row is the target.
    raw_boston = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
    )

# Features and target.
X = raw_boston.data
y = raw_boston.target

# Train/test split with a fixed seed.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=1)


# Standardize: the scaler is fitted on the training features only and
# then applied to both splits.
std_scale = StandardScaler()
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Ordinary least-squares linear regression.
clf_lr = LinearRegression()
clf_lr.fit(X_tn_std, y_tn)

# Estimated coefficients and intercept of the linear model.
print(clf_lr.coef_)
print(clf_lr.intercept_)

# Ridge regression (L2 penalty).
clf_ridge = Ridge(alpha=1)
clf_ridge.fit(X_tn_std, y_tn)

# Estimated coefficients and intercept of the ridge model.
print(clf_ridge.coef_)
print(clf_ridge.intercept_)

# Lasso regression (L1 penalty).
clf_lasso = Lasso(alpha=0.01)
clf_lasso.fit(X_tn_std, y_tn)

# Estimated coefficients and intercept of the lasso model.
print(clf_lasso.coef_)
print(clf_lasso.intercept_)

# Elastic net (mix of L1 and L2 penalties).
clf_elastic = ElasticNet(alpha=0.01, l1_ratio=0.01)
clf_elastic.fit(X_tn_std, y_tn)

# Estimated coefficients and intercept of the elastic-net model.
print(clf_elastic.coef_)
print(clf_elastic.intercept_)

# Predictions on the standardized test features.
pred_lr = clf_lr.predict(X_te_std)
pred_ridge = clf_ridge.predict(X_te_std)
pred_lasso = clf_lasso.predict(X_te_std)
pred_elastic = clf_elastic.predict(X_te_std)

# Model evaluation, printed in the order: linear, ridge, lasso, elastic net.
all_preds = (pred_lr, pred_ridge, pred_lasso, pred_elastic)

# R^2 score.
for pred in all_preds:
    print(r2_score(y_te, pred))

# Mean squared error.
for pred in all_preds:
    print(mean_squared_error(y_te, pred))
```

`-` 회귀분석

$$\hat w = (X^TX)^{-1}X^Ty$$

`-` 릿지 회귀 분석(L2제약식)

$$\hat w^{ridge} = (X^TX+ \lambda I_p)^{-1}X^Ty$$

- $\lambda$: 계수의 크기를 조절하는 정규화(벌점) 항의 강도. 0에 가까울수록 릿지 해는 최소 제곱 추정량에 가까워지고, 무한대에 가까워질수록 릿지 해는 0에 가까워짐

- 편향(bias)이 존재

`-` 라쏘 회귀 분석(L1제약식)

$$\hat w^{lasso}=\arg\min_w \{(y-Xw)^T(y-Xw)+\lambda(\|w\|_1-t) \}$$

# 로지스틱 회귀 분석

In [13]:
# Load the breast-cancer dataset and hold out a test split.
from sklearn import datasets
from sklearn.model_selection import train_test_split

raw_cancer = datasets.load_breast_cancer()

# Features and target.
X, y = raw_cancer.data, raw_cancer.target

# Train/test split with a fixed seed.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)



In [15]:

from sklearn.preprocessing import StandardScaler

# Standardize: fit the scaler on the training features (transforming them
# in the same step), then reuse the fitted scaler for the test features.
std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

In [17]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with an L2 penalty, fitted on the standardized
# training features.
clf_logi_l2 = LogisticRegression(penalty='l2')
clf_logi_l2.fit(X=X_tn_std, y=y_tn)

In [19]:
# Estimated parameters of the L2-penalized logistic regression model:
# first the coefficients, then the intercept, one per line.
print(clf_logi_l2.coef_, clf_logi_l2.intercept_, sep="\n")

[[-0.29792942 -0.58056355 -0.3109406  -0.377129   -0.11984232  0.42855478
  -0.71131106 -0.85371164 -0.46688191  0.11762548 -1.38262136  0.0899184
  -0.94778563 -0.94686238  0.18575731  0.99305313  0.11090349 -0.3458275
   0.20290919  0.80470317 -0.91626377 -0.91726667 -0.8159834  -0.86539197
  -0.45539191  0.10347391 -0.83009341 -0.98445173 -0.5920036  -0.61086989]]
[0.02713751]


In [20]:
# Prediction: class labels for the standardized test features.
pred_logistic = clf_logi_l2.predict(X=X_te_std)
print(pred_logistic)

[0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1
 0 1 0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0
 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1
 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 0 0 1 1 1 0]


In [21]:
# Prediction as probabilities: one row per test sample, one column per class.
pred_proba = clf_logi_l2.predict_proba(X=X_te_std)
print(pred_proba)

[[9.98638613e-01 1.36138656e-03]
 [3.95544804e-02 9.60445520e-01]
 [1.30896362e-03 9.98691036e-01]
 [1.24473354e-02 9.87552665e-01]
 [2.44132101e-04 9.99755868e-01]
 [4.50491513e-03 9.95495085e-01]
 [1.13985968e-04 9.99886014e-01]
 [1.82475894e-03 9.98175241e-01]
 [9.67965506e-05 9.99903203e-01]
 [1.75222878e-06 9.99998248e-01]
 [1.76572612e-01 8.23427388e-01]
 [8.24119135e-02 9.17588087e-01]
 [9.66067493e-06 9.99990339e-01]
 [5.39343196e-01 4.60656804e-01]
 [3.98187854e-01 6.01812146e-01]
 [9.95762760e-01 4.23724017e-03]
 [2.75612083e-03 9.97243879e-01]
 [9.99997097e-01 2.90271401e-06]
 [9.99926506e-01 7.34935682e-05]
 [9.99999997e-01 2.78313939e-09]
 [9.98738365e-01 1.26163489e-03]
 [9.81405399e-01 1.85946008e-02]
 [1.77902039e-02 9.82209796e-01]
 [9.65876713e-04 9.99034123e-01]
 [9.99464578e-01 5.35421808e-04]
 [6.73385015e-04 9.99326615e-01]
 [5.50833875e-05 9.99944917e-01]
 [9.69828919e-01 3.01710813e-02]
 [1.62119075e-03 9.98378809e-01]
 [9.99997821e-01 2.17867101e-06]
 [6.005712

In [22]:
from sklearn.metrics import precision_score

# Precision of the logistic-regression predictions on the test set.
precision = precision_score(y_true=y_te, y_pred=pred_logistic)
print(precision)

0.9666666666666667


In [23]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true test labels vs. logistic-regression predictions.
conf_matrix = confusion_matrix(y_true=y_te, y_pred=pred_logistic)
print(conf_matrix)

[[50  3]
 [ 3 87]]


In [24]:
from sklearn.metrics import classification_report

# Classification report (per-class precision / recall / f1) for the
# logistic-regression predictions.
class_report = classification_report(y_true=y_te, y_pred=pred_logistic)
print(class_report)

              precision    recall  f1-score   support

           0       0.94      0.94      0.94        53
           1       0.97      0.97      0.97        90

    accuracy                           0.96       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.96      0.96      0.96       143



# 나이브 베이즈(추후 다시)

```python
# Gaussian naive Bayes on the wine dataset: load, split, standardize,
# train, predict, and report macro recall, confusion matrix and the
# per-class classification report.
from sklearn import datasets
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

# Load the data and unpack features / target.
raw_wine = datasets.load_wine()
X, y = raw_wine.data, raw_wine.target

# Train/test split with a fixed seed.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# Standardize using statistics from the training features only.
std_scale = StandardScaler().fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Train the naive Bayes classifier.
clf_gnb = GaussianNB()
clf_gnb.fit(X=X_tn_std, y=y_tn)

# Predict class labels for the test features.
pred_gnb = clf_gnb.predict(X=X_te_std)
print(pred_gnb)

# Recall, macro-averaged over the classes.
recall = recall_score(y_true=y_te, y_pred=pred_gnb, average='macro')
print(recall)

# Confusion matrix.
conf_matrix = confusion_matrix(y_true=y_te, y_pred=pred_gnb)
print(conf_matrix)

# Classification report.
class_report = classification_report(y_true=y_te, y_pred=pred_gnb)
print(class_report)
```

# 의사결정나무(추후 다시)

- 테스트 성능 평가는 엔트로피 이용

- 엔트로피는 불순도(노드에 서로 다른 데이터가 얼마나 섞여 있는지) 정도를 측정하며 낮을수록 좋다. 

$$Entropy(d) = - \sum p(x) \log p(x)$$

$$= - \sum_{i=1}^k p(i|d)\log_2(p(i|d))$$

In [25]:
# Decision tree on the wine dataset: load, split, standardize and train.
from sklearn import datasets
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data and unpack features / target.
raw_wine = datasets.load_wine()
X, y = raw_wine.data, raw_wine.target

# Train/test split with a fixed seed.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# Standardize using statistics from the training features only.
std_scale = StandardScaler().fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Train the decision tree classifier (seeded for reproducibility).
clf_tree = tree.DecisionTreeClassifier(random_state=0)
clf_tree.fit(X=X_tn_std, y=y_tn)



In [26]:
# Prediction: class labels for the standardized test features.
pred_tree = clf_tree.predict(X=X_te_std)
print(pred_tree)



[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 2 0 1 0 1 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 1 0 1 1 1]


In [27]:
from sklearn.metrics import f1_score

# F1 score of the decision-tree predictions, macro-averaged over the classes.
f1 = f1_score(y_true=y_te, y_pred=pred_tree, average='macro')
print(f1)

0.9349141206870346


In [28]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true test labels vs. decision-tree predictions.
conf_matrix = confusion_matrix(y_true=y_te, y_pred=pred_tree)
print(conf_matrix)

[[14  2  0]
 [ 0 20  1]
 [ 0  0  8]]


In [29]:
from sklearn.metrics import classification_report

# Classification report (per-class precision / recall / f1) for the
# decision-tree predictions.
class_report = classification_report(y_true=y_te, y_pred=pred_tree)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      0.88      0.93        16
           1       0.91      0.95      0.93        21
           2       0.89      1.00      0.94         8

    accuracy                           0.93        45
   macro avg       0.93      0.94      0.93        45
weighted avg       0.94      0.93      0.93        45



# 서포트 벡터 머신(추후 다시)

# 크로스 밸리데이션(추후 다시)