## ※ Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

## 1.ionosphere.csv 로드

In [3]:
df = pd.read_csv('ionosphere.csv')
df.head()

Unnamed: 0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1,0.0376,0.85243.1,-0.17755,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,1.1
0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,0.50874,-0.67743,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,0
1,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,0.73082,0.05346,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,1
2,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,0
3,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,0.52798,-0.20275,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,1
4,0.02337,-0.00592,-0.09924,-0.11949,-0.00763,-0.11824,0.14706,0.06637,0.03786,-0.06302,...,-0.01535,-0.0324,0.09223,-0.07859,0.00732,0.0,0.0,-0.00039,0.12011,0


In [4]:
ion = df.values

In [6]:
# y는 마지막 열, X는 y를 제외한 모든 열
X = ion[:, :32]
y = ion[:, -1]

In [7]:
# X shape 출력
print(X.shape)

(350, 32)


In [8]:
# y shape 출력
print(y.shape)

(350,)


## 2.학습데이터와 테스트 데이터 분리

In [9]:
# 전체 데이터의 80%는 학습 데이터, 20%는 테스트 데이터로 분리
# X_train: 학습데이터의 X
# y_train: 학습데이터의 y
# X_test: 테스트데이터의 X
# y_test: 테스트데이터의 y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
# X_train (학습데이터의 X) shape
print(X_train.shape)

(280, 32)


In [11]:
# y_train (학습데이터의 y) shape
print(y_train.shape)

(280,)


In [12]:
# X_test (테스트데이터의 X) shape
print(X_test.shape)

(70, 32)


In [13]:
# y_test (테스트데이터의 X) shape
print(y_test.shape)

(70,)


## 3.학습데이터를 이용하여 Logistic Regression model 학습

In [14]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

## 4.테스트데이터의 X를 model에 적용하여 예측값(y) 구하기

In [15]:
pred = lr.predict(X_test)
print(pred)

[0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1.
 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 0.]


## 5.성능 평가

### 5-1. Confusion matrix 구하기

In [17]:
# Confusion matrix 출력
cm = confusion_matrix(y_test, pred)
print(cm)

[[24  4]
 [ 1 41]]


### 5-2. Accuracy와 error rate 구하기

In [18]:
TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]

In [19]:
# Accuracy 출력 (%)
acc = (TP+TN) / (TP+TN+FP+FN) * 100
print('Accuracy:', round(acc, 2), '%')

Accuracy: 92.86 %


In [21]:
# Error rate 출력 (%)
err = (FP+FN) / (TP+TN+FP+FN) * 100
print('Error rate:', round(err, 2), '%')

Error rate: 7.14 %


### 5-3. Precision과 Recall 구하기

In [22]:
# Precision 출력 (%)
precision = TP / (TP+FP) * 100
print('Precision:', round(precision, 2), '%')

Precision: 91.11 %


In [23]:
# Recall 출력 (%)
recall = TP / (TP+FN) * 100
print('Recall:', round(recall, 2), '%')

Recall: 97.62 %


## [서술형] 5번 성능평가 결과에 대해 간단히 설명하시오.

[작성]
모델이 true라고 분류한 것 중 실제 true의 비율인 정밀도(Precision)는 91.11%로 나타났으며, 실제 true인 것 중 모델이 true라고 예측한 비율인 재현율(Recall)은 97.62%로 나타났고, 모델이 예측한 것이 맞은 비율인 정확도(Accuracy)는 92.86%로 나타났다.