In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

# 데이터 준비

In [3]:
# Load the "titanic" example dataset through seaborn's dataset loader.
df = sns.load_dataset("titanic")

In [4]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# 데이터 탐색

In [9]:
# (number of rows, number of columns) of the raw frame.
df.shape

(891, 15)

In [11]:
# Data type of every column; object/category columns need encoding before modelling.
df.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [13]:
# Non-null count and dtype per column; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [15]:
# Remove "deck" (only 203 of 891 values present) and "embark_town"
# (carries the same information as "embarked").
rdf = df.drop(columns=["deck", "embark_town"])

In [17]:
# Confirm which columns remain after the drop.
rdf.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'alive', 'alone'],
      dtype='object')

In [19]:
# Discard every row whose "age" is missing (177 rows).
rdf = rdf.dropna(subset=["age"])

In [21]:
# Row counts before and after removing rows with missing age.
df.shape[0], rdf.shape[0]

(891, 714)

In [23]:
# Number of rows removed by the age filter.
df.shape[0] - rdf.shape[0]

177

In [25]:
# Goal: replace NaN in "embarked" with the most frequent port of embarkation.
# Start by counting how often each port appears.
rdf["embarked"].value_counts()

embarked
S    554
C    130
Q     28
Name: count, dtype: int64

In [27]:
most_freq = rdf["embarked"].value_counts().idxmax()    # index label with the highest count
most_freq

'S'

In [29]:
# Fill the missing "embarked" values with the most frequent port found above.
rdf["embarked"] = rdf["embarked"].fillna(most_freq)

In [31]:
# Preview the cleaned frame.
rdf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,no,True


In [33]:
# Number of missing values in each column of rdf.
rdf.isna().sum()

survived      0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      0
class         0
who           0
adult_male    0
alive         0
alone         0
dtype: int64

# 분석에 사용할 속성을 선택

In [36]:
# Keep the target ("survived") plus the attributes used for the analysis.
ndf = rdf[
    [
        "survived",
        "pclass",
        "sex",
        "age",
        "sibsp",
        "parch",
        "embarked",
    ]
]

In [38]:
# Preview the selected columns.
ndf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [40]:
# One-hot encoding: turn the categorical "sex" column into numeric
# indicator columns the model can consume.
onehot_sex = pd.get_dummies(data=ndf["sex"], dtype=int)

In [42]:
# Inspect the indicator columns created for "sex".
onehot_sex

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
885,1,0
886,0,1
887,1,0
889,0,1


In [43]:
# Append the "sex" indicator columns side by side (column-wise concatenation).
ndf = pd.concat(objs=[ndf, onehot_sex], axis="columns")

In [44]:
# Check that the "female" / "male" indicator columns were appended.
ndf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,female,male
0,0,3,male,22.0,1,0,S,0,1
1,1,1,female,38.0,1,0,C,1,0
2,1,3,female,26.0,0,0,S,1,0
3,1,1,female,35.0,1,0,S,1,0
4,0,3,male,35.0,0,0,S,0,1


In [47]:
# One-hot encode the port of embarkation; columns are named "town_<code>".
onehot_embarked = pd.get_dummies(
    data=ndf["embarked"],
    prefix="town",
    dtype=int,
)

In [49]:
# Preview the indicator columns created for "embarked".
onehot_embarked.head()

Unnamed: 0,town_C,town_Q,town_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [52]:
# Append the "embarked" indicator columns side by side.
ndf = pd.concat(objs=[ndf, onehot_embarked], axis="columns")

In [54]:
# Check that the "town_*" indicator columns were appended.
ndf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,female,male,town_C,town_Q,town_S
0,0,3,male,22.0,1,0,S,0,1,0,0,1
1,1,1,female,38.0,1,0,C,1,0,1,0,0
2,1,3,female,26.0,0,0,S,1,0,0,0,1
3,1,1,female,35.0,1,0,S,1,0,0,0,1
4,0,3,male,35.0,0,0,S,0,1,0,0,1


In [56]:
# The original "sex" and "embarked" columns are now redundant with their
# indicator columns, so remove them.
ndf = ndf.drop(columns=["sex", "embarked"])

In [58]:
# Final all-numeric modelling frame.
ndf.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,female,male,town_C,town_Q,town_S
0,0,3,22.0,1,0,0,1,0,0,1
1,1,1,38.0,1,0,1,0,1,0,0
2,1,3,26.0,0,0,1,0,0,0,1
3,1,1,35.0,1,0,1,0,0,0,1
4,0,3,35.0,0,0,0,1,0,0,1


# 데이터셋 분할 - 훈련 / 테스트

In [61]:
# Dependent variable: the survival label.
y = ndf["survived"]
# Independent variables: every column except the label.
x = ndf.drop(columns="survived")

In [63]:
# 70 % train / 30 % test split. Stratifying on y keeps the class ratio the
# same in both parts; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=26,
)

In [65]:
# Number of rows in the training and test splits.
len(x_train), len(x_test)

(499, 215)

# KNN 분류

In [68]:
# Create the KNN classifier (k = 5 neighbours).
knn = KNeighborsClassifier(n_neighbors = 5)

In [70]:
# Train the model on the (unscaled) training split.
knn.fit(x_train, y_train)

In [72]:
# Predict labels for the test split.
y_pred = knn.predict(x_test)

In [74]:
# Side-by-side view of true labels ("ans") and predictions ("pred").
pd.DataFrame(data={"ans": y_test, "pred": y_pred})

Unnamed: 0,ans,pred
625,0,0
471,0,0
670,1,1
98,1,1
447,1,0
...,...,...
856,1,0
500,0,0
862,1,0
809,1,1


In [76]:
# Confusion matrix of true vs. predicted labels on the test split.
knn_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [78]:
# Layout (rows = actual label, columns = predicted label):
#   row 0: actual 0 predicted as 0, actual 0 predicted as 1
#   row 1: actual 1 predicted as 0, actual 1 predicted as 1
print(knn_matrix)

[[115  13]
 [ 31  56]]


In [80]:
# Binary-classification metrics (precision, recall, f1-score, support).
knn_report = classification_report(y_true=y_test, y_pred=y_pred)

In [82]:
# The report is a pre-formatted string, so print() is needed to render its layout.
print(knn_report)

              precision    recall  f1-score   support

           0       0.79      0.90      0.84       128
           1       0.81      0.64      0.72        87

    accuracy                           0.80       215
   macro avg       0.80      0.77      0.78       215
weighted avg       0.80      0.80      0.79       215



- precision(정밀도)
  - 양성으로 예측한 데이터 중 실제로 양성인 데이터의 비율
- recall(재현율)
  - 실제 양성인 데이터 중 양성으로 예측한 비율
- 코로나 검사키트의 경우
  - 민감도(sensitivity) 90% 이상 — 민감도는 양성 클래스의 재현율(recall)과 같은 지표
    - 실제 양성인 사람이 코로나 검사를 하면 양성으로 예측될 확률 90% 이상
  - 특이도(specificity) 99% 이상 — 특이도는 음성 클래스의 재현율에 해당하는 지표
    - 실제 음성인 사람이 코로나 검사를 하면 음성으로 예측될 확률 99% 이상

# 모델 고도화

In [86]:
# Class balance of the target: number of rows per survival label.
y.value_counts()

survived
0    424
1    290
Name: count, dtype: int64

In [88]:
## Standard scaling
# KNN is distance-based, so features on a larger numeric scale (e.g. age)
# would otherwise dominate the 0/1 indicator columns.
ss = StandardScaler()

In [90]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to the test split so no test-set statistics leak into training.
scaled_train = ss.fit_transform(x_train)
scaled_test = ss.transform(x_test)

In [92]:
# Fresh KNN classifier with the same k = 5, to be trained on scaled features.
# Note: this rebinds `knn`, replacing the model trained on unscaled data.
knn = KNeighborsClassifier(n_neighbors = 5)

In [94]:
# Train on the standardized training features.
knn.fit(scaled_train, y_train)

In [96]:
# Predict on the standardized test features (overwrites the earlier y_pred).
y_pred = knn.predict(scaled_test)

In [98]:
# Side-by-side view of true labels ("ans") and the scaled model's predictions ("pred").
pd.DataFrame(data={"ans": y_test, "pred": y_pred})

Unnamed: 0,ans,pred
625,0,0
471,0,0
670,1,1
98,1,1
447,1,0
...,...,...
856,1,1
500,0,0
862,1,1
809,1,1


In [100]:
# Confusion matrix for the model trained on scaled features
# (rows = actual label, columns = predicted label).
knn_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(knn_matrix)

[[116  12]
 [ 30  57]]


In [102]:
# Precision / recall / f1-score for the model trained on scaled features.
knn_report = classification_report(y_true=y_test, y_pred=y_pred)
print(knn_report)

              precision    recall  f1-score   support

           0       0.79      0.91      0.85       128
           1       0.83      0.66      0.73        87

    accuracy                           0.80       215
   macro avg       0.81      0.78      0.79       215
weighted avg       0.81      0.80      0.80       215

