In [1]:
import pandas as pd 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

# 데이터 준비

In [2]:
# Load seaborn's example dataset named "titanic" into a DataFrame.
df = sns.load_dataset("titanic")

In [3]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# 데이터 탐색

In [4]:
df.shape

(891, 15)

In [5]:
df.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [6]:
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [7]:
# Drop "deck" (688 of its 891 values are missing) and "embark_town",
# whose content duplicates the "embarked" column.
columns_to_drop = ["deck", "embark_town"]
rdf = df.drop(columns=columns_to_drop)

In [8]:
rdf.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'alive', 'alone'],
      dtype='object')

In [9]:
# Remove every row whose "age" value is missing.
rdf = rdf.dropna(axis="index", subset=["age"])

In [10]:
rdf.shape, df.shape

((714, 13), (891, 15))

In [11]:
# Find the most frequent embarkation code; it is the value used to
# replace the NaN entries of the "embarked" column.
embarked_counts = rdf["embarked"].value_counts()
most_freq = embarked_counts.idxmax()
most_freq

'S'

In [12]:
rdf.describe(include="all")

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,alive,alone
count,714.0,714.0,714,714.0,714.0,714.0,714.0,712,714,714,714,714,714
unique,,,2,,,,,3,3,3,2,2,2
top,,,male,,,,,S,Third,man,True,no,True
freq,,,453,,,,,554,355,413,413,424,404
mean,0.406162,2.236695,,29.699118,0.512605,0.431373,34.694514,,,,,,
std,0.49146,0.83825,,14.526497,0.929783,0.853289,52.91893,,,,,,
min,0.0,1.0,,0.42,0.0,0.0,0.0,,,,,,
25%,0.0,1.0,,20.125,0.0,0.0,8.05,,,,,,
50%,0.0,2.0,,28.0,0.0,0.0,15.7417,,,,,,
75%,1.0,3.0,,38.0,1.0,1.0,33.375,,,,,,


In [13]:
# Replace the missing "embarked" entries with the most frequent code.
# `assign` builds a new frame rather than writing into the existing one.
rdf = rdf.assign(embarked=rdf["embarked"].fillna(most_freq))

In [14]:
rdf.isna().sum()

survived      0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      0
class         0
who           0
adult_male    0
alive         0
alone         0
dtype: int64

In [15]:
rdf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,no,True


In [16]:
rdf.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'alive', 'alone'],
      dtype='object')

# 분석에 사용할 속성을 선택

In [17]:
# Columns kept for the analysis: the "survived" label plus the chosen features.
selected_columns = ["survived", "pclass", "sex", "age", "sibsp", "parch", "embarked"]
ndf = rdf[selected_columns]

In [18]:
ndf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [19]:
# One-hot encoding: build one indicator column per category of "sex".
onehot_sex = pd.get_dummies(ndf["sex"])

In [20]:
onehot_sex

Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True
...,...,...
885,True,False
886,False,True
887,True,False
889,False,True


In [21]:
# Append the "sex" indicator columns to the right of the existing columns.
ndf = pd.concat([ndf, onehot_sex], axis = 1)

In [22]:
# One-hot encode "embarked"; the new column names carry the "town" prefix.
onehot_embarked = pd.get_dummies(ndf["embarked"], prefix="town")

In [23]:
ndf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,female,male
0,0,3,male,22.0,1,0,S,False,True
1,1,1,female,38.0,1,0,C,True,False
2,1,3,female,26.0,0,0,S,True,False
3,1,1,female,35.0,1,0,S,True,False
4,0,3,male,35.0,0,0,S,False,True


In [24]:
onehot_embarked.head()

Unnamed: 0,town_C,town_Q,town_S
0,False,False,True
1,True,False,False
2,False,False,True
3,False,False,True
4,False,False,True


In [25]:
# Append the "embarked" indicator columns to the right of the existing columns.
ndf = pd.concat([ndf, onehot_embarked], axis=1)

In [26]:
ndf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,female,male,town_C,town_Q,town_S
0,0,3,male,22.0,1,0,S,False,True,False,False,True
1,1,1,female,38.0,1,0,C,True,False,True,False,False
2,1,3,female,26.0,0,0,S,True,False,False,False,True
3,1,1,female,35.0,1,0,S,True,False,False,False,True
4,0,3,male,35.0,0,0,S,False,True,False,False,True


In [27]:
# The original string columns are dropped; their indicator columns remain.
encoded_sources = ["sex", "embarked"]
ndf = ndf.drop(columns=encoded_sources)

In [28]:
ndf.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,female,male,town_C,town_Q,town_S
0,0,3,22.0,1,0,False,True,False,False,True
1,1,1,38.0,1,0,True,False,True,False,False
2,1,3,26.0,0,0,True,False,False,False,True
3,1,1,35.0,1,0,True,False,False,False,True
4,0,3,35.0,0,0,False,True,False,False,True


# 데이터셋 분할

In [29]:
# Dependent variable y: the "survived" label.
y = ndf["survived"]
# Independent variables x: every column except the label.
x = ndf.drop(columns="survived")

In [30]:
# Hold out 30% of the rows as the test set. The split is stratified on y
# and uses a fixed random_state so that re-running yields the same split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=22,
    stratify=y,
)

In [31]:
x_train.shape, x_test.shape

((499, 9), (215, 9))

# KNN 분류

In [36]:
# Create the KNN classifier object (k is set to 5).
knn = KNeighborsClassifier(n_neighbors=5)

In [37]:
# Train the model on the training data (features are not scaled here).
knn.fit(x_train, y_train)

In [38]:
# Predict labels for the test data.
y_pred = knn.predict(x_test)

In [39]:
pd.DataFrame({"ans" : y_test, "pred" : y_pred})

Unnamed: 0,ans,pred
131,0,0
16,0,0
663,0,0
526,1,0
84,1,1
...,...,...
707,1,0
570,1,0
308,0,0
333,0,0


In [40]:
# Model evaluation: compute the confusion matrix of true vs. predicted labels.
knn_matrix = confusion_matrix(y_test, y_pred)

In [42]:
# Layout of the matrix:
#   [[label 0 predicted as 0, label 0 predicted as 1],
#    [label 1 predicted as 0, label 1 predicted as 1]]
knn_matrix

array([[105,  23],
       [ 24,  63]], dtype=int64)

In [43]:
# Text report of per-class precision, recall and f1-score for the predictions.
knn_report = classification_report(y_test, y_pred)

In [44]:
print(knn_report)

              precision    recall  f1-score   support

           0       0.81      0.82      0.82       128
           1       0.73      0.72      0.73        87

    accuracy                           0.78       215
   macro avg       0.77      0.77      0.77       215
weighted avg       0.78      0.78      0.78       215



- precision(정밀도)
    - 양성으로 예측한 데이터 중 실제로 양성인 데이터의 비율

- recall(재현율)
    - 실제 양성인 데이터 중 양성으로 예측한 비율

- 코로나 검사키트의 경우
    - 민감도 90% 이상
        - 실제 양성인 사람이 코로나 검사를 하면 양성으로 예측될 확률 90% 이상
    
    - 특이도 99% 이상
        - 실제 음성인 사람이 코로나 검사를 하면 음성으로 예측될 확률 99% 이상

# 모델 고도화

In [45]:
y.value_counts()

survived
0    424
1    290
Name: count, dtype: int64

In [46]:
# Scaler object; it is fitted on the training features in the next cell.
ss = StandardScaler()

In [47]:
# Learn the scaling parameters from the training split only, then apply
# that same fitted transform to both splits, so the test split does not
# influence the fitted scaler.
ss.fit(x_train)
scaled_train = ss.transform(x_train)
scaled_test = ss.transform(x_test)

In [48]:
scaled_train[0]

array([ 0.91222507, -2.06649957,  1.86696459,  0.66263909,  1.34289642,
       -1.34289642,  2.11743158, -0.20960213, -1.8588591 ])

In [49]:
# New k=5 classifier for the scaled features; rebinds the earlier `knn`.
knn = KNeighborsClassifier(n_neighbors=5)

In [50]:
# Train on the scaled training features.
knn.fit(scaled_train, y_train)

In [51]:
# Predict on the scaled test features; overwrites the earlier `y_pred`.
y_pred = knn.predict(scaled_test)

In [52]:
pd.DataFrame({"ans" : y_test, "pred" : y_pred})

Unnamed: 0,ans,pred
131,0,0
16,0,0
663,0,0
526,1,1
84,1,1
...,...,...
707,1,0
570,1,0
308,0,0
333,0,0


In [53]:
# Confusion matrix for the model trained on scaled features
# (same layout as before: the first row holds the samples whose true label is 0).
knn_matrix = confusion_matrix(y_test, y_pred)
knn_matrix

array([[109,  19],
       [ 22,  65]], dtype=int64)

In [54]:
# Precision / recall / f1-score report for the model trained on scaled features.
knn_report = classification_report(y_test, y_pred)
print(knn_report)

              precision    recall  f1-score   support

           0       0.83      0.85      0.84       128
           1       0.77      0.75      0.76        87

    accuracy                           0.81       215
   macro avg       0.80      0.80      0.80       215
weighted avg       0.81      0.81      0.81       215

