### 지도학습

#### 분류 - 이진분류

In [1]:
import pandas as pd

In [2]:
# Load the Titanic training set and preview its first two rows.
df_TFD = pd.read_csv(
    '../../../datasets/TitanicFromDisaster_train.csv'
)
df_TFD.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


#### 전처리

In [3]:
# List the available column names before choosing which ones to keep.
df_TFD.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [10]:
# Keep only the label column and the two candidate feature columns,
# then count the missing values in each of them.
df_TFD_extract = df_TFD.loc[:, ['Survived', 'Pclass', 'Age']]
df_TFD_extract.isna().sum()

Survived      0
Pclass        0
Age         177
dtype: int64

In [11]:
# Age contains nulls -> they could be imputed (e.g. with a regression model);
# here the rows with a missing value are simply dropped.
df_TFD_extract_preprocess = df_TFD_extract.dropna()
df_TFD_extract_preprocess[:2]

Unnamed: 0,Survived,Pclass,Age
0,0,3,22.0
1,1,1,38.0


#### 정형화 단계

In [13]:
# Split the cleaned frame into the label (target) and the model inputs (features).
target_train = df_TFD_extract_preprocess['Survived']  # label (= target): what the model predicts
features_train = df_TFD_extract_preprocess[['Pclass', 'Age']]   # features (model inputs), not the label
target_train.shape, features_train.shape

((714,), (714, 2))

#### 모델학습

In [15]:
# Train a logistic-regression classifier on the two features.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()  # instantiate the estimator
model.fit(features_train, target_train)  # training is done with the fit() method -> fit(features, target)

In [16]:
# Learned parameters: one coefficient per feature column (Pclass, Age) and the intercept.
model.coef_, model.intercept_

(array([[-1.22653571, -0.04149665]]), array([3.532956]))

#### 예측

In [20]:
# Pick a few rows to feed into the model and check its predictions.
df_TFD_extract_preprocess[10:15]   # shares its index with features_train

Unnamed: 0,Survived,Pclass,Age
11,1,1,58.0
12,0,3,20.0
13,0,3,39.0
14,0,3,14.0
15,1,2,55.0


In [25]:
model.predict(features_train[10:15])
# The true labels of these rows (shown above) are three 0s and two 1s.
# Result: array([0, 0, 0, 0, 0]) -> 3 of 5 correct (60%).

array([0, 0, 0, 0, 0], dtype=int64)

In [24]:
# predict_proba(): returns the predicted probability of each class
model.predict_proba(features_train[10:15])

# [0.52507531, 0.47492469] -> first column is class 0, second column is class 1
# Class 0 has the higher probability, which is why predict() returned 0 above.

array([[0.52507531, 0.47492469],
       [0.72642991, 0.27357009],
       [0.85383733, 0.14616267],
       [0.67427932, 0.32572068],
       [0.768957  , 0.231043  ]])

#### 평가

In [28]:
target_predict = model.predict(features_train)
target_predict
# Feed the training features back into the fitted model as-is ->
# the predictions are returned as a NumPy array (not a pandas Series).

array([0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,

In [29]:
from sklearn.metrics import accuracy_score

In [31]:
# Accuracy evaluation.
# NOTE: target_predict was produced from features_train, so this is accuracy on the
# training data itself, not an estimate of performance on unseen data.
accuracy_score(target_train, target_predict)

0.696078431372549