In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

# 그래프 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['font.size'] = 16
plt.rcParams['figure.figsize'] = 20, 10
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
# 지표를 하나만 설정할 경우
from sklearn.model_selection import cross_val_score
# 지표를 하나 이상 설정할 경우
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 모델의 최적의 하이퍼파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 차원축소
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# 군집화
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth



# ARIMA (시계열 예측)
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm


# 시간 측정을 위한 시간 모듈
import datetime

# 주식정보
from pandas_datareader import data

# 형태소 벡터를 생성하기 위한 라이브러리
from sklearn.feature_extraction.text import CountVectorizer
# 형태소 벡터를 학습 벡터로 변환한다.
from sklearn.feature_extraction.text import TfidfTransformer


# 데이터 수집
import requests
from bs4 import BeautifulSoup
import re
import time
import os
import json

# 한국어 형태소 분석
from konlpy.tag import Okt, Hannanum, Kkma, Mecab, Komoran

# 워드 클라우드를 위한 라이브러리
from collections import Counter
import pytagcloud
from IPython.display import Image

# 저장
import pickle

# 딥러닝
import tensorflow as tf

# 딥러닝 모델 구조를 정의하는 것
from tensorflow.keras.models import Sequential
# 층구조를 정의하는 것
from tensorflow.keras.layers import Dense
# 활성화 함수를 정의하는 것
from tensorflow.keras.layers import Activation

# 현재 프로젝트를 gpu에 할당한다.
# 컴퓨터의 GPU는 메모리를 가지고 있다. 
gpus = tf.config.experimental.list_physical_devices("GPU")
# gpu가 있다면
if len(gpus) >0 :
    try :
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e :
        print(e)

pygame 2.0.1 (SDL 2.0.14, Python 3.8.5)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# 수술환자 데이터를 가져온다. 
# 제일 마지막 칼럼이 결과이며 1은 생존, 0은 사망을 의미
df1 = pd.read_csv("data/ThoraricSurgery.csv", header=None)
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,293,1,3.80,2.80,0,0,0,0,0,0,12,0,0,0,1,0,62,0
1,1,2,2.88,2.16,1,0,0,0,1,1,14,0,0,0,1,0,60,0
2,8,2,3.19,2.50,1,0,0,0,1,0,11,0,0,1,1,0,66,1
3,14,2,3.98,3.06,2,0,0,0,1,1,14,0,0,0,1,0,80,1
4,17,2,2.21,1.88,0,0,1,0,0,0,12,0,0,0,1,0,56,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,98,6,3.04,2.40,2,0,0,0,1,0,11,0,0,0,1,0,76,0
466,369,6,3.88,2.72,1,0,0,0,1,0,12,0,0,0,1,0,77,0
467,406,6,5.36,3.96,1,0,0,0,1,0,12,0,0,0,0,0,62,0
468,25,8,4.32,3.20,0,0,0,0,0,0,11,0,0,0,0,0,58,1


In [3]:
# 입력과 결과로 나눈다.
X = df1.drop(17, axis=1)
y = df1[17]

display(X)
display(y)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,293,1,3.80,2.80,0,0,0,0,0,0,12,0,0,0,1,0,62
1,1,2,2.88,2.16,1,0,0,0,1,1,14,0,0,0,1,0,60
2,8,2,3.19,2.50,1,0,0,0,1,0,11,0,0,1,1,0,66
3,14,2,3.98,3.06,2,0,0,0,1,1,14,0,0,0,1,0,80
4,17,2,2.21,1.88,0,0,1,0,0,0,12,0,0,0,1,0,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,98,6,3.04,2.40,2,0,0,0,1,0,11,0,0,0,1,0,76
466,369,6,3.88,2.72,1,0,0,0,1,0,12,0,0,0,1,0,77
467,406,6,5.36,3.96,1,0,0,0,1,0,12,0,0,0,0,0,62
468,25,8,4.32,3.20,0,0,0,0,0,0,11,0,0,0,0,0,58


0      0
1      0
2      1
3      1
4      0
      ..
465    0
466    0
467    0
468    1
469    0
Name: 17, Length: 470, dtype: int64

In [4]:
# 텐서플로의 랜덤 시드를 고정한다.
# 데이터를 랜덤하게 섞은 후 학습하기 때문에 같은 결과를 보기 위해 고정한다.
tf.random.set_seed(1)

In [5]:
# 딥러닝 구조를 설정한다. 
model = Sequential()

# 첫번째 은닉층
# 노드는 30개로 설정했으며, 활성화 함수는 relu로 설정
# 첫번째 은닉층에는 입력층의 노드의 개수를 설정해야 한다.  (input_dim)
model.add(Dense(30, input_dim=17))
model.add(Activation("relu"))

# 출력 층 설정. 노드는 1개로 설정했으며 활성화 함수는 sigmoid로 설정했다. 
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [6]:
# 딥러닝 모델 구조를 컴파일한다.
# 딥러닝은 오차를 복기하여 수정한다. 
# loss : 진짜 결과와 오차가 얼마나 발생했는지 계산하는 함수를 설정
# optimizer : 오차를 수정하기 위해서 사용하는 함수이다.
# metrics : 성능을 평가하기 위해 평가 지표를 설정한다.
model.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 30)                540       
_________________________________________________________________
activation (Activation)      (None, 30)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 31        
_________________________________________________________________
activation_1 (Activation)    (None, 1)                 0         
Total params: 571
Trainable params: 571
Non-trainable params: 0
_________________________________________________________________


In [7]:
# 학습 
# epochs : 학습의 횟수 (가중치 보정을 위해 학습을 시도하는 횟수) 
#          수치가 크면 학습을 더 많이하기떄문에 성능이 좋아지긴 하지만
    #      학습을 많이 하면 무적권 과적합이 뜬다. 근데 무조건 많이 줘라. 방법이 있음 ㅎ
# batch_size : 학습 시 메모리에 올릴 행의 개수를 의미한다.
#          데이터가 너무 많으면 메모리가 부족할 수 있다. 이때 batch_size를 줄여주면 
        # 학습 시간은 좀 더 걸리지만 메모리 부족현상을 막을 수 있다.
model.fit(X, y, epochs=30, batch_size=10)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x25f9c61fca0>

In [8]:
# 예측한다.(예측 결과를 1일 확률로 나온다.)
# model.predict_classes(X)  # 요거는 더이상 쓰면 안된다. 
pred1 = model.predict(X)
pred1

array([[1.02201361e-14],
       [4.06070054e-01],
       [4.66267377e-01],
       [3.72807682e-01],
       [2.67239362e-01],
       [1.61080077e-01],
       [8.93964544e-02],
       [3.37797850e-01],
       [5.51190898e-02],
       [8.24928284e-05],
       [2.54224597e-05],
       [8.45230534e-05],
       [4.03731883e-06],
       [2.20509100e-05],
       [4.39851000e-08],
       [2.90598996e-06],
       [2.16501257e-06],
       [3.10933660e-06],
       [1.81738017e-07],
       [9.22518097e-08],
       [8.37468335e-08],
       [3.13508401e-08],
       [3.01528011e-08],
       [7.24575622e-10],
       [5.04022768e-10],
       [3.03762820e-10],
       [1.81791665e-10],
       [7.01920883e-11],
       [8.36397618e-11],
       [9.26918698e-12],
       [1.58511388e-13],
       [3.90825508e-14],
       [1.37278249e-14],
       [3.37253219e-15],
       [1.21619648e-15],
       [3.08332581e-19],
       [1.97662007e-16],
       [7.28312743e-17],
       [1.14464777e-16],
       [4.24125014e-16],


In [12]:
result1 = (pred1 > 0.5).astype(int)
result1

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
    

In [10]:
# 진짜 결과와 얼마나 같은지 확인한다.
score1 = accuracy_score(y, result1)
score1

0.8531914893617021