## INSTALL ANACONDA
1. Mac에 아나콘다(python + libraries package)를 설치합니다.  
아래 링크에서 설치파일을 다운로드 받고 설치를 진행해주세요.  
https://www.anaconda.com/download/#macos
2. 파이썬 버전을 확인해봅니다.  
\$ python -V  
Python 3.6.5 :: Anaconda, Inc.

## CONDA 가상환경 만들기
1. 실습을 위해 텐서플로우 및 기타 패키지들을 설치하기 위한 가상환경을 만듭니다.  
\$ conda create -n tensorflow
2. 이제 가상환경으로 들어가봅시다.  
\$ source activate tensorflow  
3. Prompt 모양이 아래처럼 (tensorflow)로 바뀐 것을 확인  
(tensorflow) ~/olaf/_

## TENSORFLOW 설치
\$ pip install --upgrade tensorflow  
또는  
\$ pip install --upgrade https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl

## 실습 코드 다운 받기
\$ cd \${실습_HOME}  
\$ git clone https://github.com/kidokim509/kaggle_titanic.git  
\$ cd kaggle_titanic

## JUPYTER NOTEBOOK 실행하기
\$ jupyter notebook

## 지금부터 TENSORFLOW 실습 시작!

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
# Show the version string of the installed TensorFlow package.
tf.VERSION

'1.8.0'

### TENSORFLOW API 종류
https://www.tensorflow.org/get_started/premade_estimators  
<img src="https://www.tensorflow.org/images/tensorflow_programming_environment.png" style="width: 70%">

### 학습 데이터 전처리
https://www.kaggle.com/c/titanic/data

In [3]:
# Load the training CSV, using the PassengerId column as the row index.
data = pd.read_csv('train.csv', index_col='PassengerId')
# Display summary statistics (count, mean, std, quartiles) per column.
data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# Preview the first 10 rows of the raw data.
data.head(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
# Drop the columns that are not used as features.
data = data.drop(columns=['Name', 'Ticket', 'Cabin'])

In [6]:
# Preview the first 10 rows to check which columns remain.
data.head(10)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S
6,0,3,male,,0,0,8.4583,Q
7,0,1,male,54.0,0,0,51.8625,S
8,0,3,male,2.0,3,1,21.075,S
9,1,3,female,27.0,0,2,11.1333,S
10,1,2,female,14.0,1,0,30.0708,C


In [7]:
# Discard every record whose Age or Embarked feature is missing.
data = data.dropna(subset=['Age', 'Embarked'])

# Encode the categorical string values as integers.
data = data.replace({"female": 0, "male": 1})
data = data.replace({"Q": 0, "C": 1, "S": 2})

# Remove the label column from the features and one-hot encode it
# into a two-column frame (column 0 / column 1).
data_y = pd.get_dummies(data.pop('Survived').values)

In [8]:
# Preview the first 10 rows of the cleaned, integer-encoded features.
data.head(10)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,1,22.0,1,0,7.25,2
2,1,0,38.0,1,0,71.2833,1
3,3,0,26.0,0,0,7.925,2
4,1,0,35.0,1,0,53.1,2
5,3,1,35.0,0,0,8.05,2
7,1,1,54.0,0,0,51.8625,2
8,3,1,2.0,3,1,21.075,2
9,3,0,27.0,0,2,11.1333,2
10,2,0,14.0,1,0,30.0708,1
11,3,0,4.0,1,1,16.7,2


In [9]:
# Preview the first 10 rows of the one-hot encoded labels.
data_y.head(10)

Unnamed: 0,0,1
0,1,0
1,0,1
2,0,1
3,0,1
4,1,0
5,1,0
6,1,0
7,0,1
8,0,1
9,0,1


In [10]:
# The columns differ widely in magnitude, so rescale each column by
# dividing it by its own mean.
data = data.div(data.mean(), axis='columns')
data.head(10)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1.339185,1.571744,0.742188,1.945355,0.0,0.209736,1.150242
2,0.446395,0.0,1.281961,1.945355,0.0,2.062163,0.575121
3,1.339185,0.0,0.877131,0.0,0.0,0.229263,1.150242
4,0.446395,0.0,1.180753,1.945355,0.0,1.536136,1.150242
5,1.339185,1.571744,1.180753,0.0,0.0,0.232879,1.150242
7,0.446395,1.571744,1.821734,0.0,0.0,1.500336,1.150242
8,1.339185,1.571744,0.067472,5.836066,2.311688,0.609681,1.150242
9,1.339185,0.0,0.910867,0.0,4.623377,0.322077,1.150242
10,0.89279,0.0,0.472301,1.945355,0.0,0.869922,0.575121
11,1.339185,0.0,0.134943,1.945355,2.311688,0.483116,1.150242


In [11]:
# Train / validation split: the first 80% of the rows (in file order, no
# shuffling) are used for training and the remaining rows for validation.
train_size = round(len(data) * 0.8)

# Features and labels are cut at the same row position.
train_set_x, valid_set_x = data.iloc[:train_size], data.iloc[train_size:]
train_set_y, valid_set_y = data_y.iloc[:train_size], data_y.iloc[train_size:]

### Multi Layer Perceptron 만들기
<img src="./mlp1.png" style="width: 40%">

### What is a tensor?
https://www.tensorflow.org/programmers_guide/tensors

* Tensor: A tensor is a generalization of vectors and matrices to potentially higher dimensions.
* Rank: Tensor의 차원 수  
0: Scalar (magnitude only)  
1: Vector (magnitude and direction)  
2: Matrix (table of numbers)  
3: 3-Tensor (cube of numbers)  
n: n-Tensor (you get the idea)  
* Shape: Tensor의 사이즈  
Higher-rank Tensors, similarly, consist of an n-dimensional array. For example, during image processing, many tensors of rank 4 are used, with dimensions corresponding to example-in-batch, image width, image height, and color channel.

In [12]:
# Input layer: float32 rows of 7 feature values; the batch dimension is
# left open (None).
X = tf.placeholder(dtype=tf.float32, shape=[None, 7], name='input')

# Ground-truth labels: one 2-element vector per input row.
Y = tf.placeholder(dtype=tf.float32, shape=[None, 2], name='output')

In [13]:
# Display the placeholder to inspect its name, shape, and dtype.
X

<tf.Tensor 'input:0' shape=(?, 7) dtype=float32>

In [14]:
# Hidden layer 1: 7 inputs -> 10 units.
# Weights are drawn uniformly between -1 and 1; biases start at zero.
W1 = tf.Variable(tf.random_uniform([7, 10], minval=-1., maxval=1.), name='hidden1_weight')
B1 = tf.Variable(tf.zeros([10]), name='hidden1_bias')
# Affine transform (X*W + B) followed by the ReLU activation.
L1 = tf.nn.relu(tf.matmul(X, W1) + B1)

# Hidden layer 2: 10 units -> 2 units (one per class).
W2 = tf.Variable(tf.random_uniform([10, 2], minval=-1., maxval=1.), name='hidden2_weight')
B2 = tf.Variable(tf.zeros([2]), name='hidden2_bias')
# Affine transform (L1*W + B) followed by the ReLU activation.
# NOTE(review): L2 is what gets fed to softmax; applying ReLU here keeps both
# values non-negative before the softmax. Consider dropping this activation
# if the model underfits -- confirm against the intended architecture.
L2 = tf.nn.relu(tf.matmul(L1, W2) + B2)

In [15]:
# Output layer operation: softmax over the two values of L2, giving one
# probability per class for each input row.
# (https://www.tensorflow.org/api_docs/python/tf/nn)
prediction = tf.nn.softmax(L2)

In [16]:
# Cost/loss function: the mean over all inputs of the softmax cross-entropy,
#   -sum(Y * log(softmax(L2)), axis=1).
# The fused op computes this directly from the pre-softmax values (L2), so it
# stays finite even when a softmax probability underflows to 0; the
# hand-written form evaluates tf.log(0) = -inf there and yields inf/NaN.
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y, logits=L2)
)

In [17]:
# OPTIMIZER: https://www.tensorflow.org/api_docs/python/tf/train
# Plain gradient descent with a fixed learning rate.
lr = 0.01
# `optimizer` is the op returned by minimize(); each sess.run() of it applies
# one gradient-descent update that reduces `cost`.
optimizer = tf.train.GradientDescentOptimizer(lr).minimize(cost)

In [18]:
# Accuracy: the fraction of rows whose most probable predicted class equals
# the class marked in the label vector.
is_correct = tf.equal(tf.argmax(prediction, axis=1), tf.argmax(Y, axis=1))
accuracy = tf.reduce_mean(tf.cast(is_correct, dtype=tf.float32))

In [19]:
# Finally, train the network!
# The session is intentionally left open: later cells reuse `sess`.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
max_epoch = 1000

# The whole training set is fed on every step, so both feeds can be built once.
train_feed = {X: train_set_x, Y: train_set_y}
valid_feed = {X: valid_set_x, Y: valid_set_y}

for epoch in range(max_epoch):
    sess.run(optimizer, feed_dict=train_feed)
    step = epoch + 1  # 1-based epoch number used for reporting

    # Report the cost on both sets every 100 epochs.
    if step % 100 == 0:
        train_cost = sess.run(cost, feed_dict=train_feed)
        valid_cost = sess.run(cost, feed_dict=valid_feed)
        print("epoch: {}, train cost: {}, validation cost: {}".format(
            step, train_cost, valid_cost
        ))
    # After the final epoch, report the accuracy on the validation set.
    if step == max_epoch:
        valid_accuracy = sess.run(accuracy, feed_dict=valid_feed)
        print("validation accuracy: {}%".format(round(100*valid_accuracy, 2)))

epoch: 100, train cost: 0.6658826470375061, validation cost: 0.6169701814651489
epoch: 200, train cost: 0.651698112487793, validation cost: 0.5951818823814392
epoch: 300, train cost: 0.6418927311897278, validation cost: 0.5809418559074402
epoch: 400, train cost: 0.6327864527702332, validation cost: 0.5683927536010742
epoch: 500, train cost: 0.6229400038719177, validation cost: 0.5565431714057922
epoch: 600, train cost: 0.6125432252883911, validation cost: 0.5450144410133362
epoch: 700, train cost: 0.6037550568580627, validation cost: 0.53626549243927
epoch: 800, train cost: 0.5951724052429199, validation cost: 0.5286828875541687
epoch: 900, train cost: 0.5872390270233154, validation cost: 0.5222192406654358
epoch: 1000, train cost: 0.5803140997886658, validation cost: 0.5168492197990417
validation accuracy: 61.27%


### Test Set으로 예측 정확도(Accuracy)를 확인해보고 싶다!

In [None]:
def preprocess(df, means=None):
    """Clean, encode, and normalize a raw Titanic DataFrame.

    Applies the same steps as the training-data cells: drops the Name,
    Ticket, and Cabin columns, removes incomplete rows, encodes the
    categorical strings as numbers, and divides every column by a
    per-column scale.

    Args:
        df: Raw DataFrame containing at least the Name, Ticket, and Cabin
            columns. It is not modified.
        means: Optional per-column divisors indexed by column name, for
            example the column means of the training features, so that a
            held-out set is scaled the same way as the training set. When
            omitted, the column means of ``df`` itself are used.

    Returns:
        A new float64 DataFrame without missing values.

    Raises:
        ValueError: If a non-numeric value other than the known Sex and
            Embarked codes remains after encoding.
    """
    df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)
    # Drop rows with a missing value in ANY remaining column, not only Age
    # and Embarked: a NaN in another feature (e.g. Fare) would otherwise be
    # fed to the network and turn its predictions into NaN.
    df = df.dropna()
    df = df.replace(["female", "male"], [0, 1])
    df = df.replace(["Q", "C", "S"], [0, 1, 2])
    # Make the numeric dtype explicit instead of relying on replace() to
    # downcast the former string columns.
    df = df.astype('float64')
    scale = df.mean() if means is None else means
    return df / scale

# Load the test CSV (indexed by PassengerId) and run it through preprocess().
test = preprocess(pd.read_csv('test.csv', index_col='PassengerId'))
test.describe()

In [None]:
# Run the trained network on the preprocessed test rows; returns the softmax
# output (one probability per class) for each row.
sess.run(prediction, feed_dict={X: test})

## 정확도를 높여보자~!
1. Hyper Parameter 조절: Learning Rate, # of Epochs...
2. Hidden Layer 추가  
...