In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
import matplotlib.pyplot as plt

In [2]:
# Load the training and test sets from their CSV files.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./data/train.csv', './data/test.csv')
)

In [3]:
print(train_df.shape)
# The shape has the form (batch, columns): one row per sample.
train_df

(60000, 786)


Unnamed: 0,index,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,6,0,0,0,0,0,0,0,5,...,0,0,0,30,43,0,0,0,0,0
3,3,0,0,0,0,1,2,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,4,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,59995,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59996,59996,1,0,0,0,0,0,0,0,0,...,73,0,0,0,0,0,0,0,0,0
59997,59997,8,0,0,0,0,0,0,0,0,...,160,162,163,135,94,0,0,0,0,0
59998,59998,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Display the test DataFrame to inspect its columns.
test_df

Unnamed: 0,index,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,0,0,0,0,0,0,0,0,9,8,...,103,87,56,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,34,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,14,53,99,...,0,0,0,0,63,53,31,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,137,126,140,0,133,224,222,56,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,0,0,0,0,0,0,0,0,0,...,32,23,14,20,0,0,1,0,0,0
9996,9996,0,0,0,0,0,0,0,0,0,...,0,0,0,2,52,23,28,0,0,0
9997,9997,0,0,0,0,0,0,0,0,0,...,175,172,172,182,199,222,42,0,1,0
9998,9998,0,1,3,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [5]:
# Training needs floating-point inputs, so convert both frames to float32 arrays.
# Column 0 (the index column) is left out of both.
train_data = train_df.iloc[:, 1:].to_numpy(dtype='float32')
test_data = test_df.iloc[:, 1:].to_numpy(dtype='float32')

In [6]:
# Column 0 of train_data is the label; the remaining columns are the features.
y_train, x_train = train_data[:, 0], train_data[:, 1:]

# test_data is used as-is: every column is treated as a feature.
x_test = test_data

In [7]:
# Hold out a validation set for a more reliable evaluation;
# it is mainly used when searching for good hyperparameters.
from sklearn.model_selection import train_test_split

x_train, x_validate, y_train, y_validate = train_test_split(
    x_train,
    y_train,
    test_size=0.2,        # 20% of the rows go to the validation set
    random_state=12345,   # fixed seed so the split is repeatable
)

print(*(split.shape for split in (x_train, x_validate)))


(48000, 784) (12000, 784)


In [8]:
# Reshape each flat row of 784 pixels back into a 28x28 image.
# The trailing axis of size 1 is the channel axis, so the arrays explicitly
# match the model's declared input_shape=(28, 28, 1) instead of relying on
# Keras to insert the missing axis.

x_train = x_train.reshape(-1, 28, 28, 1)

x_validate = x_validate.reshape(-1, 28, 28, 1)

x_test = x_test.reshape(-1, 28, 28, 1)



In [9]:
# Build the model
from multiprocessing import pool  # NOTE(review): not used in the visible cells — confirm before removing
from keras.models import Sequential
from keras.layers import Dense,BatchNormalization,Dropout,Activation
from keras.layers import Conv2D,MaxPooling2D,Flatten # image-related layers


nn_model = Sequential([
    # Block 1: channels 1 -> 32, spatial size 28x28 -> 14x14.
    # input_shape is only needed on the first layer; later layers infer theirs.
    Conv2D(filters=32, kernel_size=(3, 3), input_shape=(28, 28, 1), padding='same'),
    MaxPooling2D(pool_size=2),  # 2x2 window; strides default to pool_size (2)
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.2),  # randomly zeroes 20% of the units during training

    # Block 2: channels 32 -> 32, spatial size 14x14 -> 7x7.
    Conv2D(filters=32, kernel_size=(3, 3), padding='same'),
    MaxPooling2D(pool_size=2),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.2),

    # Block 3: channels 32 -> 64, spatial size 7x7 -> 3x3.
    Conv2D(filters=64, kernel_size=(3, 3), padding='same'),
    MaxPooling2D(pool_size=2),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.2),
    Flatten(),  # flatten the feature maps into one vector per image
    Dense(512),
    BatchNormalization(),
    Activation('relu'),
    Dense(256),
    BatchNormalization(),
    Activation('relu'),
    # ReLU added here: without a non-linearity this layer and the output
    # layer would collapse into a single affine map before the softmax.
    Dense(128, activation='relu'),
    Dense(10, activation='softmax'),  # 128 -> 10: one probability per class
])
# The raw pixels are not hand-crafted features, but connecting layers deeply
# lets the network learn to extract features on its own.

In [10]:
nn_model.summary() # lets you check the output size of each layer

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 28, 28, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 14, 14, 32)       0         
 )                                                               
                                                                 
 batch_normalization (BatchN  (None, 14, 14, 32)       128       
 ormalization)                                                   
                                                                 
 activation (Activation)     (None, 14, 14, 32)        0         
                                                                 
 dropout (Dropout)           (None, 14, 14, 32)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 14, 14, 32)        9

In [11]:
# Smoke test: push one random 28x28 single-channel image through the model.
# The input carries an explicit channel axis to match input_shape=(28, 28, 1).
nn_model(np.random.rand(1, 28, 28, 1)).shape

TensorShape([1, 10])

In [12]:
# compile() is where we decide how the model's weights will be optimized.
from keras.optimizers import Adam

nn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Adam is the most well-known of the optimizers used to train the weights.
# The right learning rate depends on the data and the preprocessing; it has to
# be found by adjusting it yourself.
# metrics are the intermediate evaluation results shown during training.

# sparse_categorical_crossentropy takes integer class labels (0-9 here) and
# matches them against the model's 10-element output vector.

# categorical_crossentropy would instead expect one-hot labels
# (10-element vectors) to match against the 10-element output vector.


In [13]:
# Train for 30 epochs with mini-batches of 32, passing the held-out split as
# validation data.
nn_model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_validate, y_validate),
    epochs=30,
    batch_size=32,
    verbose=1,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x2e2295eb0d0>

In [14]:
# Run the model on the test images to get per-class probabilities.
y_pred = nn_model.predict(x=x_test)



In [15]:
y_pred.shape
# For each row, the probabilities are written out across 10 slots (one per class).

(10000, 10)

In [16]:
# Inspect the predicted probabilities.
y_pred

array([[9.9986780e-01, 1.5257410e-09, 1.3501657e-06, ..., 2.7550735e-08,
        1.0020402e-06, 2.1747737e-09],
       [2.7018580e-20, 1.0000000e+00, 9.0104284e-24, ..., 9.0769412e-24,
        1.5010866e-22, 4.2881899e-21],
       [5.4105936e-04, 2.4505408e-07, 9.8815697e-01, ..., 3.7123689e-07,
        7.2105951e-07, 4.1997623e-07],
       ...,
       [4.8753522e-11, 1.6068858e-12, 1.9737674e-13, ..., 3.3960207e-12,
        1.0000000e+00, 5.3064549e-12],
       [6.9339594e-06, 2.0489825e-08, 6.7479142e-09, ..., 1.2733983e-07,
        9.9997067e-01, 1.3947000e-05],
       [1.5339785e-07, 9.9999845e-01, 5.4894368e-08, ..., 4.2502553e-09,
        9.5667843e-08, 3.9063828e-08]], dtype=float32)

In [17]:
# Start from the sample submission file and fill in the predicted labels.
submission = pd.read_csv(
    filepath_or_buffer='./data/sample_submission.csv',
    encoding='utf-8',
)
# The predicted class is the column index holding the highest probability.
submission['label'] = y_pred.argmax(axis=-1)
submission.to_csv(path_or_buf='fashion_submission.csv', index=False)

In [18]:
# Largest label value written to the submission.
submission['label'].max()

9

In [19]:

# Display the submission table.
submission

Unnamed: 0,index,label
0,0,0
1,1,1
2,2,2
3,3,2
4,4,3
...,...,...
9995,9995,0
9996,9996,6
9997,9997,8
9998,9998,8


점수를 높일 수 있는 시도들
1. EPOCH 변화
2. 레이어 변화
3. learning rate 변화
4. ensemble