# 1. Iris 데이터에 대해서 5겹 교차검증을 사용하여 분류하시오.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split, StratifiedKFold
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [2]:
seed=2020
np.random.seed(seed)
tf.random.set_seed(seed)

In [3]:
data=pd.read_csv('../dataset/iris.csv', names = ["sepal_length", "sepal_width", "petal_length",
                                              "petal_width", "species"])

In [4]:
iris=data.values
x=iris[:,0:4].astype(float)
y_obj=iris[:,4]

In [5]:
e = LabelEncoder().fit(y_obj)
y = e.transform(y_obj)

In [6]:
n_fold=5
skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
accuracy = []

In [7]:
for train, test in skf.split(x, y):
    model = Sequential([
        Dense(24, input_dim=4, activation='relu'),
        Dense(12, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy',
        optimizer='adam', metrics=['accuracy'])
    model.fit(x[train], y[train], epochs=100, batch_size=5, verbose=0)
    
    print(len(x[train]), len(x[test]), end='  ')
    k_accuracy = "%.4f" % (model.evaluate(x[test], y[test], verbose=0)[1])
    print(k_accuracy)
    accuracy.append(k_accuracy)

120 30  0.6667
120 30  0.3667
120 30  0.5000
120 30  0.4000
120 30  0.4667


In [8]:
acc = np.array(accuracy, dtype=np.float64)
np.mean(acc)

0.48001999999999995

In [9]:
test_data = np.array([5.0,3.0,4.0,2.0]).reshape(1, 4)

In [10]:
species = ['setosa','versicolor','virginica']
species[model.predict_classes(test_data)[0][0]]

'versicolor'

# 2. Fashion MNIST Dataset을 CNN으로 분류 하시오

In [11]:
import os
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [12]:
fashion_mnist = keras.datasets.fashion_mnist

(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

In [13]:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

In [14]:
unique, counts=np.unique(train_labels,return_counts=True)
dict(zip(unique,counts))

{0: 6000,
 1: 6000,
 2: 6000,
 3: 6000,
 4: 6000,
 5: 6000,
 6: 6000,
 7: 6000,
 8: 6000,
 9: 6000}

In [15]:
x_train = train_images.reshape(-1,28,28,1).astype("float32") / 255.0
x_test = test_images.reshape(-1,28,28,1).astype("float32") / 255.0

In [16]:
y_train=keras.utils.to_categorical(train_labels)
y_test=keras.utils.to_categorical(test_labels)

In [17]:
model = Sequential([
    Conv2D(32, kernel_size=(5, 5), input_shape=(28, 28,1), activation='relu'),
    MaxPooling2D(pool_size=2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=2),
    Flatten(),
    Dropout(0.25),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(10, activation='softmax')
])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 24, 24, 32)        832       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 12, 12, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 10, 10, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
flatten (Flatten)            (None, 1600)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1600)              0         
_________________________________________________________________
dense_15 (Dense)             (None, 128)              

In [18]:
model.compile(loss="categorical_crossentropy",
             optimizer='adam',
             metrics=['accuracy'])

In [19]:
MODEL_DIR='./model/'
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)

In [20]:
modelpath=MODEL_DIR+"fashion-cnn.hdf5"
checkpointer=ModelCheckpoint(filepath=modelpath,monitor="val_loss",
                            verbose=1, save_best_only=True)
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10)

In [21]:
history=model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=30,
                  batch_size=200,
                  callbacks=[early_stopping_callback, checkpointer])

Train on 60000 samples, validate on 10000 samples
Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.48507, saving model to ./model/fashion-cnn.hdf5
Epoch 2/30
Epoch 00002: val_loss improved from 0.48507 to 0.38575, saving model to ./model/fashion-cnn.hdf5
Epoch 3/30
Epoch 00003: val_loss improved from 0.38575 to 0.35420, saving model to ./model/fashion-cnn.hdf5
Epoch 4/30
Epoch 00004: val_loss improved from 0.35420 to 0.31920, saving model to ./model/fashion-cnn.hdf5
Epoch 5/30
Epoch 00005: val_loss improved from 0.31920 to 0.31688, saving model to ./model/fashion-cnn.hdf5
Epoch 6/30
Epoch 00006: val_loss improved from 0.31688 to 0.29022, saving model to ./model/fashion-cnn.hdf5
Epoch 7/30
Epoch 00007: val_loss improved from 0.29022 to 0.28032, saving model to ./model/fashion-cnn.hdf5
Epoch 8/30
Epoch 00008: val_loss improved from 0.28032 to 0.27858, saving model to ./model/fashion-cnn.hdf5
Epoch 9/30
Epoch 00009: val_loss improved from 0.27858 to 0.26861, saving model to ./model

Epoch 27/30
Epoch 00027: val_loss did not improve from 0.23911
Epoch 28/30
Epoch 00028: val_loss did not improve from 0.23911


In [22]:
del model
model = load_model('model/fashion-cnn.hdf5')
test_loss, test_acc = model.evaluate(x_test,  y_test, verbose=2)

print('\n테스트 정확도:', test_acc)

10000/10000 - 1s - loss: 0.2391 - accuracy: 0.9175

테스트 정확도: 0.9175


# 3. IMDB 영화 리뷰 데이터에 대하여 딥러닝을 이용하여 감성분석을 하시오.

In [23]:
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [24]:
df = pd.read_csv("../dataset/labeledTrainData.tsv", delimiter ='\t')
df

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...


In [25]:
imdb = df.values
x=imdb[:,2]
y=imdb[:,1]

In [26]:
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2,random_state=seed)

In [27]:
stop_words = set(stopwords.words('english')) 

result = []
for w in x_train: 
    if w not in stop_words: 
        result.append(w)
x_train = result
result = []
for w in x_test: 
    if w not in stop_words: 
        result.append(w)
x_test = result

In [28]:
dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(x_train)
x_test_dtm = dtmvector.transform(x_test)
y_train = np.array(y_train).astype("int")
y_test = np.array(y_test).astype("int")

In [29]:
model = MultinomialNB()
model.fit(x_train_dtm, y_train)

MultinomialNB()

In [30]:
predicted = model.predict(x_test_dtm) #테스트 데이터에 대한 예측
print("정확도: %.4f" % accuracy_score(y_test, predicted)) #예측값과 실제값 비교

정확도: 0.8614
