# 만들어진 Doc2Vec model을 통한 감정분석 실시
> * Positive or Negative

In [1]:
import pickle
import html
import multiprocessing
from collections import namedtuple, OrderedDict
import re
import sys
import os

os.environ['KERAS_BACKEND']='tensorflow'

import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pandas as pd

from gensim.models import doc2vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

from konlpy.utils import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.manifold import TSNE

import keras.backend.tensorflow_backend as K
from keras.preprocessing import sequence
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers import Input, Flatten, Dense, Embedding, embeddings, merge, Dropout, Activation,  LSTM, Bidirectional, SimpleRNN, GRU
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import SpatialDropout1D
from keras.utils import np_utils
from tensorflow.python.client import device_lib
from keras.layers.merge import dot

import matplotlib.pyplot as plt

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
print (device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17427044518938657962
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 4969044377
locality {
  bus_id: 1
}
incarnation: 17760644295380746286
physical_device_desc: "device: 0, name: GeForce GTX 1060 6GB, pci bus id: 0000:09:00.0, compute capability: 6.1"
]


In [3]:
def Make_Roc_Curve(x, y, model1, model2, model3):
    fpr1, tpr1, thresholds1 = roc_curve(y, model1.predict(x))
    fpr2, tpr2, thresholds2 = roc_curve(y, model2.predict(x))
    fpr3, tpr3, thresholds3 = roc_curve(y, model3.predict(x))
    plt.plot(fpr1, tpr1, label="Logistic Regression")
    plt.plot(fpr2, tpr2, label="RandomForest")
    plt.plot(fpr3, tpr3, label="Kernel SVM")
    plt.legend()
    plt.plot([0, 1], [0, 1], 'k--', label="random guess")
    plt.xlabel('False Positive Rate (Fall-Out)')
    plt.ylabel('True Positive Rate (Recall)')
    plt.title('Receiver operating characteristic example')
    plt.show()

In [4]:
def plot_history(history):
    """Plot model history after `fit()`.
    """

    # summarize history for accuracy
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'], loc='upper left')
    plt.show()

    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'], loc='upper left')
    plt.show()

In [5]:
import gensim
gensim.__version__

'3.3.0'

* windows에서 모델을 만들때 사용한 gensim의 버전이 3.3이었고, mac에서는 버전이 맞지 않아서 만들어둔 모델을 불러오지 못하였음.

## Doc2Vec

In [6]:
TaggedDocument = namedtuple('TaggedDocument', 'words tags sentiment')

In [7]:
def Get_Infer_Vector(docs, model):
    return [model.infer_vector(doc.words) for doc in tqdm(docs)]

In [8]:
if sys.platform =='darwin':
    loadModelPath = '/Volumes/disk1/model/'
elif sys.platform =='win32':
    loadModelPath = 'd:/model/'
saveTrainPath = './data/pre_data/train_test_Data2/'
saveClassifierPath = './data/pre_data/classifier/'

In [9]:
cores = int(multiprocessing.cpu_count() )
print (cores)

12


### twitter

#### Load Model

In [10]:
model1 = doc2vec.Doc2Vec.load(loadModelPath+'doc2vec_size-1000_epoch-20_window-10_negative-7_hs-0_dm-1_dm_concat-0_dm_mean-1_by-ct.model')
model2 = doc2vec.Doc2Vec.load(loadModelPath+'doc2vec_size-1000_epoch-20_window-5_negative-7_hs-0_dm-1_dm_concat-1_dm_mean-0_by-ct.model')
model3 = doc2vec.Doc2Vec.load(loadModelPath+'doc2vec_size-1000_epoch-20_window-None_negative-7_hs-0_dm-0_dm_concat-0_dm_mean-0_by-ct.model')

#### Word to Vector

##### train set

In [11]:
train = pickle.load(open('./data/pre_data/train_test_Data/pre_data_by_ct_train_for_doc2vec_sentiment_analysis.pickled','rb'))

In [12]:
name1 = '-'.join(re.split('[\(\),\/]',str(model1)))+'ct'
if not os.path.isfile(saveTrainPath+'train_x_'+name1):
    train_x_by_m1 = Get_Infer_Vector(train, model1)
    
    pickle.dump(train_x_by_m1,open(saveTrainPath+'train_x_'+name1,'wb'))
    del train_x_by_m1
    del name1
    
name2 = '-'.join(re.split('[\(\),\/]',str(model2)))+'ct'
if not os.path.isfile(saveTrainPath+'train_x_'+name2):
    train_x_by_m2 = Get_Infer_Vector(train, model2)

    pickle.dump(train_x_by_m2,open(saveTrainPath+'train_x_'+name2,'wb'))
    del train_x_by_m2
    del name2

name3 = '-'.join(re.split('[\(\),\/]',str(model3)))+'ct'
if not os.path.isfile(saveTrainPath+'train_x_'+name3):
    train_x_by_m3 = Get_Infer_Vector(train, model3)

    pickle.dump(train_x_by_m3,open(saveTrainPath+'train_x_'+name3,'wb'))
    del train_x_by_m3
    del name3

if not os.path.isfile(saveTrainPath+'train_senti_y_ct_for_sentiment_analysis'):
    train_senti_y = [doc.sentiment for doc in tqdm(train)]
    
    pickle.dump(train_senti_y, open(saveTrainPath+'train_senti_y_ct_for_sentiment_analysis','wb'))
    del train_senti_y
    
if not os.path.isfile(saveTrainPath+'train_tags_y_ct_for_sentiment_analysis'):
    train_tags_y = [doc.tags for doc in tqdm(train)]
    
    pickle.dump(train_tags_y, open(saveTrainPath+'train_tags_y_ct_for_sentiment_analysis','wb'))
    del train_tags_y

if 'train' in locals():
    del train

##### test set

In [13]:
test = pickle.load(open('./data/pre_data/train_test_Data/pre_data_by_ct_test_for_doc2vec_sentiment_analysis.pickled','rb'))

In [14]:
name1 = '-'.join(re.split('[\(\),\/]',str(model1)))+'ct'
if not os.path.isfile(saveTrainPath+'test_x_'+name1):
    test_x_by_m1 = Get_Infer_Vector(test, model1)
    
    pickle.dump(test_x_by_m1,open(saveTrainPath+'test_x_'+name1,'wb'))
    del test_x_by_m1
    del name1
    
name2 = '-'.join(re.split('[\(\),\/]',str(model2)))+'ct'
if not os.path.isfile(saveTrainPath+'test_x_'+name2):
    test_x_by_m2 = Get_Infer_Vector(test, model2)

    pickle.dump(test_x_by_m2,open(saveTrainPath+'test_x_'+name2,'wb'))
    del test_x_by_m2
    del name2

name3 = '-'.join(re.split('[\(\),\/]',str(model3)))+'ct'
if not os.path.isfile(saveTrainPath+'test_x_'+name3):
    test_x_by_m3 = Get_Infer_Vector(test, model3)

    pickle.dump(test_x_by_m3,open(saveTrainPath+'test_x_'+name3,'wb'))
    del test_x_by_m3
    del name3

if not os.path.isfile(saveTrainPath+'test_senti_y_ct_for_sentiment_analysis'):
    test_senti_y = [doc.sentiment for doc in tqdm(test)]
    
    pickle.dump(test_senti_y, open(saveTrainPath+'test_senti_y_ct_for_sentiment_analysis','wb'))
    del test_senti_y
    
if not os.path.isfile(saveTrainPath+'test_tags_y_ct_for_sentiment_analysis'):
    test_tags_y = [doc.tags for doc in tqdm(test)]
    
    pickle.dump(test_tags_y, open(saveTrainPath+'test_tags_y_ct_for_sentiment_analysis','wb'))
    del test_tags_y

if 'test' in locals():
    del test

#### model1
* Doc2Vec(dm/m,d1000,n7,w10,mc5,s0.001,t12)
* size-1000
* epoch-20
* window-10
* negative-7
* hs-0
* dm-1
* dm_concat-0
* dm_mean-1

In [15]:
train_x_by_m1_name = 'train_x_Doc2Vec-dm-m-d1000-n7-w10-mc5-s0.001-t12-ct'
train_x_by_m1 = pickle.load(open(saveTrainPath+train_x_by_m1_name,'rb'))
train_y = pickle.load(open(saveTrainPath+'train_senti_y_ct_for_sentiment_analysis','rb'))

In [16]:
test_x_by_m1_name = 'test_x_Doc2Vec-dm-m-d1000-n7-w10-mc5-s0.001-t12-ct'
test_x_by_m1 = pickle.load(open(saveTrainPath+test_x_by_m1_name,'rb'))
test_y = pickle.load(open(saveTrainPath+'test_senti_y_ct_for_sentiment_analysis','rb'))

#### 분류 모델 : Logistic Regression

In [17]:
train_y2 = [y[0] for y in tqdm(train_y)]
test_y2 = [y[0] for y in tqdm(test_y)]

100%|██████████| 442359/442359 [00:00<00:00, 1903760.78it/s]
100%|██████████| 49151/49151 [00:00<00:00, 1691702.38it/s]


In [None]:
%%time
classifier = LogisticRegression(random_state=1234, n_jobs=cores)
classifier.fit(train_x_by_m1, train_y2)
print (classifier.get_params())
print( 'score : {}'.format(classifier.score(test_x_by_m1, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier.predict(test_x_by_m1)))

In [None]:
pickle.dump(classifier,open(saveClassifierPath+'LogisticRegression_'+train_x_by_m1_name, 'wb'))

#### 분류모델 : RandomForest Classifier

In [None]:
%%time
classifier2 = RandomForestClassifier(random_state=1234, n_jobs=cores)
classifier2.fit(train_x_by_m1, train_y2)
print (classifier2.get_params())
print( 'score : {}'.format(classifier2.score(test_x_by_m1, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier2.predict(test_x_by_m1)))

In [None]:
pickle.dump(classifier2,open(saveClassifierPath+'RandomForestClassifier_'+train_x_by_m1_name, 'wb'))

#### 분류모델 : C-Support Vector Classification

In [None]:
%%time
scaling = MinMaxScaler(feature_range=(-1, 1)).fit(train_x_by_m1)
train_x_by_m1_2 = scaling.transform(train_x_by_m1)
test_x_by_m1_2 = scaling.transform(test_x_by_m1)
classifier3 =  SVC(kernel = 'linear', 
        cache_size= 10000) 
classifier3.fit(train_x_by_m1_2, train_y2)
print (classifier3.get_params())
print( 'score : {}'.format(classifier3.score(test_x_by_m1_2, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier3.predict(test_x_by_m1_2)))

In [None]:
del scaling
del train_x_by_m1_2
del test_x_by_m1_2

In [None]:
pickle.dump(classifier3,open(saveClassifierPath+'SVC_'+train_x_by_m1_name, 'wb'))

In [None]:
Make_Roc_Curve(test_x_by_m1, test_y, classifier, classifier2, classifier3)

In [None]:
del classifier
del classifier2
del classifier3

#### 분류모델 :  Neural Network

In [18]:
train_vecs_w2v = np.concatenate([z.reshape(1,-1) for z in tqdm(map(lambda x: x,train_x_by_m1))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([z.reshape(1,-1) for z in tqdm(map(lambda x: x, test_x_by_m1))])
test_vecs_w2v = scale(test_vecs_w2v)

y_test = np.array(test_y)
y_train = np.array(train_y)

442359it [00:00, 704456.41it/s]
49151it [00:00, 743592.17it/s]


In [None]:
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=1000))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(train_vecs_w2v, y_train, epochs=200, batch_size = 250000,  verbose=0,
          validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_1_'+train_x_by_m1_name)

In [None]:
plot_history(history)

In [None]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=1000))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(#optimizer='rmsprop',
    optimizer='adadelta',
              #loss='binary_crossentropy',
    #optimizer=SGD(lr=0.2), 
    loss='binary_crossentropy',
    metrics=['accuracy'])

history = model.fit(train_vecs_w2v, y_train, epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_2_'+train_x_by_m1_name)

In [None]:
plot_history(history)

In [None]:
model = Sequential()
model.add(Dense(256, activation='relu', input_dim=1000))
model.add(Dropout(0.25))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.125))
model.add(Dense(1, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train model
history = model.fit(train_vecs_w2v, y_train,epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)] )
# Evaluate model
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)

print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_3_'+train_x_by_m1_name)

In [None]:
plot_history(history)

In [None]:
train_vecs_w2v_2 = train_vecs_w2v.reshape(train_vecs_w2v.shape[0], train_vecs_w2v.shape[1], 1)
test_vecs_w2v_2 = test_vecs_w2v.reshape(test_vecs_w2v.shape[0], test_vecs_w2v.shape[1], 1)

print (train_vecs_w2v_2.shape, test_vecs_w2v_2.shape)

In [None]:
model = Sequential()
model.add(Conv1D(filters = 5, kernel_size = 3,
                 activation='relu', input_shape = [1000, 1]
                ))
model.add(MaxPooling1D(3))

model.add(Conv1D(filters=5, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(3))

model.add(Conv1D(filters=5, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Flatten())

# We add a vanilla hidden layer:
model.add(Dense(10))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(train_vecs_w2v_2, y_train,
                    
                    epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v_2, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_4_'+train_x_by_m1_name)

In [None]:
plot_history(history)

In [None]:
del train_x_by_m1
del train_x_by_m1_name
del test_x_by_m1
del test_x_by_m1_name

#### model2
* Doc2Vec(dm/c,d1000,n7,w5,mc5,s0.001,t12)
* size-1000
* epoch-20
* window-5
* negative-7
* hs-0
* dm-1
* dm_concat-1
* dm_mean-0

In [None]:
train_x_by_m2_name = 'train_x_Doc2Vec-dm-c-d1000-n7-w5-mc5-s0.001-t12-ct'
train_x_by_m2 = pickle.load(open(saveTrainPath+train_x_by_m2_name,'rb'))
train_y = pickle.load(open(saveTrainPath+'train_senti_y_ct_for_sentiment_analysis','rb'))

In [None]:
test_x_by_m2_name = 'test_x_Doc2Vec-dm-c-d1000-n7-w5-mc5-s0.001-t12-ct'
test_x_by_m2 = pickle.load(open(saveTrainPath+test_x_by_m2_name,'rb'))
test_y = pickle.load(open(saveTrainPath+'test_senti_y_ct_for_sentiment_analysis','rb'))

In [None]:
train_y2 = [y[0] for y in tqdm(train_y)]
test_y2 = [y[0] for y in tqdm(test_y)]

#### 분류 모델 : Logistic Regression

In [None]:
%%time
classifier = LogisticRegression(random_state=1234)
classifier.fit(train_x_by_m2, train_y2)
print (classifier.get_params())
print( 'score : {}'.format(classifier.score(test_x_by_m2, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier.predict(test_x_by_m2)))

In [None]:
pickle.dump(classifier,open(saveClassifierPath+'LogisticRegression_'+train_x_by_m2_name, 'wb'))

#### 분류모델 : RandomForest Classifier

In [None]:
%%time
classifier2 = RandomForestClassifier(random_state=1234)
classifier2.fit(train_x_by_m2, train_y2)
print (classifier2.get_params())
print( 'score : {}'.format(classifier2.score(test_x_by_m2, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier2.predict(test_x_by_m2)))

In [None]:
pickle.dump(classifier2,open(saveClassifierPath+'RandomForestClassifier_'+train_x_by_m2_name, 'wb'))

#### 분류모델 : C-Support Vector Classification

In [None]:
%%time
scaling = MinMaxScaler(feature_range=(-1, 1)).fit(train_x_by_m2)
train_x_by_m2_2 = scaling.transform(train_x_by_m2)
test_x_by_m2_2 = scaling.transform(test_x_by_m2)
classifier3 =  SVC(kernel = 'linear', 
        cache_size= 10000) 
classifier3.fit(train_x_by_m2_2, train_y2)
print (classifier3.get_params())
print( 'score : {}'.format(classifier3.score(test_x_by_m2_2, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier3.predict(test_x_by_m2_2)))

In [None]:
del scaling
del train_x_by_m2_2
del test_x_by_m2_2

In [None]:
pickle.dump(classifier3,open(saveClassifierPath+'SVC_'+train_x_by_m2_name, 'wb'))

In [None]:
Make_Roc_Curve(test_x_by_m2, test_y, classifier, classifier2, classifier3)

In [None]:
del classifier
del classifier2
del classifier3

#### 분류모델 : Neural Network

In [None]:
train_vecs_w2v = np.concatenate([z.reshape(1,-1) for z in tqdm(map(lambda x: x,train_x_by_m2))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([z.reshape(1,-1) for z in tqdm(map(lambda x: x, test_x_by_m2))])
test_vecs_w2v = scale(test_vecs_w2v)

y_test = np.array(test_y)
y_train = np.array(train_y)

In [None]:
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=1000))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(train_vecs_w2v, y_train, epochs=200, batch_size = 250000,  verbose=0,
          validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_1_'+train_x_by_m2_name)

In [None]:
plot_history(history)

In [None]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=1000))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(#optimizer='rmsprop',
    optimizer='adadelta',
              #loss='binary_crossentropy',
    #optimizer=SGD(lr=0.2), 
    loss='binary_crossentropy',
    metrics=['accuracy'])

history = model.fit(train_vecs_w2v, y_train, epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_2_'+train_x_by_m2_name)

In [None]:
plot_history(history)

In [None]:
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=1000))
model.add(Dropout(0.25))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.125))
model.add(Dense(1, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train model
history = model.fit(train_vecs_w2v, y_train,epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)] )
# Evaluate model
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)

print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_3_'+train_x_by_m2_name)

In [None]:
plot_history(history)

In [None]:
train_vecs_w2v_2 = train_vecs_w2v.reshape(train_vecs_w2v.shape[0], train_vecs_w2v.shape[1], 1)
test_vecs_w2v_2 = test_vecs_w2v.reshape(test_vecs_w2v.shape[0], test_vecs_w2v.shape[1], 1)

print (train_vecs_w2v_2.shape, test_vecs_w2v_2.shape)

In [None]:
model = Sequential()
model.add(Conv1D(filters = 5, kernel_size = 3,
                 activation='relu', input_shape = [1000, 1]
                ))
model.add(MaxPooling1D(3))

model.add(Conv1D(filters=5, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(3))

model.add(Conv1D(filters=5, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Flatten())

# We add a vanilla hidden layer:
model.add(Dense(10))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(train_vecs_w2v_2, y_train,
                    
                    epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v_2, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_4_'+train_x_by_m2_name)

In [None]:
plot_history(history)

In [None]:
del train_x_by_m2
del train_x_by_m2_name
del test_x_by_m2
del test_x_by_m2_name

#### model3
* Doc2Vec(dbow,d1000,n7,mc5,s0.001,t12)
* size-1000
* epoch-20
* window-None
* negative-7
* hs-0
* dm-0
* dm_concat-0
* dm_mean-0

In [None]:
train_x_by_m3_name = 'train_x_Doc2Vec-dbow-d1000-n7-mc5-s0.001-t12-ct'
train_x_by_m3 = pickle.load(open(saveTrainPath+train_x_by_m3_name,'rb'))
train_y = pickle.load(open(saveTrainPath+'train_senti_y_ct_for_sentiment_analysis','rb'))

In [None]:
test_x_by_m3_name = 'test_x_Doc2Vec-dbow-d1000-n7-mc5-s0.001-t12-ct'
test_x_by_m3 = pickle.load(open(saveTrainPath+test_x_by_m3_name,'rb'))
test_y = pickle.load(open(saveTrainPath+'test_senti_y_ct_for_sentiment_analysis','rb'))

In [None]:
train_y2 = [y[0] for y in tqdm(train_y)]
test_y2 = [y[0] for y in tqdm(test_y)]

#### 분류 모델 : Logistic Regression

In [None]:
%%time
classifier = LogisticRegression(random_state=1234)
classifier.fit(train_x_by_m3, train_y2)
print (classifier.get_params())
print( 'score : {}'.format(classifier.score(test_x_by_m3, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier.predict(test_x_by_m3)))

In [None]:
pickle.dump(classifier,open(saveClassifierPath+'LogisticRegression_'+train_x_by_m3_name, 'wb'))

#### 분류모델 : RandomForest Classifier

In [None]:
%%time
classifier2 = RandomForestClassifier(random_state=1234)
classifier2.fit(train_x_by_m3, train_y2)
print (classifier2.get_params())
print( 'score : {}'.format(classifier2.score(test_x_by_m3, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier2.predict(test_x_by_m3)))

In [None]:
pickle.dump(classifier2,open(saveClassifierPath+'RandomForestClassifier_'+train_x_by_m3_name, 'wb'))

#### 분류모델 : C-Support Vector Classification

In [None]:
%%time
scaling = MinMaxScaler(feature_range=(-1, 1)).fit(train_x_by_m3)
train_x_by_m3_2 = scaling.transform(train_x_by_m3)
test_x_by_m3_2 = scaling.transform(test_x_by_m3)
classifier3 =  SVC(kernel = 'linear', 
        cache_size= 10000) 
classifier3.fit(train_x_by_m3_2, train_y2)
print (classifier3.get_params())
print( 'score : {}'.format(classifier3.score(test_x_by_m3_2, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier3.predict(test_x_by_m3_2)))

In [None]:
del scaling
del train_x_by_m3_2
del test_x_by_m3_2

In [None]:
pickle.dump(classifier3,open(saveClassifierPath+'SVC_'+train_x_by_m3_name, 'wb'))

In [None]:
Make_Roc_Curve(test_x_by_m3, test_y, classifier, classifier2, classifier3)

In [None]:
del classifier
del classifier2
del classifier3

#### 분류모델 : Neural Network

In [None]:
train_vecs_w2v = np.concatenate([z.reshape(1,-1) for z in tqdm(map(lambda x: x,train_x_by_m3))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([z.reshape(1,-1) for z in tqdm(map(lambda x: x, test_x_by_m3))])
test_vecs_w2v = scale(test_vecs_w2v)

y_test = np.array(test_y)
y_train = np.array(train_y)

In [None]:
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=1000))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(train_vecs_w2v, y_train, epochs=200, batch_size = 250000,  verbose=0,
          validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_1_'+train_x_by_m3_name)

In [None]:
plot_history(history)

In [None]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=1000))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(#optimizer='rmsprop',
    optimizer='adadelta',
              #loss='binary_crossentropy',
    #optimizer=SGD(lr=0.2), 
    loss='binary_crossentropy',
    metrics=['accuracy'])

history = model.fit(train_vecs_w2v, y_train, epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_2_'+train_x_by_m3_name)

In [None]:
plot_history(history)

In [None]:
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=1000))
model.add(Dropout(0.25))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.125))
model.add(Dense(1, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train model
history = model.fit(train_vecs_w2v, y_train,epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)] )
# Evaluate model
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)

print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_3_'+train_x_by_m3_name)

In [None]:
plot_history(history)

In [None]:
train_vecs_w2v_2 = train_vecs_w2v.reshape(train_vecs_w2v.shape[0], train_vecs_w2v.shape[1], 1)
test_vecs_w2v_2 = test_vecs_w2v.reshape(test_vecs_w2v.shape[0], test_vecs_w2v.shape[1], 1)

print (train_vecs_w2v_2.shape, test_vecs_w2v_2.shape)

In [None]:
model = Sequential()
model.add(Conv1D(filters = 5, kernel_size = 3,
                 activation='relu', input_shape = [1000, 1]
                ))
model.add(MaxPooling1D(3))

model.add(Conv1D(filters=5, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(3))

model.add(Conv1D(filters=5, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Flatten())

# We add a vanilla hidden layer:
model.add(Dense(10))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(train_vecs_w2v_2, y_train,
                    
                    epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v_2, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_4_'+train_x_by_m3_name)

In [None]:
plot_history(history)

In [None]:
del train_x_by_m3
del train_x_by_m3_name
del test_x_by_m3
del test_x_by_m3_name

#### mecab

#### Load Model

In [None]:
model1 = doc2vec.Doc2Vec.load(loadModelPath+'doc2vec_size-1000_epoch-20_window-10_negative-7_hs-0_dm-1_dm_concat-0_dm_mean-1_by-mecab.model')
model2 = doc2vec.Doc2Vec.load(loadModelPath+'doc2vec_size-1000_epoch-20_window-5_negative-7_hs-0_dm-1_dm_concat-1_dm_mean-0_by-mecab.model')
model3 = doc2vec.Doc2Vec.load(loadModelPath+'doc2vec_size-1000_epoch-20_window-None_negative-7_hs-0_dm-0_dm_concat-0_dm_mean-0_by-mecab.model')

#### Word to Vector

##### train set

In [None]:
train = pickle.load(open('./data/pre_data/train_test_Data/pre_data_by_mecab_train_for_doc2vec_sentiment_analysis','rb'))

In [None]:
name1 = '-'.join(re.split('[\(\),\/]',str(model1)))+'mecab'
if not os.path.isfile(saveTrainPath+'train_x_'+name1):
    train_x_by_m1 = Get_Infer_Vector(train, model1)
    
    pickle.dump(train_x_by_m1,open(saveTrainPath+'train_x_'+name1,'wb'))
    del train_x_by_m1
    del name1
    
name2 = '-'.join(re.split('[\(\),\/]',str(model2)))+'mecab'
if not os.path.isfile(saveTrainPath+'train_x_'+name2):
    train_x_by_m2 = Get_Infer_Vector(train, model2)

    pickle.dump(train_x_by_m2,open(saveTrainPath+'train_x_'+name2,'wb'))
    del train_x_by_m2
    del name2

name3 = '-'.join(re.split('[\(\),\/]',str(model3)))+'mecab'
if not os.path.isfile(saveTrainPath+'train_x_'+name3):
    train_x_by_m3 = Get_Infer_Vector(train, model3)

    pickle.dump(train_x_by_m3,open(saveTrainPath+'train_x_'+name3,'wb'))
    del train_x_by_m3
    del name3

if not os.path.isfile(saveTrainPath+'train_senti_y_mecab_for_sentiment_analysis'):
    train_senti_y = [doc.sentiment for doc in tqdm(train)]
    
    pickle.dump(train_senti_y, open(saveTrainPath+'train_senti_y_mecab_for_sentiment_analysis','wb'))
    del train_senti_y
    
if not os.path.isfile(saveTrainPath+'train_tags_y_mecab_for_sentiment_analysis'):
    train_tags_y = [doc.tags for doc in tqdm(train)]
    
    pickle.dump(train_tags_y, open(saveTrainPath+'train_tags_y_mecab_for_sentiment_analysis','wb'))
    del train_tags_y

if 'train' in locals():
    del train

##### test set

In [None]:
test = pickle.load(open('./data/pre_data/train_test_Data/pre_data_by_mecab_test_for_doc2vec_sentiment_analysis','rb'))

In [None]:
name1 = '-'.join(re.split('[\(\),\/]',str(model1)))+'mecab'
if not os.path.isfile(saveTrainPath+'test_x_'+name1):
    test_x_by_m1 = Get_Infer_Vector(test, model1)
    
    pickle.dump(test_x_by_m1,open(saveTrainPath+'test_x_'+name1,'wb'))
    del test_x_by_m1
    del name1
    
name2 = '-'.join(re.split('[\(\),\/]',str(model2)))+'mecab'
if not os.path.isfile(saveTrainPath+'test_x_'+name2):
    test_x_by_m2 = Get_Infer_Vector(test, model2)

    pickle.dump(test_x_by_m2,open(saveTrainPath+'test_x_'+name2,'wb'))
    del test_x_by_m2
    del name2

name3 = '-'.join(re.split('[\(\),\/]',str(model3)))+'mecab'
if not os.path.isfile(saveTrainPath+'test_x_'+name3):
    test_x_by_m3 = Get_Infer_Vector(test, model3)

    pickle.dump(test_x_by_m3,open(saveTrainPath+'test_x_'+name3,'wb'))
    del test_x_by_m3
    del name3

if not os.path.isfile(saveTrainPath+'test_senti_y_mecab_for_sentiment_analysis'):
    test_senti_y = [doc.sentiment for doc in tqdm(test)]
    
    pickle.dump(test_senti_y, open(saveTrainPath+'test_senti_y_mecab_for_sentiment_analysis','wb'))
    del test_senti_y
    
if not os.path.isfile(saveTrainPath+'test_tags_y_mecab_for_sentiment_analysis'):
    test_tags_y = [doc.tags for doc in tqdm(test)]
    
    pickle.dump(test_tags_y, open(saveTrainPath+'test_tags_y_mecab_for_sentiment_analysis','wb'))
    del test_tags_y

if 'test' in locals():
    del test

#### model1
* Doc2Vec(dm/m,d1000,n7,w10,mc5,s0.001,t12)
* size-1000
* epoch-20
* window-10
* negative-7
* hs-0
* dm-1
* dm_concat-0
* dm_mean-1

In [None]:
train_x_by_m1_name = 'train_x_Doc2Vec-dm-m-d1000-n7-w10-mc5-s0.001-t12-mecab'
train_x_by_m1 = pickle.load(open(saveTrainPath+train_x_by_m1_name,'rb'))
train_y = pickle.load(open(saveTrainPath+'train_senti_y_mecab_for_sentiment_analysis','rb'))

In [None]:
test_x_by_m1_name = 'test_x_Doc2Vec-dm-m-d1000-n7-w10-mc5-s0.001-t12-mecab'
test_x_by_m1 = pickle.load(open(saveTrainPath+test_x_by_m1_name,'rb'))
test_y = pickle.load(open(saveTrainPath+'test_senti_y_mecab_for_sentiment_analysis','rb'))

#### 분류 모델 : Logistic Regression

In [None]:
train_y2 = [y[0] for y in tqdm(train_y)]
test_y2 = [y[0] for y in tqdm(test_y)]

In [None]:
%%time
classifier = LogisticRegression(random_state=1234)
classifier.fit(train_x_by_m1, train_y2)
print (classifier.get_params())
print( 'score : {}'.format(classifier.score(test_x_by_m1, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier.predict(test_x_by_m1)))

In [None]:
pickle.dump(classifier,open(saveClassifierPath+'LogisticRegression_'+train_x_by_m1_name, 'wb'))

#### 분류모델 : RandomForest Classifier

In [None]:
%%time
classifier2 = RandomForestClassifier(random_state=1234)
classifier2.fit(train_x_by_m1, train_y2)
print (classifier2.get_params())
print( 'score : {}'.format(classifier2.score(test_x_by_m1, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier2.predict(test_x_by_m1)))

In [None]:
pickle.dump(classifier2,open(saveClassifierPath+'RandomForestClassifier_'+train_x_by_m1_name, 'wb'))

#### 분류모델 : C-Support Vector Classification

In [None]:
%%time
scaling = MinMaxScaler(feature_range=(-1, 1)).fit(train_x_by_m1)
train_x_by_m1_2 = scaling.transform(train_x_by_m1)
test_x_by_m1_2 = scaling.transform(test_x_by_m1)
classifier3 =  SVC(kernel = 'linear', 
        cache_size= 10000) 
classifier3.fit(train_x_by_m1_2, train_y2)
print (classifier3.get_params())
print( 'score : {}'.format(classifier3.score(test_x_by_m1_2, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier3.predict(test_x_by_m1_2)))

In [None]:
del scaling
del train_x_by_m1_2
del test_x_by_m1_2

In [None]:
pickle.dump(classifier3,open(saveClassifierPath+'SVC_'+train_x_by_m1_name, 'wb'))

In [None]:
Make_Roc_Curve(test_x_by_m1, test_y, classifier, classifier2, classifier3)

In [None]:
del classifier
del classifier2
del classifier3

#### 분류모델 : Neural Network

In [None]:
train_vecs_w2v = np.concatenate([z.reshape(1,-1) for z in tqdm(map(lambda x: x,train_x_by_m1))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([z.reshape(1,-1) for z in tqdm(map(lambda x: x, test_x_by_m1))])
test_vecs_w2v = scale(test_vecs_w2v)

y_test = np.array(test_y)
y_train = np.array(train_y)

In [None]:
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=1000))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(train_vecs_w2v, y_train, epochs=200, batch_size = 250000,  verbose=0,
          validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_1_'+train_x_by_m1_name)

In [None]:
plot_history(history)

In [None]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=1000))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(#optimizer='rmsprop',
    optimizer='adadelta',
              #loss='binary_crossentropy',
    #optimizer=SGD(lr=0.2), 
    loss='binary_crossentropy',
    metrics=['accuracy'])

history = model.fit(train_vecs_w2v, y_train, epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_2_'+train_x_by_m1_name)

In [None]:
plot_history(history)

In [None]:
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=1000))
model.add(Dropout(0.25))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.125))
model.add(Dense(1, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train model
history = model.fit(train_vecs_w2v, y_train,epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)] )
# Evaluate model
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)

print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_3_'+train_x_by_m1_name)

In [None]:
plot_history(history)

In [None]:
train_vecs_w2v_2 = train_vecs_w2v.reshape(train_vecs_w2v.shape[0], train_vecs_w2v.shape[1], 1)
test_vecs_w2v_2 = test_vecs_w2v.reshape(test_vecs_w2v.shape[0], test_vecs_w2v.shape[1], 1)

print (train_vecs_w2v_2.shape, test_vecs_w2v_2.shape)

In [None]:
model = Sequential()
model.add(Conv1D(filters = 5, kernel_size = 3,
                 activation='relu', input_shape = [1000, 1]
                ))
model.add(MaxPooling1D(3))

model.add(Conv1D(filters=5, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(3))

model.add(Conv1D(filters=5, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Flatten())

# We add a vanilla hidden layer:
model.add(Dense(10))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(train_vecs_w2v_2, y_train,
                    
                    epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v_2, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_4_'+train_x_by_m1_name)

In [None]:
plot_history(history)

In [None]:
del train_x_by_m1
del train_x_by_m1_name
del test_x_by_m1
del test_x_by_m1_name

#### model2
* Doc2Vec(dm/c,d1000,n7,w5,mc5,s0.001,t12)
* size-1000
* epoch-20
* window-5
* negative-7
* hs-0
* dm-1
* dm_concat-1
* dm_mean-0

In [None]:
train_x_by_m2_name = 'train_x_Doc2Vec-dm-c-d1000-n7-w5-mc5-s0.001-t12-mecab'
train_x_by_m2 = pickle.load(open(saveTrainPath+train_x_by_m2_name,'rb'))
train_y = pickle.load(open(saveTrainPath+'train_senti_y_mecab_for_sentiment_analysis','rb'))

In [None]:
test_x_by_m2_name = 'test_x_Doc2Vec-dm-c-d1000-n7-w5-mc5-s0.001-t12-mecab'
test_x_by_m2 = pickle.load(open(saveTrainPath+test_x_by_m2_name,'rb'))
test_y = pickle.load(open(saveTrainPath+'test_senti_y_mecab_for_sentiment_analysis','rb'))

In [None]:
train_y2 = [y[0] for y in tqdm(train_y)]
test_y2 = [y[0] for y in tqdm(test_y)]

#### 분류 모델 : Logistic Regression

In [None]:
%%time
classifier = LogisticRegression(random_state=1234)
classifier.fit(train_x_by_m2, train_y2)
print (classifier.get_params())
print( 'score : {}'.format(classifier.score(test_x_by_m2, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier.predict(test_x_by_m2)))

In [None]:
pickle.dump(classifier,open(saveClassifierPath+'LogisticRegression_'+train_x_by_m2_name, 'wb'))

#### 분류모델 : RandomForest Classifier

In [None]:
%%time
classifier2 = RandomForestClassifier(random_state=1234)
classifier2.fit(train_x_by_m2, train_y2)
print (classifier2.get_params())
print( 'score : {}'.format(classifier2.score(test_x_by_m2, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier2.predict(test_x_by_m2)))

In [None]:
pickle.dump(classifier2,open(saveClassifierPath+'RandomForestClassifier_'+train_x_by_m2_name, 'wb'))

#### 분류모델 : C-Support Vector Classification

In [None]:
%%time
scaling = MinMaxScaler(feature_range=(-1, 1)).fit(train_x_by_m2)
train_x_by_m2_2 = scaling.transform(train_x_by_m2)
test_x_by_m2_2 = scaling.transform(test_x_by_m2)
classifier3 =  SVC(kernel = 'linear', 
        cache_size= 10000) 
classifier3.fit(train_x_by_m2_2, train_y2)
print (classifier3.get_params())
print( 'score : {}'.format(classifier3.score(test_x_by_m2_2, test_y2)))
print ('classification report')
print (classification_report(test_y2, classifier3.predict(test_x_by_m2_2)))

In [None]:
del scaling
del train_x_by_m2_2
del test_x_by_m2_2

In [None]:
pickle.dump(classifier3,open(saveClassifierPath+'SVC_'+train_x_by_m2_name, 'wb'))

In [None]:
Make_Roc_Curve(test_x_by_m2, test_y, classifier, classifier2, classifier3)

In [None]:
del classifier
del classifier2
del classifier3

#### 분류모델 : Neural Network

In [None]:
train_vecs_w2v = np.concatenate([z.reshape(1,-1) for z in tqdm(map(lambda x: x,train_x_by_m2))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([z.reshape(1,-1) for z in tqdm(map(lambda x: x, test_x_by_m2))])
test_vecs_w2v = scale(test_vecs_w2v)

y_test = np.array(test_y)
y_train = np.array(train_y)

In [None]:
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=1000))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(train_vecs_w2v, y_train, epochs=200, batch_size = 250000,  verbose=0,
          validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_1_'+train_x_by_m2_name)

In [None]:
plot_history(history)

In [None]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=1000))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(#optimizer='rmsprop',
    optimizer='adadelta',
              #loss='binary_crossentropy',
    #optimizer=SGD(lr=0.2), 
    loss='binary_crossentropy',
    metrics=['accuracy'])

history = model.fit(train_vecs_w2v, y_train, epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_2_'+train_x_by_m2_name)

In [None]:
plot_history(history)

In [None]:
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=1000))
model.add(Dropout(0.25))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.125))
model.add(Dense(1, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train model
history = model.fit(train_vecs_w2v, y_train,epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)] )
# Evaluate model
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)

print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_3_'+train_x_by_m2_name)

In [None]:
plot_history(history)

In [None]:
train_vecs_w2v_2 = train_vecs_w2v.reshape(train_vecs_w2v.shape[0], train_vecs_w2v.shape[1], 1)
test_vecs_w2v_2 = test_vecs_w2v.reshape(test_vecs_w2v.shape[0], test_vecs_w2v.shape[1], 1)

print (train_vecs_w2v_2.shape, test_vecs_w2v_2.shape)

In [None]:
model = Sequential()
model.add(Conv1D(filters = 5, kernel_size = 3,
                 activation='relu', input_shape = [1000, 1]
                ))
model.add(MaxPooling1D(3))

model.add(Conv1D(filters=5, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(3))

model.add(Conv1D(filters=5, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Flatten())

# We add a vanilla hidden layer:
model.add(Dense(10))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(train_vecs_w2v_2, y_train,
                    
                    epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v_2, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_4_'+train_x_by_m2_name)

In [None]:
plot_history(history)

In [None]:
del train_x_by_m2
del train_x_by_m2_name
del test_x_by_m2
del test_x_by_m2_name


#### model3
* Doc2Vec(dbow,d1000,n7,mc5,s0.001,t12)
* size-1000
* epoch-20
* window-None
* negative-7
* hs-0
* dm-0
* dm_concat-0
* dm_mean-0

In [None]:
train_x_by_m3_name = 'train_x_Doc2Vec-dbow-d1000-n7-mc5-s0.001-t12-mecab'
train_x_by_m3 = pickle.load(open(saveTrainPath+train_x_by_m3_name,'rb'))
train_y = pickle.load(open(saveTrainPath+'train_senti_y_mecab_for_sentiment_analysis','rb'))

In [None]:
test_x_by_m3_name = 'test_x_Doc2Vec-dbow-d1000-n7-mc5-s0.001-t12-mecab'
test_x_by_m3 = pickle.load(open(saveTrainPath+test_x_by_m3_name,'rb'))
test_y = pickle.load(open(saveTrainPath+'test_senti_y_mecab_for_sentiment_analysis','rb'))

#### 분류 모델 : Logistic Regression

In [None]:
%%time
classifier = LogisticRegression(random_state=1234)
classifier.fit(train_x_by_m3, train_y)
print (classifier.get_params())
print( 'score : {}'.format(classifier.score(test_x_by_m3, test_y)))
print ('classification report')
print (classification_report(test_y, classifier.predict(test_x_by_m3)))

In [None]:
pickle.dump(classifier,open(saveClassifierPath+'LogisticRegression_'+train_x_by_m3_name, 'wb'))

#### 분류모델 : RandomForest Classifier

In [None]:
%%time
classifier2 = RandomForestClassifier(random_state=1234)
classifier2.fit(train_x_by_m3, train_y)
print (classifier2.get_params())
print( 'score : {}'.format(classifier2.score(test_x_by_m3, test_y)))
print ('classification report')
print (classification_report(test_y, classifier2.predict(test_x_by_m3)))

In [None]:
pickle.dump(classifier2,open(saveClassifierPath+'RandomForestClassifier_'+train_x_by_m3_name, 'wb'))

#### 분류모델 : C-Support Vector Classification

In [None]:
%%time
scaling = MinMaxScaler(feature_range=(-1, 1)).fit(train_x_by_m3)
train_x_by_m3_2 = scaling.transform(train_x_by_m3)
test_x_by_m3_2 = scaling.transform(test_x_by_m3)
classifier3 =  SVC(kernel = 'linear', 
        cache_size= 10000) 
classifier3.fit(train_x_by_m3_2, train_y)
print (classifier3.get_params())
print( 'score : {}'.format(classifier3.score(test_x_by_m3_2, test_y)))
print ('classification report')
print (classification_report(test_y, classifier3.predict(test_x_by_m3_2)))

In [None]:
del scaling
del train_x_by_m3_2
del test_x_by_m3_2

In [None]:
pickle.dump(classifier3,open(saveClassifierPath+'SVC_'+train_x_by_m3_name, 'wb'))

In [None]:
Make_Roc_Curve(test_x_by_m3, test_y, classifier, classifier2, classifier3)

In [None]:
del classifier
del classifier2
del classifier3

#### 분류모델 : Neural Network

In [None]:
train_vecs_w2v = np.concatenate([z.reshape(1,-1) for z in tqdm(map(lambda x: x,train_x_by_m3))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([z.reshape(1,-1) for z in tqdm(map(lambda x: x, test_x_by_m3))])
test_vecs_w2v = scale(test_vecs_w2v)

y_test = np.array(test_y)
y_train = np.array(train_y)

In [None]:
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=1000))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(train_vecs_w2v, y_train, epochs=200, batch_size = 250000,  verbose=0,
          validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_4_'+train_x_by_m3_name)

In [None]:
plot_history(history)

In [None]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=1000))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(#optimizer='rmsprop',
    optimizer='adadelta',
              #loss='binary_crossentropy',
    #optimizer=SGD(lr=0.2), 
    loss='binary_crossentropy',
    metrics=['accuracy'])

history = model.fit(train_vecs_w2v, y_train,epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_4_'+train_x_by_m3_name)

In [None]:
plot_history(history)

In [None]:
mode2 = Sequential()
mode2.add(Dense(64, activation='relu', input_dim=1000))
mode2.add(Dropout(0.25))
mode2.add(Dense(32, activation='relu'))
mode2.add(Dropout(0.125))
mode2.add(Dense(1, activation='softmax'))
mode2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train model
history = mode2.fit(train_vecs_w2v, y_train,epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)] )
# Evaluate model
score, acc = mode2.evaluate(test_vecs_w2v, y_test, verbose=0)

print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_4_'+train_x_by_m3_name)

In [None]:
plot_history(history)

In [None]:
train_vecs_w2v_2 = train_vecs_w2v.reshape(train_vecs_w2v.shape[0], train_vecs_w2v.shape[1], 1)
test_vecs_w2v_2 = test_vecs_w2v.reshape(test_vecs_w2v.shape[0], test_vecs_w2v.shape[1], 1)

print (train_vecs_w2v_2.shape, test_vecs_w2v_2.shape)

In [None]:
model = Sequential()
model.add(Conv1D(filters = 5, kernel_size = 3,
                 activation='relu', input_shape = [1000, 1]
                ))
model.add(MaxPooling1D(3))

model.add(Conv1D(filters=5, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(3))

model.add(Conv1D(filters=5, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Flatten())

# We add a vanilla hidden layer:
model.add(Dense(10))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(train_vecs_w2v_2, y_train,
                    
                    epochs=200, batch_size = 250000,  verbose=0, validation_split=0.2,
          callbacks=[TQDMNotebookCallback(show_inner=False)])
score, acc = model.evaluate(test_vecs_w2v_2, y_test, verbose=0)
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

In [None]:
model.save(saveClassifierPath+'NeuralNetwork_4_'+train_x_by_m3_name)

In [None]:
plot_history(history)

In [None]:
del train_x_by_m3
del train_x_by_m3_name
del test_x_by_m3
del test_x_by_m3_name