# CNN

In [2]:
import os
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import pickle
import pandas as pd


from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

from tensorflow.keras.models import load_model
from tensorflow.keras.activations import relu
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Input,
    Dense,
    Embedding,
    Flatten,
    Conv1D,
    MaxPooling1D,
    Add,
    Lambda,
    Dropout,
    concatenate,
)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.backend import l2_normalize
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from IPython.display import SVG

In [None]:
def ModiData(obj_df):
    """Collapse five-class sentiment labels into binary ``"neg"``/``"pos"`` labels.

    Rows whose ``label`` equals 2 are dropped, labels 0 and 1 become
    ``"neg"``, labels 3 and 4 become ``"pos"``, and any remaining missing
    values in the frame are filled with 0.

    Args:
        obj_df: DataFrame containing a ``label`` column.

    Returns:
        A new DataFrame with the relabelled rows; the caller's frame is not
        modified.
    """
    # Copy the filtered slice so the relabelling below works on an independent
    # frame rather than on a (possible) view of the caller's data. Assigning
    # to a bare slice is what triggers pandas' SettingWithCopyWarning.
    obj_df = obj_df[obj_df["label"] != 2].copy()
    # One mapping replaces the four sequential single-value replacements.
    obj_df["label"] = obj_df["label"].replace(
        {0: "neg", 1: "neg", 3: "pos", 4: "pos"}
    )
    return obj_df.fillna(0)


def clean_line(line):
    """Tokenize one line of text and drop uninformative tokens.

    The line is split on whitespace and punctuation characters are stripped
    from every token. A token is then kept only if it is purely alphabetic,
    is not in NLTK's English stop-word list, and is longer than one character.

    Args:
        line: Text to tokenize.

    Returns:
        List of the surviving tokens, in their original order.
    """
    pieces = line.split()
    strip_punct = str.maketrans("", "", string.punctuation)
    english_stops = set(stopwords.words("english"))

    kept = []
    for raw in pieces:
        token = raw.translate(strip_punct)
        if not token.isalpha():
            continue
        if token in english_stops:
            continue
        if len(token) <= 1:
            continue
        kept.append(token)
    return kept


def create_tokenizer(lines):
    """Return a Keras ``Tokenizer`` fitted on ``lines``.

    Args:
        lines: Texts passed to ``Tokenizer.fit_on_texts``.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted


def encode_text(tokenizer, lines, length):
    """Turn texts into integer sequences padded to ``length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        lines: Texts to encode.
        length: ``maxlen`` handed to ``pad_sequences``.

    Returns:
        The result of ``pad_sequences`` with padding applied at the end
        (``padding="post"``).
    """
    sequences = tokenizer.texts_to_sequences(lines)
    return pad_sequences(sequences, maxlen=length, padding="post")

In [3]:
# Folder on the mounted Google Drive that holds the IMDB train/test CSV files.
DATA_DIR = "/content/drive/MyDrive/Sentiment Analysis/IMDB"
# Fixed seed so the row shuffles below give the same order on every run.
SHUFFLE_SEED = 42

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# Shuffle all rows (frac=1) and renumber the index from 0.
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
# ModiData() is not applied here: the labels are used exactly as loaded.
train_x = train_df.text.tolist()
train_y = train_df.label
test_x = test_df.text.tolist()
test_y = test_df.label

# Every text is padded (at the end) or truncated to this many token ids.
max_len = 200

# Reuse the saved tokenizer when it exists so token ids stay stable between
# runs. On a fresh environment the pickle is not there yet (it is written by a
# later cell), so fall back to fitting a tokenizer on the training texts.
tokenizer_path = "/content/drive/MyDrive/Sentiment Analysis/tokenizer/200in_IMDB.pickle"
if os.path.exists(tokenizer_path):
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(tokenizer_path, "rb") as handle:
        tokenizer = pickle.load(handle)
else:
    tokenizer = create_tokenizer(train_x)

# Vocabulary size; +1 because index 0 is used for padding by pad_sequences.
vocab_size = len(tokenizer.word_index) + 1
word_index = tokenizer.word_index

# Replace the raw texts with their padded integer encodings.
train_x = encode_text(tokenizer, train_x, max_len)
test_x = encode_text(tokenizer, test_x, max_len)

# Encode the labels as integers and reshape to a column vector.
# The encoder is fitted on the training labels only and reused for the test set.
le = LabelEncoder()
train_y = le.fit_transform(train_y).reshape(-1, 1)
test_y = le.transform(test_y).reshape(-1, 1)

## Model

In [None]:
def get_embeddings_layer(name, max_len, trainable):
    """Build a 50-dimensional Keras ``Embedding`` layer.

    No pretrained weights are passed to the layer. The vocabulary size is read
    from the module-level ``vocab_size``.

    Args:
        name: Layer name.
        max_len: Value passed as ``input_length``.
        trainable: Whether the embedding weights are updated during training.

    Returns:
        The configured ``Embedding`` layer.
    """
    layer_kwargs = dict(
        input_dim=vocab_size,
        output_dim=50,
        input_length=max_len,
        trainable=trainable,
        name=name,
    )
    return Embedding(**layer_kwargs)


def get_conv_pool(x_input, max_len, sufix, n_grams=[3, 4, 5], feature_maps=100):
    """Create one Conv1D -> MaxPooling1D -> Flatten branch per kernel size.

    Args:
        x_input: Tensor fed to every convolution branch.
        max_len: Sequence length; the pooling window is ``max_len - n + 1``
            for kernel size ``n``.
        sufix: Text inserted into every layer name.
        n_grams: Kernel sizes, one branch each.
        feature_maps: Number of filters per convolution.

    Returns:
        List of flattened branch outputs, in the order of ``n_grams``.
    """
    pooled_outputs = []
    for width in n_grams:
        tag = sufix + "_" + str(width)

        features = Conv1D(
            filters=feature_maps,
            kernel_size=width,
            activation=relu,
            name="Conv_" + tag,
        )(x_input)

        # Pool over max_len - width + 1 positions in a single window.
        pooled = MaxPooling1D(
            pool_size=max_len - width + 1,
            strides=None,
            padding="valid",
            name="MaxPooling_" + tag,
        )(features)

        pooled_outputs.append(Flatten(name="Flatten_" + tag)(pooled))
    return pooled_outputs


def get_cnn_rand(max_len):
    """Build and compile the single-channel CNN text classifier.

    The network is: trainable embedding -> parallel convolution/pooling
    branches from ``get_conv_pool`` -> concatenation -> dropout (0.1) ->
    20-unit ReLU dense layer -> 1-unit sigmoid output.

    Args:
        max_len: Length of the integer-encoded input sequences.

    Returns:
        A compiled Keras ``Model`` with one input (``"input_dynamic"``),
        binary cross-entropy loss, the Adam optimizer and the ``"acc"`` metric.
    """
    embeddings_layer_channel_1 = get_embeddings_layer(
        "embeddings_layer_dynamic", max_len, trainable=True
    )

    input_dynamic = Input(shape=(max_len,), dtype="int32", name="input_dynamic")
    x = embeddings_layer_channel_1(input_dynamic)
    branches_dynamic = get_conv_pool(x, max_len, "dynamic")
    z_dynamic = concatenate(branches_dynamic, axis=-1)

    z = Dropout(0.1)(z_dynamic)
    # The previous `z = layer = Dense(...)` bound an extra name that was never used.
    z = Dense(20, activation="relu", name="FC1")(z)
    o = Dense(1, activation="sigmoid", name="output")(z)

    model = Model(inputs=input_dynamic, outputs=o)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])

    return model

In [None]:
class myCallback(tf.keras.callbacks.Callback):
    """Stop training once the validation loss falls below a threshold.

    Args:
        threshold: Training stops at the end of the first epoch whose
            ``val_loss`` is strictly below this value. Defaults to 0.28, the
            value that used to be hard-coded.
    """

    def __init__(self, threshold=0.28):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check ``val_loss`` in ``logs`` and request a stop when it is low enough."""
        # `logs` may be None, and "val_loss" is missing when fit() is run
        # without validation data; comparing None with a float would raise.
        val_loss = (logs or {}).get("val_loss")
        if val_loss is not None and val_loss < self.threshold:
            # Report the threshold actually used (the old text said 0.27
            # while the check was against 0.28).
            print(
                "\nReached val_loss below {} so cancelling training!".format(
                    self.threshold
                )
            )
            self.model.stop_training = True


# Callback instances shared by the training cells below.
beststop = myCallback()
# Early stopping on val_loss; `patience` is not set, so the Keras default applies.
earlystop = EarlyStopping(monitor="val_loss", min_delta=0.0001)
# NOTE(review): `checkpointer` is not passed to any fit() call visible in this
# notebook — confirm whether it is still needed.
checkpointer = ModelCheckpoint(filepath="./weights.hdf5", verbose=1)

In [None]:
def _metric_tag(value):
    """Return the first four decimal digits of ``value`` as a file-name fragment.

    Values below 1 give exactly four digits (``0.27`` -> ``"2700"``); for
    larger values the integer part is kept as well (``1.5`` -> ``"1.5000"``).
    """
    whole, frac = "{:.10f}".format(float(value)).split(".")
    digits = frac[:4]
    return digits if whole == "0" else whole + "." + digits


def TrainModel(num, max_attempts=None):
    """Train fresh CNNs until ``num`` of them pass the accuracy bar, saving each.

    Every attempt builds a new model with ``get_cnn_rand(max_len)`` and fits it
    on the module-level ``train_x``/``train_y`` with ``test_x``/``test_y`` as
    validation data and the ``earlystop`` and ``beststop`` callbacks. A model is
    saved only if its final ``val_acc`` exceeds 0.885; the file name is built
    from the final ``val_loss`` and ``val_acc``.

    NOTE(review): the test split is used both for early stopping and for
    choosing which models to keep, so the reported accuracy is optimistic.

    Args:
        num: Number of models to save.
        max_attempts: Optional cap on the number of training runs. ``None``
            (the default) retries without limit, as before; set it to
            guarantee termination when the accuracy bar is never reached.
    """
    saved = 0
    attempts = 0
    while saved < num and (max_attempts is None or attempts < max_attempts):
        attempts += 1
        model = get_cnn_rand(max_len)
        history = model.fit(
            x=train_x,
            y=train_y,
            batch_size=100,
            epochs=20,
            validation_data=(test_x, test_y),
            callbacks=[earlystop, beststop],
        )
        final_val_loss = history.history["val_loss"][-1]
        final_val_acc = history.history["val_acc"][-1]
        if final_val_acc > 0.885:
            # Slicing str(value)[2:6] broke for short reprs ("0.27"),
            # scientific notation and values >= 1; format explicitly instead.
            save_name = (
                "/content/drive/MyDrive/Sentiment Analysis/Model/CNN_IMDB/"
                + _metric_tag(final_val_loss)
                + "_"
                + _metric_tag(final_val_acc)
                + ".hdf5"
            )
            model.save(save_name)
            saved += 1

In [None]:
# Keep training fresh models until 10 of them pass the accuracy check inside
# TrainModel and have been saved; runs that miss the bar are repeated.
TrainModel(10)

Epoch 1/20
Epoch 2/20

Reached 0.27 loss so cancelling training!
Epoch 1/20
Epoch 2/20

Reached 0.27 loss so cancelling training!
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 1/20
Epoch 2/20

Reached 0.27 loss so cancelling training!
Epoch 1/20
Epoch 2/20

Reached 0.27 loss so cancelling training!
Epoch 1/20
Epoch 2/20

Reached 0.27 loss so cancelling training!
Epoch 1/20
Epoch 2/20

Reached 0.27 loss so cancelling training!
Epoch 1/20
Epoch 2/20

Reached 0.27 loss so cancelling training!
Epoch 1/20
Epoch 2/20

Reached 0.27 loss so cancelling training!
Epoch 1/20
Epoch 2/20

Reached 0.27 loss so cancelling training!
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 1/20
Epoch 2/20

Reached 0.27 loss so cancelling training!
Epoch 1/20
Epoch 2/20

Reached 0.27 loss so cancelling training!
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 1/20
Epoch 2/20

Reached 0.27 loss so cancelling training!
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 1/20
Epoch 2/20


In [None]:
# del model
# Build a fresh, untrained CNN for the single training run in the next cell.
model = get_cnn_rand(max_len)
# model.summary()

In [None]:
# Train for the full 20 epochs; no callbacks are attached to this run.
history = model.fit(
    x=train_x,
    y=train_y,
    batch_size=100,
    epochs=20,
    validation_data=(test_x, test_y),
)

In [None]:
# `load_model` and `pickle` are already imported in the first cell, so the
# duplicate imports that used to sit here were removed.

# Saving and restoring a trained model (kept for reference):
# model.save("10in_32unit_temp_res.h5")
# del model  # deletes the existing model
# model = load_model("my_model.h5")

# Save the fitted Tokenizer so that later runs can load it again.
with open("/content/drive/MyDrive/Sentiment Analysis/tokenizer/200in_IMDB.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# The model built by get_cnn_rand has a single input ("input_dynamic"), so the
# encoded test set is passed once; a two-element list does not match the
# model's input structure.
loss, accuracy = model.evaluate(x=test_x, y=test_y, verbose=0)
print(loss)
print(accuracy)

### Pretrained word embeddings (Google News word2vec)

In [None]:
def load_google(path=r"E:\CS\MLT\GoogleNews-vectors-negative300.bin"):
    """Load binary word2vec vectors and return them as a ``{word: vector}`` dict.

    Args:
        path: Location of the word2vec file in binary format. Defaults to the
            path that used to be hard-coded.

    Returns:
        dict mapping every word of the loaded vocabulary to its vector.
    """
    # Use the KeyedVectors class imported in the first cell; the bare `gensim`
    # module name is not imported there, so `gensim.models...` would fail with
    # a NameError on a fresh kernel.
    vectors = KeyedVectors.load_word2vec_format(path, binary=True)

    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases
    # expose it as `vocab`. Both are mappings keyed by word.
    vocab = getattr(vectors, "key_to_index", None)
    if vocab is None:
        vocab = vectors.vocab

    # Index the KeyedVectors object directly; the `.wv` indirection belongs to
    # full Word2Vec models and is not available on KeyedVectors in gensim 4.
    return {word: vectors[word] for word in vocab}


def load_fasttext_embeddings(glove_dir=r"E:\CS\MLT\glove.6B", filename="glove.6B.50d.txt"):
    """Read a whitespace-separated text embedding file into a ``{word: vector}`` dict.

    Despite the function name, the default file is a GloVe file
    (``glove.6B.50d.txt``). Each line holds a word followed by its
    coefficients.

    Args:
        glove_dir: Directory containing the embedding file. Defaults to the
            directory that used to be hard-coded.
        filename: Name of the embedding file inside ``glove_dir``.

    Returns:
        dict mapping each word to a ``float32`` numpy array of its coefficients.
    """
    embeddings_index = {}
    # `with` closes the file even if a malformed line raises while parsing.
    with open(os.path.join(glove_dir, filename), encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype="float32")
    print("Found %s word vectors." % len(embeddings_index))
    return embeddings_index


def create_embeddings_matrix(embeddings_index, vocabulary, embedding_dim=100):
    """Build the weight matrix for an Embedding layer from pretrained vectors.

    Row ``k`` of the result holds the vector of the word whose token id is
    ``k``. Words without a pretrained vector, and row 0, keep their random
    initial values.

    Args:
        embeddings_index: Mapping ``word -> vector``; each vector must have
            ``embedding_dim`` entries.
        vocabulary: Mapping ``word -> token id`` (e.g. a Keras tokenizer's
            ``word_index``, whose ids start at 1). A plain sequence of words is
            also accepted and is numbered 1, 2, ... in order.
        embedding_dim: Number of columns of the matrix.

    Returns:
        Array of shape ``(len(vocabulary) + 1, embedding_dim)``.
    """
    embeddings_matrix = np.random.rand(len(vocabulary) + 1, embedding_dim)

    # Use the vocabulary's own ids as row numbers. Numbering by enumerate()
    # position from 0 put every vector one row above the id the Embedding
    # layer looks up and left the last row unfilled.
    if hasattr(vocabulary, "items"):
        word_ids = vocabulary.items()
    else:
        word_ids = ((word, i) for i, word in enumerate(vocabulary, start=1))

    for word, idx in word_ids:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embeddings_matrix[idx] = embedding_vector
    print("Matrix shape: {}".format(embeddings_matrix.shape))
    return embeddings_matrix


# Load the word2vec vectors, then build a 300-column weight matrix for the
# tokenizer vocabulary held in `word_index`.
embeddings_index = load_google()
embeddings_matrix = create_embeddings_matrix(embeddings_index, word_index, 300)

In [None]:
# NOTE(review): this cell looks like leftover scratch work — confirm whether it
# is still needed. Imports normally belong in the first cell of the notebook.
import gensim

Word2VecModel = gensim.models.KeyedVectors.load_word2vec_format(
    r"E:\CS\MLT\GoogleNews-vectors-negative300.bin", binary=True
)

vocab_list = [word for word, Vocab in Word2VecModel.wv.vocab.items()]  # all words of the word2vec vocabulary

# NOTE(review): this rebinds `word_index`, which an earlier cell set to
# tokenizer.word_index — confirm that overwriting it is intended.
word_index = {" ": 0}  # initialise the `[word : token]` dict; meant to be used later to tokenize the corpus
word_vector = {}  # initialise the `[word : vector]` dict

# NOTE(review): `embeddings_matrix` comes from the previous cell, where it gets
# len(word_index) + 1 rows for the tokenizer vocabulary, while this loop runs
# over the whole word2vec vocabulary. Presumably `i + 1` can run past the last
# row and the rows do not line up with tokenizer ids — verify.
for i in range(len(vocab_list)):
    # print(i)
    word = vocab_list[i]  # current word
    #     word_index[word] = i + 1 # word : index
    #     word_vector[word] = Word2VecModel.wv[word] # word : vector
    embeddings_matrix[i + 1] = Word2VecModel.wv[word]  # row of the embedding matrix