In [2]:
import pandas as pd
from transformers import BertTokenizer

In [3]:
# Load data
train_df = pd.read_csv("../data/input/train.csv")
test_df = pd.read_csv("../data/input/test.csv")

train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Preprocess text

In [4]:
# tweetテキストの余計な文字を削除
import re

def preprocess_text(sentence):
    # URL
    sentence = re.sub(r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+\s*", ' ', sentence)
    # Hash Tag
    sentence = re.sub(r'#[^\s]+\s*', ' ', sentence)
    # アルファベット以外
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # 単一文字
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    # 連続する空白を1つの空白に
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

In [5]:
# test
text = "Check:   a  b v c <http://www//141/aa.1a>#dog https://www/w/w//asf1 # #cat #dog"
print(re.sub(r'http.* ', '', text))
print(re.sub(r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+\s*", '', text))
print(re.sub(r'#[^\s]+\s*', '', text))
print(preprocess_text(text))

Check:   a  b v c <#dog
Check:   a  b v c <>#dog # #cat #dog
Check:   a  b v c <http://www//141/aa.1a>https://www/w/w//asf1 # 
Check b c 


In [6]:
# train dataのテキストを前処理
train_tweets = train_df["text"].apply(lambda x: preprocess_text(x))

i = 20
print(f"original[{i}]: {train_df['text'][i]}")
print(f"tweets[{i}]: {train_tweets[i]}")

original[20]: this is ridiculous....
tweets[20]: this is ridiculous 


In [7]:
# bert_tokenizerを作成
import bert
import tensorflow_hub as hub

bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False
)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert.bert_tokenization.FullTokenizer(vocabulary_file, to_lower_case)

INFO:absl:Using C:\Users\rainc\AppData\Local\Temp\tfhub_modules to cache modules.


In [8]:
# bert_tokenizerでtweetをtoken化、ベクトル化
tokenized_train_tweets = train_tweets.apply(
    lambda x:tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x))
)
print(f"- tokenized_tweets length: {len(tokenized_train_tweets)}")

i = 10
print(f"- original[{i}]: \n {train_tweets[i]}")
print(f"- tokenized[{i}]: \n {tokenized_train_tweets[i]}")

- tokenized_tweets length: 7613
- original[10]: 
 Three people died from the heat wave so far
- tokenized[10]: 
 [2093, 2111, 2351, 2013, 1996, 3684, 4400, 2061, 2521]


## Pretraining Data For Training
To create sentences of equal length, pad sentence within each batch.<br>

In [9]:
# tweetテキスト(tokenized)と正解ラベルのタプルを作る
train_labels = train_df["target"].values
tweets_labels = [
    (tweet, train_labels[i]) for i, tweet in enumerate(tokenized_train_tweets)
]
print(f"\t\t\t([tokenized tweet], label)\ntweets_labels[{i}]: \t{tweets_labels[i]}")

			([tokenized tweet], label)
tweets_labels[10]: 	([2093, 2111, 2351, 2013, 1996, 3684, 4400, 2061, 2521], 1)


In [10]:
import tensorflow as tf

processed_dataset = tf.data.Dataset.from_generator(
    generator=lambda: tweets_labels,# ジェネレータ関数
    output_types=(tf.int32, tf.int32)
)

BATCH_SIZE = 16

# batch毎にpaddingをおこなう
batched_dataset = processed_dataset.padded_batch(
    BATCH_SIZE,
    padded_shapes=((None, ), ())
)

In [11]:
next(iter(batched_dataset))

(<tf.Tensor: shape=(16, 26), dtype=int32, numpy=
 array([[ 2256, 15616,  2024,  1996,  3114,  1997,  2023,  2089, 16455,
          9641,  2149,  2035,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [ 3224,  2543,  2379,  2474,  6902,  3351, 21871,  2243,  2710,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [ 2035,  3901,  2356,  2000,  7713,  1999,  2173,  2024,  2108,
         19488,  2011,  3738,  2053,  2060, 13982,  2030,  7713,  1999,
          2173,  4449,  2024,  3517,     0,     0,     0,     0],
        [ 2111,  4374, 13982,  4449,  1999,  2662,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [ 2074,  2288,  2741,  2023,  6302,  2013, 10090,  2004,  5610,
          2013, 10364,  2015,  2046,  2

In [12]:
import math
TOTAL_BATCHES = math.ceil(len(tweets_labels) / BATCH_SIZE)
VALID_BATCHES = TOTAL_BATCHES // 10
batched_dataset.shuffle(TOTAL_BATCHES)
valid_data = batched_dataset.take(VALID_BATCHES)
train_data = batched_dataset.skip(VALID_BATCHES)

## Creating the Model

In [22]:
from tensorflow.keras import layers

"""
    tf.keras.Modelを継承したクラス
    Modelクラスを継承して、callメソッドで順伝播を実装することにより、独自のカスタムモデルを作成
    Modelクラスの継承APIはKeras2.2.0で導入
    下記メソッド、属性は派生したモデルでは使えない
        - model.inputs, model.outputs
        - model.to_yaml(), model.to_json()
        - model.get_config(), model.save()
    https://keras.io/ja/models/about-keras-models/
"""
class TEXT_MODEL(tf.keras.Model):
    # レイヤーの定義
    def __init__(
        self,
        vocabulary_size,
        embedding_dimensions = 128,
        cnn_filters = 50,
        dnn_units = 512,
        model_output_classes = 2,
        dropout_rate = 0.1,
        training=True,
        name="text_model"
    ):
        super(TEXT_MODEL, self).__init__(name=name)
        # super(親クラスのオブジェクト, self).親クラスのメソッド
        
        self.embedding = layers.Embedding(
            input_dim=vocabulary_size,
            output_dim=embedding_dimensions
        )
        self.cnn_layer1 = layers.Conv1D(
            filters=cnn_filters,
            kernel_size=2,
            padding="valid",
            activation="relu"
        )
        self.cnn_layer2 = layers.Conv1D(
            filters=cnn_filters,
            kernel_size=3,
            padding="valid",
            activation="relu"
        )
        self.cnn_layer3 = layers.Conv1D(
            filters=cnn_filters,
            kernel_size=4,
            padding="valid",
            activation="relu"
        )
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        
        if model_output_classes == 2:
            self.last_dense = layers.Dense(
                units=1,
                activation="sigmoid"
            )
        else:
            self.last_dense = layers.Dense(
                units=model_output_classes,
                activation="softmax"
            )
            
    # 順伝播を記述
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l)
        l_1 = self.pool(l_1)
        l_2 = self.cnn_layer2(l)
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3)
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [14]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 2
DROPOUT_RATE = 0.2
NB_EPOCHS = 5

text_model = TEXT_MODEL(
    vocabulary_size = VOCAB_LENGTH,
    embedding_dimensions=EMB_DIM,
    cnn_filters=CNN_FILTERS,
    dnn_units=DNN_UNITS,
    model_output_classes=OUTPUT_CLASSES,
    dropout_rate = DROPOUT_RATE
)

if OUTPUT_CLASSES == 2:
    text_model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"]
    )
else:
    text_model.compile(
        loss="sparse_categorical_crossentropy",
        optmizer="adam",
        metrics=["sparse_categorical_accuracy"]
    )

In [17]:
text_model.fit(train_data, epochs=NB_EPOCHS)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x18c21910fc8>

In [18]:
results = text_model.evaluate(valid_data)

print(f"loss: {results[0]:.4f}")
print(f"accuracy: {results[1]:.4f}")

loss: 1.0627
accuracy: 0.6622


# Create Submission

In [103]:
test_tweets = []
for tweet in test_df["text"]:
    test_tweets.append(preprocess_text(tweet))
print(test_tweets[10])

No don like cold 


In [104]:
tokenized_test_tweets = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tweet)) for tweet in test_tweets
]
print(tokenized_test_tweets[10])

[2053, 2123, 2066, 3147]


In [122]:
from keras.preprocessing.sequence import pad_sequences

X_test = pad_sequences(tokenized_test_tweets, maxlen=60)

In [132]:
pred = text_model.predict(X_test)

In [134]:
import numpy as np
np.where(pred > 0.5, 1, 0)

array([[0],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [141]:
sample_submission = pd.read_csv("../data/input/sample_submission.csv")
sample_submission["target"] = np.where(pred>0.5, 1, 0)
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [142]:
import datetime
dt = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

sample_submission.to_csv("../data/output/sub_" + dt + "_BerTokenMLP.csv", index=False)