In [1]:
import re
import numpy as np
import tensorflow as tf
from bs4 import BeautifulSoup
from janome.tokenizer import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams, make_sampling_table
t = Tokenizer(wakati=True)

import json
import pandas as pd

In [8]:
t = Tokenizer(wakati=True)
def tokenize(text):
    return t.tokenize(text)


def normalize_number(text, reduce=False):
    if reduce:
        normalize_text = re.sub(r"\d+", "0", text)
    else:
        normalize_text = re.sub(r"\d", "0", text)
    return normalize_text


def build_vocabulary(texts, num_words=None):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=num_words, oov_token="<UNK>"
    )
    tokenizer.fit_on_texts(texts)
    return tokenizer


def create_dataset(text, vocab, num_words, window_size, negative_samples):
    data = vocab.texts_to_sequences([text]).pop()
    sampling_table = make_sampling_table(num_words)
    couples, labels = skipgrams(data, num_words,
                                window_size=window_size,
                                negative_samples=negative_samples,
                                sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.reshape(word_target, (-1, 1))
    word_context = np.reshape(word_context, (-1, 1))
    labels = np.asarray(labels)
    return [word_target, word_context], labels


def preprocess_dataset(texts):
    texts = [" ".join(tokenize(str(text))) for text in texts]
    return texts


In [3]:
df_tweets = pd.read_csv("data/df_tweets",index_col=0)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
df_tweets.head()

Unnamed: 0,index,category,id,posi_and_nega,posi,nega,neutral,Irrelevant,text
0,10026.0,10000.0,5.224078e+17,0.0,0.0,1.0,0.0,0.0,xperiaでスクフェス糞\n反応遅いんだよ糞が
1,10027.0,10000.0,5.22408e+17,0.0,0.0,1.0,1.0,0.0,夏春都が持ってたエクスペリアも今使うには辛い
2,10032.0,10000.0,5.224091e+17,0.0,0.0,0.0,1.0,0.0,少し時間空いちゃいましたが、Xperia Z3のカメラ機能について、ちょっとだけですけどまと...
3,10033.0,10000.0,5.224091e+17,0.0,0.0,0.0,0.0,1.0,日向「研磨おたおめー。これプレゼント!!」\n孤爪「こ、これは」\n日向「ビビった?」\n孤...
4,10035.0,10000.0,5.224102e+17,0.0,1.0,0.0,1.0,0.0,今ブログ記事書いてて\nしゃーくって打ったら\n予測変換にSHARK 2nd Seasonて...


In [5]:
df_tweets = df_tweets.drop(["index", "id"], axis=1)

In [6]:
df_tweets.head()

Unnamed: 0,category,posi_and_nega,posi,nega,neutral,Irrelevant,text
0,10000.0,0.0,0.0,1.0,0.0,0.0,xperiaでスクフェス糞\n反応遅いんだよ糞が
1,10000.0,0.0,0.0,1.0,1.0,0.0,夏春都が持ってたエクスペリアも今使うには辛い
2,10000.0,0.0,0.0,0.0,1.0,0.0,少し時間空いちゃいましたが、Xperia Z3のカメラ機能について、ちょっとだけですけどまと...
3,10000.0,0.0,0.0,0.0,0.0,1.0,日向「研磨おたおめー。これプレゼント!!」\n孤爪「こ、これは」\n日向「ビビった?」\n孤...
4,10000.0,0.0,1.0,0.0,1.0,0.0,今ブログ記事書いてて\nしゃーくって打ったら\n予測変換にSHARK 2nd Seasonて...


In [9]:
df_tweets["text"] = preprocess_dataset(df_tweets["text"])

In [23]:
len(df_tweets)

132333

In [24]:
len(df_tweets.dropna(how='any'))

132308

In [25]:
df_tweets = df_tweets.dropna(how='any')

In [30]:
df_tweets

Unnamed: 0,category,posi_and_nega,posi,nega,neutral,Irrelevant,text
0,10000.0,0.0,0.0,1.0,0.0,0.0,xperia で スクフェス 糞 \n 反応 遅い ん だ よ 糞 が
1,10000.0,0.0,0.0,1.0,1.0,0.0,夏 春 都 が 持っ て た エクスペリア も 今 使う に は 辛い
2,10000.0,0.0,0.0,0.0,1.0,0.0,少し 時間 空い ちゃ い まし た が 、 Xperia Z 3 の カメラ 機能 に...
3,10000.0,0.0,0.0,0.0,0.0,1.0,日向 「 研磨 お た お め ー 。 これ プレゼント !!」 \n 孤 爪 「 こ 、 ...
4,10000.0,0.0,1.0,0.0,1.0,0.0,今 ブログ 記事 書い て て \n しゃ ー くっ て 打っ たら \n 予測 変換 に ...
...,...,...,...,...,...,...,...
132303,10025.0,0.0,0.0,0.0,0.0,1.0,そう いえ ば 「 Ah －」 の 「 は 」 の とこ 以外 ust めっちゃ ベタ 打ち...
132304,10025.0,0.0,0.0,0.0,1.0,0.0,ベット に 跳ね上がり 、 掃除 し て くれる ルンバ が 出 たら 買う かも しれ な...
132305,10025.0,0.0,0.0,0.0,1.0,0.0,さ ー って 。 がんばる ん ば ー 。 \n ルンバ ほしい な 。 笑
132306,10025.0,0.0,0.0,0.0,0.0,1.0,ルンバ さん が 水色 の ウィッグ を … 光 加減 で だ けど 水色 の ウィッグ...


In [27]:
df_tweets.to_csv("data/df_tokenized_tweets")

In [3]:
df_tweets = pd.read_csv("data/df_tokenized_tweets", index_col=0)

In [4]:
df_tweets

Unnamed: 0,category,posi_and_nega,posi,nega,neutral,Irrelevant,text
0,10000.0,0.0,0.0,1.0,0.0,0.0,xperia で スクフェス 糞 \n 反応 遅い ん だ よ 糞 が
1,10000.0,0.0,0.0,1.0,1.0,0.0,夏 春 都 が 持っ て た エクスペリア も 今 使う に は 辛い
2,10000.0,0.0,0.0,0.0,1.0,0.0,少し 時間 空い ちゃ い まし た が 、 Xperia Z 3 の カメラ 機能 に...
3,10000.0,0.0,0.0,0.0,0.0,1.0,日向 「 研磨 お た お め ー 。 これ プレゼント !!」 \n 孤 爪 「 こ 、 ...
4,10000.0,0.0,1.0,0.0,1.0,0.0,今 ブログ 記事 書い て て \n しゃ ー くっ て 打っ たら \n 予測 変換 に ...
...,...,...,...,...,...,...,...
132303,10025.0,0.0,0.0,0.0,0.0,1.0,そう いえ ば 「 Ah －」 の 「 は 」 の とこ 以外 ust めっちゃ ベタ 打ち...
132304,10025.0,0.0,0.0,0.0,1.0,0.0,ベット に 跳ね上がり 、 掃除 し て くれる ルンバ が 出 たら 買う かも しれ な...
132305,10025.0,0.0,0.0,0.0,1.0,0.0,さ ー って 。 がんばる ん ば ー 。 \n ルンバ ほしい な 。 笑
132306,10025.0,0.0,0.0,0.0,0.0,1.0,ルンバ さん が 水色 の ウィッグ を … 光 加減 で だ けど 水色 の ウィッグ...


In [45]:
df_tweets = df_tweets.drop(df_tweets.index[df_tweets["Irrelevant"]==1])

Unnamed: 0,category,posi_and_nega,posi,nega,neutral,Irrelevant,text
3,10000.0,0.0,0.0,0.0,0.0,1.0,日向 「 研磨 お た お め ー 。 これ プレゼント !!」 \n 孤 爪 「 こ 、 ...
6,10000.0,0.0,0.0,0.0,1.0,1.0,結局 の ところ 、 林檎 房 だっ た わ 。 Xperia は 、 コンパクト タブレッ...
7,10000.0,0.0,0.0,0.0,0.0,1.0,もはや tablet P は 黒 歴史 な の かしら ・ ・ ・ ？ # xper...
44,10000.0,0.0,0.0,0.0,0.0,1.0,個人 的 に は 予約 録画 、 セキュリティー システム 、 動画 再生 、 Web 、 ...
53,10000.0,0.0,0.0,0.0,0.0,1.0,メール が と 届い て て Xperia 乗り換え の クーポン だっ た
...,...,...,...,...,...,...,...
132298,10025.0,0.0,0.0,0.0,0.0,1.0,「 黎明 」 って 歌詞 ある と はい ！ 松永 で やり ます ね ！ って すぐ なる...
132299,10025.0,0.0,0.0,0.0,0.0,1.0,サビ の 「 Ah ー 」 だけ 普通 に かっこいい ｿﾞｰ ！ ルンバ さん じゃ ない...
132303,10025.0,0.0,0.0,0.0,0.0,1.0,そう いえ ば 「 Ah －」 の 「 は 」 の とこ 以外 ust めっちゃ ベタ 打ち...
132306,10025.0,0.0,0.0,0.0,0.0,1.0,ルンバ さん が 水色 の ウィッグ を … 光 加減 で だ けど 水色 の ウィッグ...


In [5]:
from tensorflow.python.client import device_lib; print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 7025887636745171645
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 7921348564178241922
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 2588209033945263661
physical_device_desc: "device: XLA_GPU device"
]


In [12]:


from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from utils import load_fasttext, filter_embeddings
from gensim.models import KeyedVectors

from model import CNNModel
batch_size = 1024
epochs = 10
maxlen = 300
model_path = "model/cnn_model.h5"
num_words = 40000
num_label = 5

#x = preprocess_dataset(x)
x = df_tweets["text"]
y = df_tweets[["posi_and_nega", "posi", "nega", "neutral"
              # , "Irrelevant"
              ]]
y = np.asarray(y)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
vocab = build_vocabulary(x_train, num_words)
x_train = vocab.texts_to_sequences(x_train)
x_test = vocab.texts_to_sequences(x_test)
x_train = pad_sequences(x_train, maxlen=maxlen, truncating="post")
x_test = pad_sequences(x_test, maxlen=maxlen, truncating="post")

#wv = load_fasttext("data/cc.ja.300.vec.gz")
wv = KeyedVectors.load("model/word2vec.model",mmap='r')
wv = filter_embeddings(wv, vocab.word_index, num_words)

model = CNNModel(num_words, num_label, embeddings=wv).build()
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["acc"])


In [16]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('tokenizer.pickle', 'rb') as handle:
    vocab = pickle.load(handle)

In [18]:
wv = KeyedVectors.load("model/word2vec.model",mmap='r')
wv.most_similar('スマホ',topn=10)

[('スマフォ', 0.6678307056427002),
 ('ガラケー', 0.5868028402328491),
 ('スマホスマホ', 0.5866972804069519),
 ('スマホメイン', 0.5783843994140625),
 ('・・スマホ', 0.5758718848228455),
 ('・スマホ', 0.5709228515625),
 ('ガラケイ', 0.5691261887550354),
 ('スマートフォン', 0.5604338645935059),
 ('ガラゲー', 0.5589156150817871),
 ('タブレット', 0.5540313124656677)]

In [19]:
#wv.save("model/word2vec.model")

In [46]:

callbakcs = [
    EarlyStopping(patience=3),
    ModelCheckpoint(model_path, save_best_only=True)
]

model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.5,
    callbacks=callbakcs,
    shuffle=True
)

model = load_model(model_path)
#api = InferenceAPI(model, vocab, preprocess_dataset)
#y_pred = api.predict_from_sequence(x_test)
#print("precision: {:.4f}".format(precision_score(y_test,] y_pred, average="binary")))
#print("recall: {:.4f}".format(recall_score(y_test, y_pred, average="binary")))
#print("f1: {:.4f}".format(f1_score(y_test, y_pred, average="binary")))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [25]:
model_path = "model/cnn_model.h5"
model = load_model(model_path)

In [19]:
test_texts = vocab.sequences_to_texts(x_test)

In [20]:
test_texts

['<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <U

In [59]:
#["posi_and_nega", "posi", "nega", "neutral", "Irrelevant"]
y_test_multi = y_test[y_test.sum(axis=1)==2]
x_test_multi = x_test[y_test.sum(axis=1)==2]

In [63]:
y_test_multi

array([[0., 0., 1., 0., 1.],
       [0., 1., 0., 1., 0.],
       [0., 0., 0., 1., 1.],
       ...,
       [0., 0., 1., 1., 0.],
       [0., 0., 0., 1., 1.],
       [0., 0., 0., 1., 1.]])

In [60]:
vocab.sequences_to_texts(x_test_multi)

['<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <U

In [64]:
pred_proba = model.predict(x_test_multi)

In [65]:
pred_proba

array([[0.01292442, 0.01384112, 0.3566934 , 0.13807423, 0.47846678],
       [0.01203546, 0.24276087, 0.02492479, 0.6317038 , 0.08857512],
       [0.00175255, 0.00475573, 0.00262516, 0.79656255, 0.19430403],
       ...,
       [0.00213471, 0.01150758, 0.00916477, 0.15152323, 0.8256697 ],
       [0.01245396, 0.3259978 , 0.02402743, 0.32655758, 0.3109632 ],
       [0.01580289, 0.01085561, 0.43431103, 0.39660662, 0.14242384]],
      dtype=float32)

In [10]:
import tensorflow

In [2]:
df = pd.read_csv("data/testdata.manual.2009.06.14.csv")

In [3]:
df

Unnamed: 0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,"@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right."
0,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
1,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
2,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
3,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
4,4,8,Mon May 11 03:22:00 UTC 2009,kindle2,GeorgeVHulme,@richardebaker no. it is too big. I'm quite ha...
...,...,...,...,...,...,...
492,2,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...
493,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat..."
494,4,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...
495,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."
