## 数据集

train： 
- 英文文本+6种该评论不健康类型的概率
- 当前评论属于那种分类，则label为1，可同时多个为1

提交格式：   

| id | toxic | severe_toxic | obscene | threat | insult | identity_hate |    
| --- | --- |  --- | --- |  --- | --- |  --- |

In [141]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import wordpunct_tokenize                               # 分词

# 可视化
import matplotlib.pyplot as plt                
import seaborn as sns

from gensim.models import Word2Vec                                         # 词向量
from sklearn.manifold import TSNE                                          # tsne降维
from keras.layers import TimeDistributed, Dropout                          # 

from keras.models import Sequential# , Model                               # CNN初始化
from keras.layers import Dense, Dropout, Activation, Flatten, Input        # CNN网络层，Dense net 等
from keras.layers.convolutional import Conv2D, MaxPooling2D                # 卷积操作，池化操作
from keras.applications.vgg16 import VGG16, preprocess_input               # VGG16 模型、输入预处理


from keras.optimizers import SGD                                           # 优化器SGD操作
from keras.optimizers import RMSprop, Adam                                 # 优化算法RMSprop，Adam


from keras.preprocessing.image import ImageDataGenerator                   # 图像矩阵转换
from keras.preprocessing import image                                      # 图像处理操作


from sklearn.model_selection import train_test_split                       # 数据集划分

%matplotlib inline

# tensorflow环境
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
set_session(tf.Session(config=config))

### 数据集导入

In [3]:
df_trainOrg = pd.read_csv('../Data/train_july.csv')
df_trainOrg.tail(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
143611,0c7f3197cb2998d4,"Hello Cindamuse,\n\n1. Please review/edit the ...",0,0,0,0,0,0
143612,6de99309da264aee,I've restored the links to the specific subjec...,0,0,0,0,0,0
143613,7390c4a88a58f72f,"""\n\n An-cap FAQ \n\nWhen making a list of ana...",0,0,0,0,0,0


In [4]:
df_testOrg = pd.read_csv('../Data/test_july.csv')
df_testOrg.tail(3)

Unnamed: 0,id,comment_text
15954,020edaf0f24e861f,"""\n\n""""Actions in la-la land have real world c..."
15955,cf56bc027896fc22,""", 8 April 2009 (UTC)\n\nThanks for your conti..."
15956,653ad098cf212e80,About personal attacks...I wasn't the one goin...


In [5]:
int_trainLowLen, int_trainColLen = df_trainOrg.shape
int_testLowLen, int_testColLen = df_testOrg.shape

(int_trainLowLen, int_trainColLen), (int_testLowLen, int_testColLen)

((143614, 8), (15957, 2))

In [6]:
ser_trainIndexs = df_trainOrg.iloc[:,0]
df_trainData = df_trainOrg.iloc[:, 1:]

df_trainData.tail(3)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
143611,"Hello Cindamuse,\n\n1. Please review/edit the ...",0,0,0,0,0,0
143612,I've restored the links to the specific subjec...,0,0,0,0,0,0
143613,"""\n\n An-cap FAQ \n\nWhen making a list of ana...",0,0,0,0,0,0


In [7]:
df_testData = df_testOrg.iloc[:, 1]
df_testData.tail(3)

15954    "\n\n""Actions in la-la land have real world c...
15955    ", 8 April 2009 (UTC)\n\nThanks for your conti...
15956    About personal attacks...I wasn't the one goin...
Name: comment_text, dtype: object

In [8]:
ser_testIndexs = df_testOrg.iloc[:,0]
ser_testText = df_testOrg.iloc[:,1]

ser_testText.tail(3)

15954    "\n\n""Actions in la-la land have real world c...
15955    ", 8 April 2009 (UTC)\n\nThanks for your conti...
15956    About personal attacks...I wasn't the one goin...
Name: comment_text, dtype: object

In [9]:
list(ser_testText.tail(1))

["About personal attacks...I wasn't the one going around and persecuting IP addresses..."]

In [10]:
df_trainData[df_trainData['toxic'] & df_trainData['severe_toxic']==1].head(3)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
45,Stalking \n\nyou little shit you need to stop ...,1,1,1,0,1,0
84,"F*CK YOU \n\nYou moth*rfucker, if you want me ...",1,1,1,0,1,0
288,"fuck you daniel, you big dickhead 60.241.11.51",1,1,1,0,1,0


In [11]:
df_trainData.iloc[5,:]['comment_text'], df_trainData.iloc[5,2:]

('HELP\n\nMy userpage is SCREWED', severe_toxic     0
 obscene          0
 threat           0
 insult           0
 identity_hate    0
 Name: 5, dtype: object)

### 分词

In [12]:
import string
set_punt = set(string.punctuation)
print(len(set_punt))

32


In [13]:
# 训练集分词
l = []
c = ['cut_words', 'toxic','severe_toxic','obscene','threat','insult','identity_hate']

# label一致化
df_trainCW = pd.DataFrame(columns=c)
for n in c[1:]:
    df_trainCW[n] = df_trainData[n]

# 分词
for sen in df_trainData['comment_text']:
    s_l = wordpunct_tokenize(sen)
    l.append([w for w in s_l if w not in set_punt])
df_trainCW['cut_words'] = l

df_trainCW.head(3)

Unnamed: 0,cut_words,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"[1998, NFC, Championship, This, game, should, ...",0,0,0,0,0,0
1,"[and, I, removed, the, matter, of, fact, state...",0,0,0,0,0,0
2,"[Congratulations, on, making, the, news, and, ...",0,0,0,0,0,0


In [14]:
# 测试集分词
l = []
ser_testCW = pd.Series()

# 分词
for sen in df_testData:
    s_l = wordpunct_tokenize(sen)
    l.append([w for w in s_l if w not in set_punt])
    l.append(wordpunct_tokenize(sen))
ser_testCW = pd.Series(l, name='cut_words')

ser_testCW.head(3)

0    [This, message, is, regarding, the, page, Play...
1    [This, message, is, regarding, the, page, Play...
2    [I, think, this, article, needs, a, little, up...
Name: cut_words, dtype: object

In [15]:
df_testProed = pd.DataFrame(columns=['id', 'cut_words'])
df_testProed['id'] = ser_testIndexs
df_testProed['cut_words'] = ser_testCW
df_testProed.head(3)

Unnamed: 0,id,cut_words
0,612c0c93a5ca94db,"[This, message, is, regarding, the, page, Play..."
1,d5567adaddbc78cb,"[This, message, is, regarding, the, page, Play..."
2,85b845308824202b,"[I, think, this, article, needs, a, little, up..."


In [16]:
from copy import deepcopy

df_trainProed = deepcopy(df_trainCW)
df_trainProed.insert(0, 'id', value=ser_trainIndexs)

df_trainProed.head(3)

Unnamed: 0,id,cut_words,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,07b9e910cc401c3c,"[1998, NFC, Championship, This, game, should, ...",0,0,0,0,0,0
1,a80141fb5bae14ea,"[and, I, removed, the, matter, of, fact, state...",0,0,0,0,0,0
2,1d5c63ba045a058a,"[Congratulations, on, making, the, news, and, ...",0,0,0,0,0,0


In [17]:
# 导出分词结果
from datetime import datetime
t = datetime.now()
s = datetime.strftime(t, '%m%d%H%M')
print(s)

# df_trainProed.to_csv('../Data/CW_train_06082300.csv', sep=',', index=None)
# df_testProed.to_csv('../Data/CW_test_06082300.csv', sep=',', index=None)

06082352


## 单词词向量化

In [62]:
# df_trainProed = pd.read_csv('../Data/CW_train_06082300.csv')
# df_testProed = pd.read_csv('../Data/CW_test_06082300.csv')

In [18]:
df_trainProed.head(3)

Unnamed: 0,id,cut_words,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,07b9e910cc401c3c,"[1998, NFC, Championship, This, game, should, ...",0,0,0,0,0,0
1,a80141fb5bae14ea,"[and, I, removed, the, matter, of, fact, state...",0,0,0,0,0,0
2,1d5c63ba045a058a,"[Congratulations, on, making, the, news, and, ...",0,0,0,0,0,0


In [19]:
df_testProed.head(3)

Unnamed: 0,id,cut_words
0,612c0c93a5ca94db,"[This, message, is, regarding, the, page, Play..."
1,d5567adaddbc78cb,"[This, message, is, regarding, the, page, Play..."
2,85b845308824202b,"[I, think, this, article, needs, a, little, up..."


In [20]:
%%time
# 训练集单词转为50维词向量
model_train = Word2Vec(df_trainProed['cut_words'], size=50)

# 词向量列表vocab_train
vocab_train = model_train.wv.vocab

type(model_train), type(vocab_train)

Wall time: 44.5 s


In [21]:
%%time
# 测试集单词转为50维词向量
model_test = Word2Vec(df_testProed['cut_words'], size=50)

# 词向量列表
vocab_test = model_test.wv.vocab

type(model_test), type(vocab_test)

Wall time: 4.9 s


In [22]:
from datetime import datetime
t = datetime.now()
s = datetime.strftime(t, '%m%d%H%M')
print(s)

# 导出词向量
# model_train.save('../Model/DRM50_%s.model_train.bin'%s)
# model_test.save('../Model/DRM50_%s.model_test.bin'%s)

# 导入模型
# new_model = Word2Vec.load('../Model/model_DRM50.model.bin')

06082353


## 分割数据集

In [23]:
# 导入模型
# model_train = Word2Vec.load('../Model/DRM50_06081800.model_train.bin')
# model_test = Word2Vec.load('../Model/DRM50_06081800.model_test.bin')

type(model_train), type(model_test)

(gensim.models.word2vec.Word2Vec, gensim.models.word2vec.Word2Vec)

In [24]:
# 词向量列表vocab_train
vocab_train = model_train.wv.vocab

vocab_test = model_test.wv.vocab

In [25]:
print(str(vocab_train['1998']))
print(str(vocab_train['matter']))

Vocab(count:237, index:3513, sample_int:4294967296)
Vocab(count:3400, index:355, sample_int:4294967296)


In [26]:
int_trainSize = int(0.7 * int_trainLowLen)
X_train_train, y_train_train = df_trainProed.iloc[:int_trainSize,1], df_trainProed.iloc[:int_trainSize,2:]
X_test_train, y_test_train = df_trainProed.iloc[int_trainSize:,1], df_trainProed.iloc[int_trainSize:,2:]

X_train_train.shape, y_train_train.shape, X_test_train.shape, y_test_train.shape

((100529,), (100529, 6), (43085,), (43085, 6))

In [27]:
X_train_train.head()

0    [1998, NFC, Championship, This, game, should, ...
1    [and, I, removed, the, matter, of, fact, state...
2    [Congratulations, on, making, the, news, and, ...
3    [Be, er, Sheva, North, Railway, Station, An, e...
4    [Speedy, deletion, of, Moviedoody, A, tag, has...
Name: cut_words, dtype: object

In [28]:
# 句子最大最小长度
print('min length', len(min(list(X_train_train)+list(X_test_train), key=len)))
print('max length', len(max(list(X_train_train)+list(X_test_train), key=len)))

min length 1
max length 1411


## id分析

- Vocab(count:237, index:3513, sample_int:4294967296)

In [29]:
%%time
import re
# 自定义方法，用id进行训练
def getWordIndexs(l_tW, voc_t):
    t_l = []
    for sen in l_tW:
        l = []
        for w in sen:
            if w in voc_t.keys():
                r = re.compile('\d*')
                s_l = [n for n in r.findall(str(voc_t[w])) if len(n)]
#             print(s_l)
    
            l.append(int(s_l[1]))
        t_l.append(l)
    return t_l

# test
l = getWordIndexs(list(X_train_train)[:100], vocab_train)
len(list(X_train_train)[0]), len(list(X_train_train)[1]), len(l[0]), len(l[1])

Wall time: 194 ms


In [30]:
X_train_train.shape, X_test_train.shape

((100529,), (43085,))

In [31]:
%%time
# 训练集转为id list
X_train_train_id = getWordIndexs(list(X_train_train), vocab_train)

Wall time: 2min 24s


In [32]:
%%time
# 测试集转为id list
X_test_train_id = getWordIndexs(list(X_test_train), vocab_train)

Wall time: 1min 1s


In [33]:
%%time
# 目标集转为id list
X_test_id = getWordIndexs(list(df_testProed['cut_words']), vocab_train)

Wall time: 23.6 s


In [34]:
print(len(X_train_train_id), len(X_train_train_id[0]))
print(len(X_test_train_id), len(X_test_train_id[0]))
len(X_test_id), len(X_test_id[0])

100529 43
43085 17


(15957, 30)

### 序列填充

In [35]:
# 填充序列（Pad sequences）为长度500
from keras.preprocessing import sequence

max_words = 500
X_train_train_pad = sequence.pad_sequences(X_train_train_id, maxlen=max_words)
X_test_train_pad = sequence.pad_sequences(X_test_train_id, maxlen=max_words)
X_test_pad = sequence.pad_sequences(X_test_id, maxlen=max_words)

In [36]:
X_train_train_pad.shape, X_train_train_pad[0].shape, type(X_train_train_pad[0]), X_test_pad.shape

((100529, 500), (500,), numpy.ndarray, (15957, 500))

### RNN

In [143]:
# 设计情感分析的RNN模型
## 输入是一个最大长度为 max_words的单词序列(技术上说，序列中的整数为单词id)，我们的输出是一个二进制情感标签(0或1)。
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

embedding_size = 200
model=Sequential()
model.add(Embedding(100529, embedding_size, input_length=max_words))
model.add(LSTM(100))

model.add(Dense(50, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(6, activation='softmax'))

# model.add(TimeDistributed(Dense(64, activation='relu')))
# model.add(TimeDistributed(Dropout(0.2)))
# model.add(TimeDistributed(Dense(label_size, activation='softmax')))

model.summary()

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 200)          20105800  
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               120400    
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 306       
Total params: 20,231,556
Trainable params: 20,231,556
Non-trainable params: 0
_________________________________________________________________


In [145]:
# 通过指定在训练时使用的损失函数和优化器以及我们想要测量的任何评估指标来编译我们的模型
# model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

rmsprop = RMSprop(lr=0.01)
model.compile(loss='binary_crossentropy', optimizer=rmsprop, metrics=['accuracy'])

In [147]:
# 训练
## 必须指定两个重要的训练参数——批处理大小（batch size）和训练周期的数量（number of training epochs），
## 它们与我们的模型体系结构一起决定了总的训练时间
model.fit(X_train_train_pad, y_train_train, validation_data=(X_test_train_pad, y_test_train), batch_size=200, epochs=5)

Train on 100529 samples, validate on 43085 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x85992860>

In [148]:
import pickle
with open('../Model/model_0609_.pkl', 'wb')as f:
    pickle.dump(model, f)

#### 预测

In [37]:
import pickle
with open('../Model/model_0608_9668.pkl', 'rb')as f:
    model_load = pickle.load(f)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [149]:
type(model)

keras.engine.sequential.Sequential

In [150]:
%%time
# 预测，y_test_train
lst_pred_y = model.predict(X_test_train_pad)


Wall time: 4min 27s


In [153]:
lst_pred_y[10:20]

array([[1.82848200e-01, 1.60511643e-01, 1.65502951e-01, 1.63118601e-01,
        1.63074300e-01, 1.64944291e-01],
       [1.82906970e-01, 1.59937710e-01, 1.65590256e-01, 1.63054690e-01,
        1.63509399e-01, 1.65000975e-01],
       [1.81877807e-01, 1.60251856e-01, 1.65426418e-01, 1.63436890e-01,
        1.63626999e-01, 1.65379971e-01],
       [7.87857413e-01, 1.35413697e-03, 1.76545326e-02, 1.77339297e-02,
        1.62244052e-01, 1.31559698e-02],
       [8.35841835e-01, 5.97282451e-07, 1.22153135e-02, 7.08516694e-08,
        1.51872486e-01, 6.97427458e-05],
       [1.85793012e-01, 1.59601241e-01, 1.65704131e-01, 1.62240148e-01,
        1.62756830e-01, 1.63904622e-01],
       [1.89592183e-01, 1.54954761e-01, 1.67352289e-01, 1.63048029e-01,
        1.63627610e-01, 1.61425143e-01],
       [2.69826561e-01, 1.15423135e-01, 1.53852746e-01, 1.43993840e-01,
        1.75964937e-01, 1.40938789e-01],
       [2.04441547e-01, 1.46692440e-01, 1.70313820e-01, 1.61634475e-01,
        1.62613675e-01, 

In [154]:
# y_test_train,lst_pred_y
y_test_train[10:20]

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
100539,0,0,0,0,0,0
100540,0,0,0,0,0,0
100541,0,0,0,0,0,0
100542,1,1,1,1,1,0
100543,1,0,0,0,1,0
100544,0,0,0,0,0,0
100545,0,0,0,0,0,0
100546,0,0,0,0,0,0
100547,0,0,0,0,0,0
100548,0,0,0,0,0,0


In [49]:
%%time
# 预测
lst_pred = model.predict(X_test_pad)

Wall time: 1min 43s


In [50]:
print(lst_pred[:10])

[[9.8327398e-03 6.3413382e-04 2.7644634e-03 6.4554811e-04 3.0838251e-03
  1.0440946e-03]
 [9.2369020e-03 6.1842799e-04 2.6097894e-03 6.2918663e-04 2.9490888e-03
  1.0091364e-03]
 [1.0663837e-02 6.2063336e-04 2.9734671e-03 6.6053867e-04 3.2097399e-03
  1.0970235e-03]
 [1.1783272e-02 6.6518784e-04 3.2051206e-03 6.9475174e-04 3.4865141e-03
  1.1422038e-03]
 [4.0790141e-03 3.8927794e-04 1.5596151e-03 6.8193674e-04 1.4919043e-03
  6.3937902e-04]
 [3.7895441e-03 3.8966537e-04 1.4776886e-03 6.6462159e-04 1.4690757e-03
  6.0981512e-04]
 [9.1635561e-01 2.5296253e-01 8.3488011e-01 7.1524084e-02 8.0582464e-01
  1.2031531e-01]
 [9.1497743e-01 3.0999711e-01 8.3989817e-01 9.0931535e-02 8.0369341e-01
  1.4614975e-01]
 [1.3026327e-02 7.5641274e-04 3.2509267e-03 7.4303150e-04 3.9570928e-03
  1.1475980e-03]
 [1.3978660e-02 8.0808997e-04 3.4156442e-03 7.7527761e-04 4.2277575e-03
  1.1887848e-03]]


In [60]:
type(lst_pred), type(lst_pred[6]), lst_pred[6], lst_pred[6][0]

(numpy.ndarray,
 numpy.ndarray,
 array([0.9163556 , 0.25296253, 0.8348801 , 0.07152408, 0.80582464,
        0.12031531], dtype=float32),
 0.9163556)

#### 格式

In [133]:
l_r = []
node = 0.9
for res in lst_pred:
    s_l_r = []
    for n in res:
        if n>node:
            s_l_r.append(1)
        else:
            s_l_r.append(0)
    l_r.append(s_l_r)

In [134]:
l_r[:10]

[[0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0]]

In [135]:
df_result = pd.DataFrame(columns=['id', 'toxic','severe_toxic','obscene','threat','insult','identity_hate'])

df_result['id'] = ser_testIndexs

df_result.head(3)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,612c0c93a5ca94db,,,,,,
1,d5567adaddbc78cb,,,,,,
2,85b845308824202b,,,,,,


In [136]:
df =  deepcopy(df_result).iloc[6:10]
l = l_r[6:10]
print(df)
print(l)

lst_labelNames = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

for i in range(len(l)):
    for j in range(len(lst_labelNames)):
#         print(i, j, end='\t')
        print(l[i][j],end='\t')
        df.iloc[i,j+1] = l[i][j]
    print()
df

                 id toxic severe_toxic obscene threat insult identity_hate
6  6363aa197bb2e5de   NaN          NaN     NaN    NaN    NaN           NaN
7  c76b950dc55f58b7   NaN          NaN     NaN    NaN    NaN           NaN
8  40434968e59e4ecc   NaN          NaN     NaN    NaN    NaN           NaN
9  b12e69c34d679a8e   NaN          NaN     NaN    NaN    NaN           NaN
[[1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
1	0	0	0	0	0	
1	0	0	0	0	0	
0	0	0	0	0	0	
0	0	0	0	0	0	


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,6363aa197bb2e5de,1,0,0,0,0,0
7,c76b950dc55f58b7,1,0,0,0,0,0
8,40434968e59e4ecc,0,0,0,0,0,0
9,b12e69c34d679a8e,0,0,0,0,0,0


In [137]:
%%time
lst_labelNames = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
for i in range(len(l_r)):
    for j in range(len(lst_labelNames)):
        df_result.iloc[i,j+1] = l_r[i][j]

print(df_result.head(3))

                 id toxic severe_toxic obscene threat insult identity_hate
0  612c0c93a5ca94db     0            0       0      0      0             0
1  d5567adaddbc78cb     0            0       0      0      0             0
2  85b845308824202b     0            0       0      0      0             0
Wall time: 1min 34s


In [138]:
df_result.iloc[6:10]

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,6363aa197bb2e5de,1,0,0,0,0,0
7,c76b950dc55f58b7,1,0,0,0,0,0
8,40434968e59e4ecc,0,0,0,0,0,0
9,b12e69c34d679a8e,0,0,0,0,0,0


In [139]:
df_result.to_csv('../Result/result_06090050.csv', sep=',', index=None)

### 单列预测

'toxic', 'severe_toxic','obscene','threat','insult','identity_hate'

In [162]:
y_train_train_toxic                 = y_train_train.iloc[:, 0]
y_train_train_severe_toxic          = y_train_train.iloc[:, 1]
y_train_train_obscene               = y_train_train.iloc[:, 2]
y_train_train_threat                = y_train_train.iloc[:, 3]
y_train_train_insult                = y_train_train.iloc[:, 4]
y_train_train_identity_hate         = y_train_train.iloc[:, 5]

In [164]:
print(y_train_train_toxic.name,
y_train_train_severe_toxic.name,
y_train_train_obscene.name,
y_train_train_threat.name,
y_train_train_insult.name,
y_train_train_identity_hate.name)

toxic severe_toxic obscene threat insult identity_hate


In [171]:
y_test_train_toxic                 = y_test_train.iloc[:, 0]
y_test_train_severe_toxic          = y_test_train.iloc[:, 1]
y_test_train_obscene               = y_test_train.iloc[:, 2]
y_test_train_threat                = y_test_train.iloc[:, 3]
y_test_train_insult                = y_test_train.iloc[:, 4]
y_test_train_identity_hate         = y_test_train.iloc[:, 5]

In [172]:
print(y_test_train_toxic.name,
y_test_train_severe_toxic.name,
y_test_train_obscene.name,
y_test_train_threat.name,
y_test_train_insult.name,
y_test_train_identity_hate.name)

toxic severe_toxic obscene threat insult identity_hate


In [191]:
list(y_train_train_toxic)[:10]

[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]

#### # toxic

In [200]:
embedding_size = 100
model_toxic =Sequential()
model_toxic.add(Embedding(100529, embedding_size, input_length=max_words))
model_toxic.add(LSTM(300))

model_toxic.add(Dense(1, activation='softmax'))


model_toxic.summary()

ValueError: Input 0 is incompatible with layer lstm_10: expected ndim=3, found ndim=2

In [198]:
# model_toxic.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

rmsprop = RMSprop(lr=0.01)
model_toxic.compile(loss='binary_crossentropy', optimizer=rmsprop, metrics=['accuracy'])

In [199]:
model_toxic.fit(X_train_train_pad, list(y_train_train_toxic), validation_data=(X_test_train_pad, list(y_test_train_toxic)), 
                batch_size=200, epochs=1)

Train on 100529 samples, validate on 43085 samples
Epoch 1/1
  2000/100529 [..............................] - ETA: 1:53:09 - loss: 14.4199 - acc: 0.0955

KeyboardInterrupt: 