# LSTM Text classification using Tensorflow 2.0 Alpha

# Tensorflow 2.0 Alpha important information:
- [Official TF2 Alpha](https://www.tensorflow.org/alpha)
- [New Features in TF 2](https://medium.com/tensorflow/whats-coming-in-tensorflow-2-0-d3663832e9b8)
- [Standardizing on Keras for TF2](https://medium.com/tensorflow/standardizing-on-keras-guidance-on-high-level-apis-in-tensorflow-2-0-bad2b04c819a)
- [TF2 GPU installation guide](https://www.tensorflow.org/install/gpu)

# Importing Libraries

In [1]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)

#### Check Tensorflow Version

In [2]:
print(tf.__version__)

2.0.0-alpha0


#### Check GPU detection by tensorflow

In [3]:
x = tf.random.uniform([3, 3])

print("Is there a GPU available: "),
print(tf.test.is_gpu_available())

print("Is the Tensor on GPU #0:  "),
print(x.device.endswith('GPU:0'))

print("Device name: {}".format((x.device)))

Is there a GPU available: 
True
Is the Tensor on GPU #0:  
True
Device name: /job:localhost/replica:0/task:0/device:GPU:0


#### Check if Eager execution is running
- Eager execution enables a more interactive frontend to TensorFlow, the details of which we will discuss much later.
- [Eager basics official guide](https://www.tensorflow.org/tutorials/eager/eager_basics)

In [4]:
print(tf.executing_eagerly())

True


In [5]:
from tensorflow.keras import layers

print(tf.keras.__version__)

2.2.4-tf


# Importing data

In [6]:
df_train  = pd.read_csv("../input/train.csv")
df_test = pd.read_csv('../input/test.csv')

X_train, X_test  = train_test_split(df_train, test_size=0.1, random_state=2019)

In [7]:
df_train[df_train.target==1]

Unnamed: 0,qid,question_text,target
22,0000e91571b60c2fb487,Has the United States become the largest dictatorship in the world?,1
30,00013ceca3f624b09f42,Which babies are more sweeter to their parents? Dark skin babies or light skin babies?,1
110,0004a7fcb2bf73076489,If blacks support school choice and mandatory sentencing for criminals why don't they vote Republican?,1
114,00052793eaa287aff1e1,"I am gay boy and I love my cousin (boy). He is sexy, but I dont know what to do. He is hot, and I want to see his di**. What should I do?",1
115,000537213b01fd77b58a,Which races have the smallest penis?,1
119,00056d45a1ce63856fc6,Why do females find penises ugly?,1
127,0005de07b07a17046e27,How do I marry an American woman for a Green Card? How much do they charge?,1
144,00068875d7c82a5bcf88,"Why do Europeans say they're the superior race, when in fact it took them over 2,000 years until mid 19th century to surpass China's largest economy?",1
156,0006ffd99a6599ff35b3,Did Julius Caesar bring a tyrannosaurus rex on his campaigns to frighten the Celts into submission?,1
167,00075f7061837807c69f,In what manner has Republican backing of 'states rights' been hypocritical and what ways have they actually restricted the ability of states to make their own laws?,1


# Configuration

In [8]:
# config values
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

# Data Preparation

In [9]:
y_train, y_test = X_train['target'].values, X_test['target'].values

X_train = X_train['question_text'].fillna('_NA_').values
X_test = X_test['question_text'].fillna('_NA_').values
X_submission = df_test['question_text'].fillna('_NA_').values

In [10]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_submission = tokenizer.texts_to_sequences(X_submission)

In [11]:
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)
X_submission = pad_sequences(X_submission, maxlen=maxlen)

In [27]:
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.compat.v1.keras.layers import CuDNNLSTM

In [72]:
model1 = Sequential()
model1.add(Embedding(max_features, embed_size, input_length=maxlen))
model1.add(Bidirectional(CuDNNLSTM(128, return_sequences=True)))
model1.add(GlobalMaxPool1D())
model1.add(Dropout(0.2))
model1.add(Dense(64, activation='relu'))
model1.add(Dropout(0.2))
model1.add(Dense(32, activation='relu'))
model1.add(Dropout(0.2))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model1.summary()

Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_21 (Bidirectio (None, 100, 256)          440320    
_________________________________________________________________
global_max_pooling1d_12 (Glo (None, 256)               0         
_________________________________________________________________
dropout_26 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 64)                16448     
_________________________________________________________________
dropout_27 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_29 (Dense)             (None, 32)              

In [73]:
model1.fit(X_train, y_train, batch_size=512, epochs=2, validation_data=(X_test, y_test))

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2


  90624/1175509 [=>............................] - ETA: 1:00:11 - loss: 0.6979 - accuracy: 0.335 - ETA: 39:48 - loss: 0.6796 - accuracy: 0.6348  - ETA: 34:03 - loss: 0.6601 - accuracy: 0.742 - ETA: 30:06 - loss: 0.6385 - accuracy: 0.788 - ETA: 28:17 - loss: 0.6120 - accuracy: 0.817 - ETA: 31:51 - loss: 0.5805 - accuracy: 0.838 - ETA: 30:12 - loss: 0.5477 - accuracy: 0.851 - ETA: 28:59 - loss: 0.5096 - accuracy: 0.863 - ETA: 27:50 - loss: 0.4805 - accuracy: 0.871 - ETA: 26:32 - loss: 0.4623 - accuracy: 0.876 - ETA: 26:12 - loss: 0.4498 - accuracy: 0.881 - ETA: 25:06 - loss: 0.4272 - accuracy: 0.888 - ETA: 24:46 - loss: 0.4173 - accuracy: 0.891 - ETA: 23:52 - loss: 0.4074 - accuracy: 0.894 - ETA: 23:45 - loss: 0.3958 - accuracy: 0.897 - ETA: 23:04 - loss: 0.3877 - accuracy: 0.900 - ETA: 23:01 - loss: 0.3791 - accuracy: 0.902 - ETA: 22:25 - loss: 0.3703 - accuracy: 0.905 - ETA: 22:27 - loss: 0.3623 - accuracy: 0.907 - ETA: 21:55 - loss: 0.3584 - accuracy: 0.908 - ETA: 21:55 - loss: 0.3535

 185856/1175509 [===>..........................] - ETA: 9:26 - loss: 0.1988 - accuracy: 0.93 - ETA: 9:24 - loss: 0.1986 - accuracy: 0.93 - ETA: 9:23 - loss: 0.1982 - accuracy: 0.93 - ETA: 9:21 - loss: 0.1978 - accuracy: 0.93 - ETA: 9:19 - loss: 0.1973 - accuracy: 0.93 - ETA: 9:17 - loss: 0.1969 - accuracy: 0.93 - ETA: 9:15 - loss: 0.1966 - accuracy: 0.93 - ETA: 9:13 - loss: 0.1964 - accuracy: 0.93 - ETA: 9:11 - loss: 0.1960 - accuracy: 0.93 - ETA: 9:10 - loss: 0.1958 - accuracy: 0.93 - ETA: 9:08 - loss: 0.1956 - accuracy: 0.93 - ETA: 9:06 - loss: 0.1953 - accuracy: 0.93 - ETA: 9:04 - loss: 0.1950 - accuracy: 0.93 - ETA: 9:03 - loss: 0.1947 - accuracy: 0.93 - ETA: 9:01 - loss: 0.1946 - accuracy: 0.93 - ETA: 8:59 - loss: 0.1943 - accuracy: 0.93 - ETA: 8:57 - loss: 0.1939 - accuracy: 0.93 - ETA: 8:56 - loss: 0.1937 - accuracy: 0.93 - ETA: 8:54 - loss: 0.1936 - accuracy: 0.93 - ETA: 8:52 - loss: 0.1933 - accuracy: 0.93 - ETA: 8:51 - loss: 0.1930 - accuracy: 0.93 - ETA: 8:49 - loss: 0.1930 





















Epoch 2/2


  95232/1175509 [=>............................] - ETA: 5:38 - loss: 0.0741 - accuracy: 0.97 - ETA: 5:07 - loss: 0.0755 - accuracy: 0.96 - ETA: 4:54 - loss: 0.0926 - accuracy: 0.96 - ETA: 4:46 - loss: 0.0873 - accuracy: 0.96 - ETA: 4:43 - loss: 0.0844 - accuracy: 0.96 - ETA: 4:40 - loss: 0.0837 - accuracy: 0.96 - ETA: 4:38 - loss: 0.0836 - accuracy: 0.96 - ETA: 4:36 - loss: 0.0863 - accuracy: 0.96 - ETA: 4:36 - loss: 0.0903 - accuracy: 0.96 - ETA: 4:35 - loss: 0.0920 - accuracy: 0.96 - ETA: 4:34 - loss: 0.0944 - accuracy: 0.96 - ETA: 4:34 - loss: 0.0938 - accuracy: 0.96 - ETA: 4:34 - loss: 0.0956 - accuracy: 0.96 - ETA: 4:33 - loss: 0.0953 - accuracy: 0.96 - ETA: 4:33 - loss: 0.0945 - accuracy: 0.96 - ETA: 4:33 - loss: 0.0946 - accuracy: 0.96 - ETA: 4:33 - loss: 0.0941 - accuracy: 0.96 - ETA: 4:32 - loss: 0.0929 - accuracy: 0.96 - ETA: 4:32 - loss: 0.0926 - accuracy: 0.96 - ETA: 4:32 - loss: 0.0930 - accuracy: 0.96 - ETA: 4:31 - loss: 0.0916 - accuracy: 0.96 - ETA: 4:31 - loss: 0.0936 

 190464/1175509 [===>..........................] - ETA: 4:12 - loss: 0.0954 - accuracy: 0.96 - ETA: 4:12 - loss: 0.0954 - accuracy: 0.96 - ETA: 4:12 - loss: 0.0954 - accuracy: 0.96 - ETA: 4:12 - loss: 0.0955 - accuracy: 0.96 - ETA: 4:12 - loss: 0.0956 - accuracy: 0.96 - ETA: 4:12 - loss: 0.0957 - accuracy: 0.96 - ETA: 4:11 - loss: 0.0958 - accuracy: 0.96 - ETA: 4:11 - loss: 0.0959 - accuracy: 0.96 - ETA: 4:11 - loss: 0.0958 - accuracy: 0.96 - ETA: 4:11 - loss: 0.0959 - accuracy: 0.96 - ETA: 4:11 - loss: 0.0959 - accuracy: 0.96 - ETA: 4:11 - loss: 0.0960 - accuracy: 0.96 - ETA: 4:11 - loss: 0.0959 - accuracy: 0.96 - ETA: 4:10 - loss: 0.0958 - accuracy: 0.96 - ETA: 4:10 - loss: 0.0958 - accuracy: 0.96 - ETA: 4:10 - loss: 0.0960 - accuracy: 0.96 - ETA: 4:10 - loss: 0.0960 - accuracy: 0.96 - ETA: 4:10 - loss: 0.0960 - accuracy: 0.96 - ETA: 4:10 - loss: 0.0960 - accuracy: 0.96 - ETA: 4:10 - loss: 0.0961 - accuracy: 0.96 - ETA: 4:10 - loss: 0.0961 - accuracy: 0.96 - ETA: 4:09 - loss: 0.0960 























<tensorflow.python.keras.callbacks.History at 0x26303e1c4a8>

In [74]:
from sklearn import metrics

pred_test_y = model1.predict([X_test], batch_size=1024, verbose=1)




In [75]:
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print('F1 score at threshold {} is {}'.format(thresh, metrics.f1_score(y_test, (pred_test_y > thresh).astype(int))))

F1 score at threshold 0.1 is 0.597693436779199
F1 score at threshold 0.11 is 0.604396544350355
F1 score at threshold 0.12 is 0.6106052023876954
F1 score at threshold 0.13 is 0.6145053387089628
F1 score at threshold 0.14 is 0.6187367757619412
F1 score at threshold 0.15 is 0.6233255611941664
F1 score at threshold 0.16 is 0.6276600676897399
F1 score at threshold 0.17 is 0.6305062458908612
F1 score at threshold 0.18 is 0.6332207903371535
F1 score at threshold 0.19 is 0.6363505263663894
F1 score at threshold 0.2 is 0.6377206204113386
F1 score at threshold 0.21 is 0.640732716170967
F1 score at threshold 0.22 is 0.6436896105189759
F1 score at threshold 0.23 is 0.6454018644494836
F1 score at threshold 0.24 is 0.6475142624286878
F1 score at threshold 0.25 is 0.6487516087516086
F1 score at threshold 0.26 is 0.6489859594383776
F1 score at threshold 0.27 is 0.6489467878342176
F1 score at threshold 0.28 is 0.6499894000423998
F1 score at threshold 0.29 is 0.6510009634942725
F1 score at threshold 0.3

In [None]:
pred_submission_y = model1.predict([X_submission], batch_size=1024, verbose=1)
pred_submission_y = (pred_submission_y > 0.29).astype(int)

df_submission = pd.DataFrame({'qid': df_test['qid'].values})
df_submission['prediction'] = pred_submission_y
df_submission.to_csv("submission.csv", index=False)

In [68]:



inp = Input(shape=(maxlen,))
layer = Embedding(max_features, embed_size)(inp)
layer = Bidirectional(LSTM(64, return_sequences=True))(layer)
layer = GlobalMaxPool1D()(layer)
layer = Dense(16, activation="relu")(layer)
layer = Dropout(0.1)(layer)
layer = Dense(1, activation="sigmoid")(layer)
model = Model(inputs=inp, outputs=layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

W0325 16:52:51.813960 22872 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x000002630280D3C8>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.
W0325 16:52:51.838358 22872 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x0000026300FD47B8>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_17 (Embedding)     (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_19 (Bidirectio (None, 100, 128)          186880    
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 128)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 16)                2064      
_________________________________________________________________
dropout_22 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 17  

In [29]:
model.fit(X_train, y_train, batch_size=512, epochs=2, validation_data=(X_test, y_test))

ValueError: Error when checking input: expected bidirectional_3_input to have 3 dimensions, but got array with shape (1175509, 100)

In [15]:
from sklearn import metrics

pred_test_y = model.predict([X_test], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print('F1 score at threshold {} is {}'.format(thresh, metrics.f1_score(y_test, (pred_test_y > thresh).astype(int))))

F1 score at threshold 0.1 is 0.5689291572287746
F1 score at threshold 0.11 is 0.5793673068441522
F1 score at threshold 0.12 is 0.5877956636005256
F1 score at threshold 0.13 is 0.5941615636272125
F1 score at threshold 0.14 is 0.600350382429603
F1 score at threshold 0.15 is 0.6048229415598523
F1 score at threshold 0.16 is 0.610654651470978
F1 score at threshold 0.17 is 0.6154812654251739
F1 score at threshold 0.18 is 0.621152004378164
F1 score at threshold 0.19 is 0.624618514750763
F1 score at threshold 0.2 is 0.6282183316168899
F1 score at threshold 0.21 is 0.6308216584627035
F1 score at threshold 0.22 is 0.6355346515645997
F1 score at threshold 0.23 is 0.6385705962322781
F1 score at threshold 0.24 is 0.6406004709576137
F1 score at threshold 0.25 is 0.6432394924662965
F1 score at threshold 0.26 is 0.6452484611920133
F1 score at threshold 0.27 is 0.6482960865608252
F1 score at threshold 0.28 is 0.6500587064168665
F1 score at threshold 0.29 is 0.651009136426986
F1 score at threshold 0.3 i

In [17]:
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print('F1 score at threshold {} is {}'.format(thresh, metrics.f1_score(y_test, (pred_test_y > thresh).astype(int))))

F1 score at threshold 0.1 is 0.5689291572287746
F1 score at threshold 0.11 is 0.5793673068441522
F1 score at threshold 0.12 is 0.5877956636005256
F1 score at threshold 0.13 is 0.5941615636272125
F1 score at threshold 0.14 is 0.600350382429603
F1 score at threshold 0.15 is 0.6048229415598523
F1 score at threshold 0.16 is 0.610654651470978
F1 score at threshold 0.17 is 0.6154812654251739
F1 score at threshold 0.18 is 0.621152004378164
F1 score at threshold 0.19 is 0.624618514750763
F1 score at threshold 0.2 is 0.6282183316168899
F1 score at threshold 0.21 is 0.6308216584627035
F1 score at threshold 0.22 is 0.6355346515645997
F1 score at threshold 0.23 is 0.6385705962322781
F1 score at threshold 0.24 is 0.6406004709576137
F1 score at threshold 0.25 is 0.6432394924662965
F1 score at threshold 0.26 is 0.6452484611920133
F1 score at threshold 0.27 is 0.6482960865608252
F1 score at threshold 0.28 is 0.6500587064168665
F1 score at threshold 0.29 is 0.651009136426986
F1 score at threshold 0.3 i

# Importing Embeddings

In [14]:
# embdedding setup
# Source https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
embeddings_index = {}
f = open('../input/embeddings/glove.840B.300d/glove.840B.300d.txt', encoding="utf8")
for line in tqdm(f):
    values = line.split(" ")
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


0it [00:00, ?it/s]
1084it [00:00, 10783.24it/s]
2166it [00:00, 10777.21it/s]
3197it [00:00, 10627.73it/s]
4245it [00:00, 10579.37it/s]
5230it [00:00, 10333.41it/s]
6285it [00:00, 10381.26it/s]
7261it [00:00, 10168.74it/s]
8304it [00:00, 10230.03it/s]
9368it [00:00, 10333.43it/s]
10398it [00:01, 10307.34it/s]
11400it [00:01, 10083.57it/s]
12389it [00:01, 9919.55it/s] 
13426it [00:01, 10035.86it/s]
14511it [00:01, 10259.82it/s]
15532it [00:01, 9964.07it/s] 
16558it [00:01, 10037.65it/s]
17590it [00:01, 10103.87it/s]
18600it [00:01, 9898.09it/s] 
19591it [00:01, 9842.95it/s]
20576it [00:02, 9800.36it/s]
21557it [00:02, 9688.55it/s]
22527it [00:02, 9536.44it/s]
23488it [00:02, 9544.36it/s]
24468it [00:02, 9604.72it/s]
25430it [00:02, 9538.40it/s]
26449it [00:02, 9710.26it/s]
27484it [00:02, 9878.78it/s]
28530it [00:02, 10044.44it/s]
29584it [00:02, 10172.62it/s]
30608it [00:03, 10175.55it/s]
31671it [00:03, 10293.03it/s]
32702it [00:03, 10132.65it/s]
33744it [00:03, 10201.29it/s]
34825it 

277657it [00:27, 9669.23it/s]
278704it [00:27, 9880.96it/s]
279697it [00:27, 9879.70it/s]
280688it [00:28, 9687.75it/s]
281683it [00:28, 9749.82it/s]
282691it [00:28, 9831.27it/s]
283717it [00:28, 9939.87it/s]
284713it [00:28, 9744.78it/s]
285690it [00:28, 9653.43it/s]
286657it [00:28, 9505.63it/s]
287610it [00:28, 9292.51it/s]
288542it [00:28, 9231.54it/s]
289467it [00:29, 9169.83it/s]
290456it [00:29, 9359.10it/s]
291476it [00:29, 9582.08it/s]
292437it [00:29, 9437.23it/s]
293432it [00:29, 9571.58it/s]
294397it [00:29, 9578.73it/s]
295357it [00:29, 9570.02it/s]
296325it [00:29, 9587.60it/s]
297299it [00:29, 9618.94it/s]
298262it [00:29, 9414.54it/s]
299205it [00:30, 9322.61it/s]
300222it [00:30, 9547.40it/s]
301179it [00:30, 9510.75it/s]
302149it [00:30, 9551.36it/s]
303160it [00:30, 9698.60it/s]
304194it [00:30, 9866.50it/s]
305221it [00:30, 9967.61it/s]
306255it [00:30, 10061.20it/s]
307263it [00:30, 9878.84it/s] 
308253it [00:30, 9812.26it/s]
309253it [00:31, 9848.67it/s]
310239it

553711it [00:54, 10460.58it/s]
554759it [00:55, 10465.30it/s]
555807it [00:55, 10316.90it/s]
556858it [00:55, 10356.84it/s]
557895it [00:55, 10297.32it/s]
558944it [00:55, 10338.99it/s]
560002it [00:55, 10393.92it/s]
561042it [00:55, 10113.36it/s]
562056it [00:55, 10046.76it/s]
563087it [00:55, 10108.58it/s]
564155it [00:55, 10256.68it/s]
565182it [00:56, 10039.25it/s]
566251it [00:56, 10210.49it/s]
567321it [00:56, 10336.74it/s]
568383it [00:56, 10402.33it/s]
569427it [00:56, 10407.48it/s]
570469it [00:56, 10129.48it/s]
571485it [00:56, 8947.94it/s] 
572407it [00:56, 8635.70it/s]
573292it [00:56, 8200.71it/s]
574266it [00:57, 8607.50it/s]
575327it [00:57, 9111.72it/s]
576401it [00:57, 9544.77it/s]
577411it [00:57, 9701.17it/s]
578482it [00:57, 9968.63it/s]
579543it [00:57, 10138.48it/s]
580567it [00:57, 10093.34it/s]
581596it [00:57, 10136.51it/s]
582615it [00:57, 10107.04it/s]
583654it [00:57, 10174.54it/s]
584719it [00:58, 10296.83it/s]
585759it [00:58, 10311.36it/s]
586844it [00:58

828541it [01:22, 10354.17it/s]
829579it [01:22, 10345.56it/s]
830616it [01:22, 10100.80it/s]
831629it [01:22, 9892.20it/s] 
832621it [01:22, 9798.57it/s]
833658it [01:22, 9961.94it/s]
834657it [01:22, 9537.48it/s]
835653it [01:23, 9645.61it/s]
836622it [01:23, 9532.51it/s]
837597it [01:23, 9593.16it/s]
838628it [01:23, 9797.00it/s]
839677it [01:23, 9980.25it/s]
840678it [01:23, 9957.96it/s]
841737it [01:23, 10124.15it/s]
842780it [01:23, 10198.21it/s]
843868it [01:23, 10376.59it/s]
844908it [01:23, 10338.38it/s]
845944it [01:24, 10327.34it/s]
846982it [01:24, 10326.66it/s]
848034it [01:24, 10367.96it/s]
849110it [01:24, 10467.45it/s]
850158it [01:24, 10422.99it/s]
851206it [01:24, 10423.90it/s]
852249it [01:24, 9927.14it/s] 
853247it [01:24, 9702.58it/s]
854237it [01:24, 9745.76it/s]
855304it [01:25, 9989.89it/s]
856342it [01:25, 10088.19it/s]
857418it [01:25, 10263.96it/s]
858475it [01:25, 10353.26it/s]
859554it [01:25, 10475.07it/s]
860604it [01:25, 10241.93it/s]
861631it [01:25, 100

1098808it [01:49, 10797.29it/s]
1099888it [01:49, 10748.45it/s]
1100963it [01:49, 10168.27it/s]
1102008it [01:49, 10234.07it/s]
1103081it [01:49, 10362.16it/s]
1104162it [01:49, 10477.68it/s]
1105254it [01:50, 10587.94it/s]
1106327it [01:50, 10614.64it/s]
1107397it [01:50, 10622.22it/s]
1108475it [01:50, 10646.56it/s]
1109557it [01:50, 10680.84it/s]
1110632it [01:50, 10697.92it/s]
1111703it [01:50, 10639.43it/s]
1112793it [01:50, 10713.43it/s]
1113865it [01:50, 10711.72it/s]
1114937it [01:50, 10709.94it/s]
1116009it [01:51, 10677.56it/s]
1117077it [01:51, 10660.97it/s]
1118157it [01:51, 10686.36it/s]
1119235it [01:51, 10697.96it/s]
1120314it [01:51, 10707.27it/s]
1121385it [01:51, 10692.43it/s]
1122463it [01:51, 10700.40it/s]
1123558it [01:51, 10758.58it/s]
1124635it [01:51, 10743.82it/s]
1125723it [01:51, 10767.48it/s]
1126800it [01:52, 10751.15it/s]
1127876it [01:52, 10749.51it/s]
1128963it [01:52, 10769.64it/s]
1130065it [01:52, 10824.34it/s]
1131162it [01:52, 10846.02it/s]
1132250i

1370958it [02:15, 9455.46it/s]
1371910it [02:15, 9039.07it/s]
1372848it [02:15, 9124.64it/s]
1373825it [02:15, 9295.07it/s]
1374781it [02:16, 9358.46it/s]
1375829it [02:16, 9654.78it/s]
1376852it [02:16, 9805.52it/s]
1377883it [02:16, 9934.18it/s]
1378930it [02:16, 10074.80it/s]
1380010it [02:16, 10264.61it/s]
1381091it [02:16, 10405.65it/s]
1382146it [02:16, 10432.99it/s]
1383191it [02:16, 10329.86it/s]
1384226it [02:16, 10200.10it/s]
1385248it [02:17, 10043.25it/s]
1386276it [02:17, 10097.84it/s]
1387287it [02:17, 10055.33it/s]
1388338it [02:17, 10171.55it/s]
1389395it [02:17, 10273.28it/s]
1390441it [02:17, 10311.32it/s]
1391473it [02:17, 10238.91it/s]
1392498it [02:17, 10136.32it/s]
1393513it [02:17, 9566.47it/s] 
1394527it [02:17, 9714.59it/s]
1395540it [02:18, 9827.46it/s]
1396599it [02:18, 10029.93it/s]
1397615it [02:18, 10053.29it/s]
1398624it [02:18, 7257.99it/s] 
1399583it [02:18, 7819.02it/s]
1400456it [02:18, 7952.43it/s]
1401316it [02:18, 7463.45it/s]
1402219it [02:18, 786

1639065it [02:42, 10888.33it/s]
1640168it [02:42, 10912.81it/s]
1641271it [02:42, 10929.48it/s]
1642365it [02:42, 10852.39it/s]
1643451it [02:42, 10838.63it/s]
1644536it [02:42, 10760.85it/s]
1645619it [02:42, 10765.74it/s]
1646710it [02:42, 10791.64it/s]
1647790it [02:43, 10561.07it/s]
1648848it [02:43, 10562.27it/s]
1649906it [02:43, 10369.23it/s]
1650945it [02:43, 10036.32it/s]
1652011it [02:43, 10199.31it/s]
1653036it [02:43, 10197.19it/s]
1654127it [02:43, 10396.13it/s]
1655170it [02:43, 10373.73it/s]
1656210it [02:43, 10304.23it/s]
1657273it [02:44, 10384.55it/s]
1658313it [02:44, 10312.55it/s]
1659346it [02:44, 9896.89it/s] 
1660340it [02:44, 9532.98it/s]
1661314it [02:44, 9579.22it/s]
1662369it [02:44, 9835.56it/s]
1663358it [02:44, 9168.92it/s]
1664423it [02:44, 9554.44it/s]
1665483it [02:44, 9832.33it/s]
1666517it [02:44, 9963.16it/s]
1667602it [02:45, 10199.17it/s]
1668629it [02:45, 9523.55it/s] 
1669596it [02:45, 8447.66it/s]
1670566it [02:45, 8774.61it/s]
1671568it [02:45,

1912045it [03:08, 10541.56it/s]
1913125it [03:08, 10602.56it/s]
1914190it [03:08, 10600.00it/s]
1915291it [03:08, 10702.11it/s]
1916369it [03:09, 10709.70it/s]
1917441it [03:09, 10695.85it/s]
1918522it [03:09, 10712.03it/s]
1919594it [03:09, 10697.60it/s]
1920665it [03:09, 10622.26it/s]
1921740it [03:09, 10659.44it/s]
1922807it [03:09, 10084.70it/s]
1923875it [03:09, 10240.58it/s]
1924911it [03:09, 10259.98it/s]
1925961it [03:09, 10314.92it/s]
1927033it [03:10, 10418.31it/s]
1928119it [03:10, 10530.83it/s]
1929174it [03:10, 10411.79it/s]
1930232it [03:10, 10458.65it/s]
1931314it [03:10, 10537.33it/s]
1932379it [03:10, 10569.76it/s]
1933437it [03:10, 9402.49it/s] 
1934484it [03:10, 9684.23it/s]
1935569it [03:10, 9991.84it/s]
1936585it [03:11, 9827.57it/s]
1937580it [03:11, 8918.75it/s]
1938615it [03:11, 9290.45it/s]
1939672it [03:11, 9626.87it/s]
1940727it [03:11, 9885.25it/s]
1941807it [03:11, 10115.72it/s]
1942830it [03:11, 9631.91it/s] 
1943829it [03:11, 9720.75it/s]
1944914it [03:11

2186341it [03:34, 10660.61it/s]
2187408it [03:34, 10616.72it/s]
2188470it [03:34, 10598.49it/s]
2189561it [03:35, 10673.54it/s]
2190643it [03:35, 10700.32it/s]
2191714it [03:35, 10209.01it/s]
2192793it [03:35, 10360.92it/s]
2193857it [03:35, 10437.22it/s]
2194919it [03:35, 10476.37it/s]
2195975it [03:35, 10484.00it/s]
2196017it [03:35, 10181.93it/s]

Found 2196016 word vectors.


In [54]:
t = sorted(embeddings_index.items(), key=lambda x:-x[1])[:3]

for x in t:
    print ("{0}: {1}".format(*x))

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [48]:
embeddings_index.most_common()

AttributeError: 'dict' object has no attribute 'most_common'

In [34]:
# Convert values to embeddings
def text_to_array(text):
    empyt_emb = np.zeros(300)
    text = text[:-1].split()[:30]
    embeds = [embeddings_index.get(x, empyt_emb) for x in text]
    embeds+= [empyt_emb] * (30 - len(embeds))
    return np.array(embeds)

# train_vects = [text_to_array(X_text) for X_text in tqdm(train_df["question_text"])]
val_vects = np.array([text_to_array(X_text) for X_text in tqdm(val_df["question_text"][:3000])])
val_y = np.array(val_df["target"][:3000])


  0%|                                                                                         | 0/3000 [00:00<?, ?it/s]
  0%|▏                                                                                | 8/3000 [00:00<00:39, 76.05it/s]
  1%|▋                                                                               | 24/3000 [00:00<00:33, 90.08it/s]
  2%|█▎                                                                             | 51/3000 [00:00<00:26, 112.52it/s]
  3%|██▎                                                                            | 86/3000 [00:00<00:20, 141.15it/s]
  5%|███▊                                                                          | 145/3000 [00:00<00:15, 182.45it/s]
  7%|█████▌                                                                        | 216/3000 [00:00<00:11, 234.76it/s]
 11%|████████▎                                                                     | 320/3000 [00:00<00:08, 305.62it/s]
 14%|███████████▎                      

In [35]:
# Data providers
batch_size = 128

def batch_gen(train_df):
    n_batches = math.ceil(len(train_df) / batch_size)
    while True: 
        train_df = train_df.sample(frac=1.)  # Shuffle the data.
        for i in range(n_batches):
            texts = train_df.iloc[i*batch_size:(i+1)*batch_size, 1]
            text_arr = np.array([text_to_array(text) for text in texts])
            return text_arr, np.array(train_df["target"][i*batch_size:(i+1)*batch_size])

In [36]:
mg = batch_gen(train_df)

In [37]:
mg

(array([[[-0.46171999,  0.33236   , -0.29076999, ...,  0.39179999,
           0.35146999,  0.27331001],
         [-0.014531  , -0.071761  , -0.59626001, ..., -0.12443   ,
           0.63753998,  0.12262   ],
         [ 0.19410001,  0.22603001, -0.43764001, ...,  0.091957  ,
           0.38631999,  0.11736   ],
         ...,
         [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ]],
 
        [[-0.31887001,  0.1944    , -0.11692   , ...,  0.22766   ,
           0.4824    ,  0.35543001],
         [-0.13563   ,  0.33217001, -0.36019999, ..., -0.17296   ,
           0.21675999,  0.22205999],
         [ 0.085181  ,  0.50892001, -0.08828   , ...,  0.43678999,
           0.29784   , -0.08815   ],
         ...,
         [ 0.        ,  0.       

In [17]:
import tensorflow as tf

In [42]:
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [44]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(10, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [45]:
model.fit(x_train, y_train, epochs=5)

model.evaluate(x_test, y_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.08060977021257859, 0.9748]

In [15]:
import numpy as np

In [16]:

import tensorflow as tf


In [17]:
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

AttributeError: module 'tensorflow' has no attribute 'Session'

In [18]:
import sys, os, re, csv, codecs, numpy as np
import pandas as pd


In [19]:
np.mean(2)

2.0

In [41]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

AttributeError: module 'tensorflow' has no attribute 'get_default_session'

In [21]:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [28]:
path = '../input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
EMBEDDING_FILE=f'{path}embeddings/glove6b50d/glove.6B.50d.txt'
TRAIN_DATA_FILE=f'{path}train.csv'
TEST_DATA_FILE=f'{path}test.csv'

In [29]:
embed_size = 50 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use

In [30]:
os.getcwd()

'C:\\Users\\Advait\\Downloads\\Data Science\\Kaggle\\Quora Insincere Questions NLP\\code'

In [31]:
TRAIN_DATA_FILE

'../input/train.csv'

In [32]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)



In [33]:
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [34]:
list_sentences_train = train["question_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("_na_").values

KeyError: "['toxic' 'severe_toxic' 'obscene' 'threat' 'insult' 'identity_hate'] not in index"

In [35]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train))
list_tokenized_train = tokenizer.texts_to_sequences(train)
list_tokenized_test = tokenizer.texts_to_sequences(test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [40]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

FileNotFoundError: [Errno 2] No such file or directory: '../input/embeddings/glove6b50d/glove.6B.50d.txt'

In [39]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

NameError: name 'embeddings_index' is not defined

In [None]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [None]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_t, y, batch_size=32, epochs=2, validation_split=0.1);

In [None]:
y_test = model.predict([X_te], batch_size=1024, verbose=1)
sample_submission = pd.read_csv(f'{path}{comp}sample_submission.csv')
sample_submission[list_classes] = y_test