This is my first submission in Kaggle. I am using LSTM with an Embeddding Layer. I have included GloVe6b50.txt for word2vector conversion. I have commented on each step of what I'm doing in the code. Feel free to comment/suggest/point out my mistakes. Cheers!

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/glove6b50dtxt/glove.6B.50d.txt


In [2]:
#Importing the Libraries

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from sklearn.metrics import roc_auc_score

import keras.backend as K

Using TensorFlow backend.


In [3]:
#I've added glove vectors in the input. https://www.kaggle.com/rtatman/glove-global-vectors-for-word-representation

#Loading the word vectors to convert the words from the tweets to vector format

print('Loading word vectors...')
word2vec = {}
with open(os.path.join('../input/glove6b50dtxt/glove.6B.50d.txt'), encoding = "utf-8") as f:
  # is just a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split() #split at space
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32') #numpy.asarray()function is used when we want to convert input to an array.
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))


Loading word vectors...
Found 400000 word vectors.


In [4]:
#Reading the tweets to dataframe

print('Loading in tweets...')

train = pd.read_csv("../input/nlp-getting-started/train.csv")
train.head()

Loading in tweets...


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
#Removing (dropping) the columns 'keyword' and 'location'as we're concerned with the tweets' text

train = train.drop(["keyword","location"],axis=1)
train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
#Storing the values of tweets and target in respective variables 

tweets = train["text"].values
target = train["target"].values


In [7]:
#Tokenizing the words

tokenizer = Tokenizer(num_words=20000) #vectorize a text corpus, by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) or into a vector where the coefficient for each token could be binary, based on word count, based on tf-idf.
tokenizer.fit_on_texts(tweets) #Updates internal vocabulary based on a list of texts.
sequences = tokenizer.texts_to_sequences(tweets) #Converts a text to a sequence of words (or tokens).


In [8]:
#Creating an array for indexing each word 

word2idx = tokenizer.word_index #indexing each word from vector list
print('Found %s unique tokens.' % len(word2idx))

data = pad_sequences(sequences,100) #padding each tweet vector with 0s to a uniform length of 100
print('Shape of data tensor:', data.shape)


Found 22700 unique tokens.
Shape of data tensor: (7613, 100)


In [9]:
print('Filling pre-trained embeddings...')
num_words = min(20000, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, 50)) #fill array embedding_matrix with 0s with size num_words, embedding_matrix i.e. 20000,50


Filling pre-trained embeddings...


In [10]:
#Creating an embedding matrix to create the embedding layer for LSTM

embedding1=[]
for word, i in word2idx.items():
    if i < 20000:
        embedding1 = word2vec.get(word)
        if embedding1 is not None:
            embedding_matrix[i] = embedding1


In [11]:
#Embedding layer

embedding_layer = Embedding( #Turns positive integers (indexes) into dense vectors of fixed size.
  num_words,
  50,
  weights=[embedding_matrix],
  input_length=100,
  trainable=False
)


In [12]:
#Creating the model

print('Building model...')

# create an LSTM network with a single LSTM
input_ = Input(shape=(100,))
x = embedding_layer(input_)
x = Bidirectional(LSTM(15, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
output = Dense(1, activation="sigmoid")(x)

model = Model(input_, output)
model.compile(
  loss='binary_crossentropy',
  optimizer=Adam(lr=0.01),
  metrics=['accuracy'],
)
model.summary()

Building model...
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 50)           1000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 30)           7920      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 30)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 31        
Total params: 1,007,951
Trainable params: 7,951
Non-trainable params: 1,000,000
_________________________________________________________________


In [13]:
#training the model

print('Training model...')
r = model.fit(
  data,
  target,
  batch_size=128,
  epochs=100,
  validation_split=0.2
)

print("Done with the Training")


Training model...
Train on 6090 samples, validate on 1523 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 7

In [14]:
#Repeating the steps for test dataset to predict the values obtained from the model training

print("Loading in the test dataset\n")

test = pd.read_csv("../input/nlp-getting-started/test.csv")
test.head()
test = test.drop(["keyword","location"],axis=1)
tweets_test = test["text"].values

tokenizer = Tokenizer(num_words=20000) #vectorize a text corpus, by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) or into a vector where the coefficient for each token could be binary, based on word count, based on tf-idf.
tokenizer.fit_on_texts(tweets_test) #Updates internal vocabulary based on a list of texts.
sequences = tokenizer.texts_to_sequences(tweets_test) #Converts a text to a sequence of words (or tokens).
word2idx = tokenizer.word_index #indexing each word from vector list
print('Found %s unique tokens.' % len(word2idx))

data = pad_sequences(sequences,100)
print('Shape of data tensor:', data.shape)


print("Predictions:\n\n")
test['target'] = model.predict(data) #predicting the data
test.head()


Loading in the test dataset

Found 12818 unique tokens.
Shape of data tensor: (3263, 100)
Predictions:




Unnamed: 0,id,text,target
0,0,Just happened a terrible car crash,0.996499
1,2,"Heard about #earthquake is different cities, s...",3e-06
2,3,"there is a forest fire at spot pond, geese are...",0.997566
3,9,Apocalypse lighting. #Spokane #wildfires,0.900481
4,11,Typhoon Soudelor kills 28 in China and Taiwan,0.000174


In [15]:
#Storing the contents of the test dataset into a csv file

import csv
test = test.drop(["text"],axis=1)
test.to_csv("sample_submission.csv",index=False)

In [16]:
#Reading the csv file

sub = pd.read_csv("sample_submission.csv")
sub.head()

Unnamed: 0,id,target
0,0,0.996499
1,2,3e-06
2,3,0.997566
3,9,0.900481
4,11,0.000174
