In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/glove-embeddings/glove.6B.200d.txt
/kaggle/input/glove-embeddings/glove.6B.50d.txt
/kaggle/input/glove-embeddings/glove.6B.300d.txt
/kaggle/input/glove-embeddings/glove.6B.100d.txt


In [2]:
train=pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test=pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# **EDA**

In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [6]:
train[train['target']==0]['text'].values[1]

'I love fruits'

In [7]:
train[train['target']==1]['text'].values[1]

'Forest fire near La Ronge Sask. Canada'

# **Data preprocessing**

Removing stopwords and punctuations

In [8]:
#finding punctuations
import string
import nltk
from nltk.corpus import stopwords
punctuations=list(string.punctuation)
stopwords=list(stopwords.words('english'))
remove=punctuations+stopwords

Converting words to their lemma form

In [9]:
from nltk.stem import WordNetLemmatizer
lemma=WordNetLemmatizer()


Defining a function to preprocess the messages

In [10]:
import re
def preprocess(tweet):
    tweet = re.sub(r"https?:\/\/t.co\/[A-Za-z0-9]+", "", tweet) # removing urls 
    tweet = re.sub('[^\w]',' ',tweet) # remove embedded special characters in words (for example #earthquake)         
    tweet = re.sub('[\d]','',tweet) # this will remove numeric characters
    tweet = tweet.lower()
    words = tweet.split()  
    sentence=''
    for word in words:
        if word not in remove:
            word=lemma.lemmatize(word,pos='v')
            sentence+=word+' '
    return sentence
    

In [11]:
train['text']=train['text'].apply(lambda s: preprocess(s))
test['text']=test['text'].apply(lambda s: preprocess(s))

Remove emojis

In [12]:
def remove_emojis(tweets):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', tweets)

In [13]:
train['text']=train['text'].apply(lambda s: remove_emojis(s))
test['text']=test['text'].apply(lambda s: remove_emojis(s))

In [14]:
X=train.text
y=train.target

In [15]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)

In [16]:
# Bag of Words model
from keras.preprocessing.text import Tokenizer

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)    #internal vocabulary is updated based on the texts
    return tokenizer

In [17]:
tokenizer=create_tokenizer(X_train)
X_train_set=tokenizer.texts_to_matrix(X_train, mode = 'freq')

In [18]:
X_test_set=tokenizer.texts_to_matrix(X_test, mode = 'freq')

# **NN trial**

In [19]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
model=tf.keras.Sequential()
model.add(tf.keras.layers.Dense(128,input_shape=((X_train_set.shape[1]),),activation='relu'))
#model.add(tf.keras.layers.Dense(128,activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))

model.add(tf.keras.layers.Dense(1,activation='sigmoid'))

In [20]:
model.compile(optimizer='adam',
 loss='binary_crossentropy',
 metrics=['accuracy'])

In [21]:
epochs = 10
history = model.fit(X_train_set,y_train,epochs=epochs, verbose = 2)

Epoch 1/10
191/191 - 2s - loss: 0.6521 - accuracy: 0.6186
Epoch 2/10
191/191 - 1s - loss: 0.5013 - accuracy: 0.8056
Epoch 3/10
191/191 - 0s - loss: 0.3815 - accuracy: 0.8544
Epoch 4/10
191/191 - 0s - loss: 0.3141 - accuracy: 0.8826
Epoch 5/10
191/191 - 0s - loss: 0.2601 - accuracy: 0.9074
Epoch 6/10
191/191 - 0s - loss: 0.2192 - accuracy: 0.9245
Epoch 7/10
191/191 - 0s - loss: 0.1828 - accuracy: 0.9394
Epoch 8/10
191/191 - 0s - loss: 0.1564 - accuracy: 0.9499
Epoch 9/10
191/191 - 0s - loss: 0.1337 - accuracy: 0.9576
Epoch 10/10
191/191 - 1s - loss: 0.1157 - accuracy: 0.9640


In [22]:
model.evaluate(X_test_set,y_test)



[0.5888469815254211, 0.7715036273002625]

In [23]:
y_pred = model.predict_classes(X_test_set)



In [24]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.82      0.81       882
           1       0.74      0.70      0.72       641

    accuracy                           0.77      1523
   macro avg       0.77      0.76      0.76      1523
weighted avg       0.77      0.77      0.77      1523



Problem with bag of words is that it doesnt take order into account.

# **GloVe with keras word embeddings**

In [25]:
#https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/ -->reference
#prepare the tokenizer
t=Tokenizer()
t.fit_on_texts(X_train.tolist())
vocab_size=len(t.word_index)+1

In [26]:
#load the whole embedding into memory
embeddings_index=dict()
f = open('/kaggle/input/glove-embeddings/glove.6B.100d.txt', mode='rt', encoding='utf-8')
for line in f:
    values=line.split()
    word=values[0]
    coefs=np.asarray(values[1:],dtype='float32')
    embeddings_index[word]=coefs
f.close()
    


In [27]:
#integer encode the words
from keras.preprocessing.sequence import pad_sequences
encoded_docs=t.texts_to_sequences(X_train.tolist())
maxlen=100
padded_docs=pad_sequences(encoded_docs,maxlen=maxlen,padding='post')


In [28]:
print(padded_docs)

[[ 4857   145     4 ...     0     0     0]
 [ 4860   491   223 ...     0     0     0]
 [ 4862  1079    56 ...     0     0     0]
 ...
 [ 3663    18    16 ...     0     0     0]
 [ 1713   455 12459 ...     0     0     0]
 [12460 12461   207 ...     0     0     0]]


In [29]:
emb_matrix=np.zeros((vocab_size,100))
for word,i in t.word_index.items():
    embed_vector=embeddings_index.get(word)
    if embed_vector is not None:
        emb_matrix[i]=embed_vector
    

In [30]:
#defining model
model2=tf.keras.Sequential()
model2.add(tf.keras.layers.Embedding(vocab_size,100,weights=[emb_matrix],input_length=100,trainable=False))
model2.add(tf.keras.layers.Flatten())
model2.add(tf.keras.layers.Dense(128,activation='relu'))
model2.add(tf.keras.layers.Dropout(0.5))
model2.add(tf.keras.layers.Dense(1,activation='sigmoid'))

In [31]:
model2.compile(optimizer='adam',
 loss='binary_crossentropy',
 metrics=['accuracy'])

In [32]:
model2.fit(padded_docs,y_train,epochs=50,verbose=2)

Epoch 1/50
191/191 - 1s - loss: 0.5104 - accuracy: 0.7652
Epoch 2/50
191/191 - 0s - loss: 0.4080 - accuracy: 0.8223
Epoch 3/50
191/191 - 0s - loss: 0.3538 - accuracy: 0.8484
Epoch 4/50
191/191 - 0s - loss: 0.3119 - accuracy: 0.8677
Epoch 5/50
191/191 - 0s - loss: 0.2674 - accuracy: 0.8923
Epoch 6/50
191/191 - 0s - loss: 0.2363 - accuracy: 0.9125
Epoch 7/50
191/191 - 0s - loss: 0.1996 - accuracy: 0.9271
Epoch 8/50
191/191 - 0s - loss: 0.1746 - accuracy: 0.9368
Epoch 9/50
191/191 - 0s - loss: 0.1535 - accuracy: 0.9470
Epoch 10/50
191/191 - 0s - loss: 0.1442 - accuracy: 0.9535
Epoch 11/50
191/191 - 0s - loss: 0.1303 - accuracy: 0.9542
Epoch 12/50
191/191 - 0s - loss: 0.1232 - accuracy: 0.9596
Epoch 13/50
191/191 - 1s - loss: 0.1200 - accuracy: 0.9614
Epoch 14/50
191/191 - 1s - loss: 0.1074 - accuracy: 0.9667
Epoch 15/50
191/191 - 0s - loss: 0.1103 - accuracy: 0.9670
Epoch 16/50
191/191 - 0s - loss: 0.0955 - accuracy: 0.9688
Epoch 17/50
191/191 - 0s - loss: 0.0955 - accuracy: 0.9703
Epoch 

<tensorflow.python.keras.callbacks.History at 0x7f7a505ed910>

In [33]:
encoded_docs_test = t.texts_to_sequences(X_test.tolist())
padded_docs_test = pad_sequences(encoded_docs_test,maxlen=maxlen, padding='post')

In [34]:
model2.evaluate(padded_docs_test,y_test)



[1.0441800355911255, 0.7767564058303833]

# **CNN with word embeddings**


In [35]:
#finding maximum length of tweets in training set
maxlength=max(len(s) for s in train.text)
print(maxlength)

138


In [36]:
#do the encoding 
encoded_docs_3=t.texts_to_sequences(X_train.tolist())
#padding
padded_docs_3=pad_sequences(encoded_docs_3,maxlen=maxlength,padding='post')
print(padded_docs)

[[ 4857   145     4 ...     0     0     0]
 [ 4860   491   223 ...     0     0     0]
 [ 4862  1079    56 ...     0     0     0]
 ...
 [ 3663    18    16 ...     0     0     0]
 [ 1713   455 12459 ...     0     0     0]
 [12460 12461   207 ...     0     0     0]]


In [37]:
model3=tf.keras.Sequential()
model3.add(tf.keras.layers.Embedding(vocab_size,100,input_length=maxlength))
model3.add(tf.keras.layers.Conv1D(filters=32,kernel_size=8,activation='relu'))
model3.add(tf.keras.layers.MaxPooling1D(pool_size=2))
model3.add(tf.keras.layers.Flatten())

model3.add(tf.keras.layers.Dense(10,activation='relu'))
model3.add(tf.keras.layers.Dropout(0.5))
model3.add(tf.keras.layers.Dense(1,activation='sigmoid'))


In [38]:
model3.compile(optimizer='adam',
 loss='binary_crossentropy',
 metrics=['accuracy'])

In [39]:
model3.fit(padded_docs_3,y_train,epochs=50,verbose=2)

Epoch 1/50
191/191 - 8s - loss: 0.6685 - accuracy: 0.5662
Epoch 2/50
191/191 - 2s - loss: 0.5748 - accuracy: 0.6946
Epoch 3/50
191/191 - 2s - loss: 0.4546 - accuracy: 0.7796
Epoch 4/50
191/191 - 2s - loss: 0.3487 - accuracy: 0.8005
Epoch 5/50
191/191 - 2s - loss: 0.2618 - accuracy: 0.8251
Epoch 6/50
191/191 - 2s - loss: 0.1987 - accuracy: 0.8982
Epoch 7/50
191/191 - 2s - loss: 0.1826 - accuracy: 0.9271
Epoch 8/50
191/191 - 2s - loss: 0.1658 - accuracy: 0.9286
Epoch 9/50
191/191 - 2s - loss: 0.1508 - accuracy: 0.9414
Epoch 10/50
191/191 - 2s - loss: 0.1257 - accuracy: 0.9578
Epoch 11/50
191/191 - 2s - loss: 0.1203 - accuracy: 0.9609
Epoch 12/50
191/191 - 2s - loss: 0.1133 - accuracy: 0.9578
Epoch 13/50
191/191 - 2s - loss: 0.1113 - accuracy: 0.9609
Epoch 14/50
191/191 - 2s - loss: 0.1062 - accuracy: 0.9614
Epoch 15/50
191/191 - 2s - loss: 0.1087 - accuracy: 0.9586
Epoch 16/50
191/191 - 3s - loss: 0.1105 - accuracy: 0.9573
Epoch 17/50
191/191 - 2s - loss: 0.1083 - accuracy: 0.9560
Epoch 

<tensorflow.python.keras.callbacks.History at 0x7f75f032f750>

In [40]:
encoded_docs_test_3 = t.texts_to_sequences(X_test.tolist())
padded_docs_test_3 = pad_sequences(encoded_docs_test,maxlen=maxlength, padding='post')

In [41]:
model3.evaluate(padded_docs_test_3,y_test)



[1.9582983255386353, 0.7649376392364502]

# **BERT**

In [42]:
#https://www.kaggle.com/abhinand05/bert-for-humans-tutorial-baseline-version-2
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py


In [43]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import tensorflow as tf

import tokenization

In [44]:
train3=pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

In [45]:
def bertencode(texts,tokenizer,maxlen=512):
    alltokens=[]
    masks=[]
    segments=[]
    for text in texts:
        text=tokenizer.tokenize(text)
        text=text[:maxlen-2]
        inpseq=["[CLS]"] +text+["[SEP]"]
        padlen=maxlen-len(inpseq)
        tokens = tokenizer.convert_tokens_to_ids(inpseq)
        tokens+=[0]*padlen
        padmasks=[1]*len(inpseq)+[0]*padlen
        segid=[0]*maxlen
        alltokens.append(tokens)
        masks.append(padmasks)
        segments.append(segid)
    return np.array(alltokens),np.array(masks),np.array(segments)
        
        

In [46]:
def buildmodel(bert_layer,maxlen=512):
    inputwordids=Input(shape=(maxlen,),dtype=tf.int32,name='input_word_ids')
    inputmask=Input(shape=(maxlen,),dtype=tf.int32,name='input_mask')
    segmentids=Input(shape=(maxlen,),dtype=tf.int32,name='segment_ids')
    _,seq_output=bert_layer([inputwordids,inputmask,segmentids])
    clf_output=seq_output[:,0,:]
    out=Dense(1,activation='sigmoid')(clf_output)
    model= Model(inputs=[inputwordids, inputmask, segmentids], outputs=out)
    model.compile(Adam(lr=2e-6),loss='binary_crossentropy',metrics=['accuracy'])
    return model
    
    

In [47]:
%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 23 s, sys: 4.69 s, total: 27.6 s
Wall time: 28.1 s


In [48]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

In [49]:

tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [50]:
train_input = bertencode(train3.text.values, tokenizer, maxlen=160)
#test_input = bert_encode(test.text.values, tokenizer, max_len=160)
train_labels = train3.target.values

In [51]:
model4= buildmodel(bert_layer, maxlen=160)

In [52]:
history=model4.fit(train_input, train_labels,validation_split=0.2,epochs=3,batch_size=16)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [53]:
model4.evaluate(train_input,train_labels)



[0.23269380629062653, 0.912912130355835]