In [1]:
import numpy as np
import pandas as pd
import os
from sklearn import preprocessing
from keras.layers import Input, Dense, Embedding, LSTM, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
pd.set_option('display.max_colwidth', -1)

Using TensorFlow backend.
  # Remove the CWD from sys.path while we load stuff.


In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.shape, test.shape

((7613, 5), (3263, 4))

In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [4]:
embedded_size = 100
max_features = 10000
maxlen = 100

In [5]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(train,test_size = 0.1 , random_state = 43)
train_X = train_df.text.values
test_X = test.text.values
val_X = val_df.text.values

In [6]:
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

In [7]:
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

In [8]:
train_y = train_df.target.values
val_y = val_df.target.values

In [9]:
from keras.models import Model

inp = Input(shape = (maxlen,))
x = Embedding(max_features,embedded_size)(inp)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16,activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(1,activation = 'sigmoid')(x)
model=Model(inputs = inp,outputs = x)
model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])

print(model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          84480     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17  

In [10]:
model.fit(train_X, train_y, batch_size=512, epochs=10, validation_data=(val_X, val_y))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6851 samples, validate on 762 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1fac104a088>

In [11]:
preds = model.predict([test_X],batch_size = 1024,verbose = 1)
predictions = (preds > 0.5).astype(int)
predictions = np.ndarray.flatten(predictions)



In [12]:
target = pd.read_csv('socialmedia-disaster-tweets.csv')
target = target[['choose_one', 'text']]
target['target'] = (target['choose_one']=='Relevant').astype(int)
target['id'] = target.index
merged_df = pd.merge(test, target, on='id')

In [13]:
real = []
text = []
pred = []
for i in range(10):
  rand= np.random.randint(0,len(preds))
  real.append(merged_df['target'][rand])
  text.append(test['text'][rand])
  pred.append(preds[rand] * 100)

dataset = pd.DataFrame({'Real':real, 'Pred': pred, 'Text':text})
dataset.head(10)

Unnamed: 0,Real,Pred,Text
0,1,[99.3734],Property losses from northern CA wildfire nearly double http://t.co/fHqx7FiIVJ If you pray please remember CA people/forests/wildlife/water
1,1,[88.29204],[10:45:27JST Aug06:First Alert] M4.1 at 'E off Chiba pref.' under 10km (35.8140.8). Estimated max seismic# is 3
2,1,[95.834206],#CityofCalgary has activated its Municipal Emergency Plan. #yycstorm
3,0,[7.595764],@FlameCored colliding with her projectile as a cloud of smoke engulfed the area. Not a moment later Shadow propelled himself through it --
4,1,[0.31027588],Here's a quick timelapse I made of the Finger Rock Fire last night from about 9PM - 1AM. Check it out! #fingerrockfire #wildfire #catalinas
5,1,[21.13433],The South Korean army wounded 44 persons' Takeshima's Japanese fisherman and occupies the island. \nhttp://t.co/mJCXgKU8Yt
6,0,[1.2134051],Click Share in minute info-technews The Trouble With RedditÛªs Content Policy Update | Re/code Û_ http://t.co/wS0xohNb7v
7,0,[94.12823],@lizXy_ IMAGINE IF AN EARTHQUAKE HAPPENED
8,1,[99.26201],Wolverine Fire Update - Thursday August 6 - 9:00 Am\n\nIncident: Wolverine Fire Wildfire\nReleased: 41 min. ago... http://t.co/8WDTTzpTXH
9,1,[2.4647276],#BreakingNews Mva / Multiple Injuries - Saint Petersburg FL: E7 requesting R7 &amp; 2 sunstar units for injuriesÛ_ http://t.co/0XRysEpQhL


In [14]:
from sklearn.metrics import accuracy_score
acc = merged_df['target'].values.tolist()
acc = accuracy_score(acc, np.round(preds))*100
print(acc)

77.75053631627337
