# Predict which Tweets are about real disasters and which ones are not

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_tweets = pd.read_csv('train.csv')
# Shuffle all rows once. The fixed seed keeps the row order identical on every
# Restart & Run All; without it the later train/test split (which is seeded but
# depends on the incoming row order) and the previews below change on each run.
train_tweets = train_tweets.sample(frac = 1, random_state = 42)
train_tweets.head()

Unnamed: 0,id,keyword,location,text,target
2808,4039,disaster,"Alexandria, VA",Four Technologies That Could Let Humans Surviv...,0
7142,10234,volcano,"Ted&Qz Inc, Ireland, Europe",@songhey89 well I'm also gay but girls like so...,0
5030,7173,mudslide,plymouth,@brobread looks like mudslide????,1
1001,1453,body%20bagging,,I'm not a Drake fan but I enjoy seeing him bod...,0
2601,3735,destroyed,"Boise, Idaho",70 years after #ABomb destroyd #HiroshimaÛÓ#B...,1


In [3]:
from keras.preprocessing.text import text_to_word_sequence
from nltk.corpus import stopwords
# English stop-word list from NLTK, held as a set; read as a global by the tweet-cleaning functions.
stop_words = set(stopwords.words('english'))
def train_tweets_cleaning(doc):
    """Add `clean_text` and `review_length` columns to the global `train_tweets`.

    Each entry of `doc` is tokenised with `text_to_word_sequence`; tokens that are
    in the global `stop_words`, are a single character long, or are not purely
    alphabetic are dropped, and the survivors are joined with single spaces.
    `review_length` is the number of tokens left in `clean_text`.

    Parameters:
        doc: Series of raw tweet strings (anything exposing `.apply`).

    Returns:
        None. The result is written into `train_tweets` as a side effect.
    """
    def _keep(token):
        # A token survives only if it passes all three filters.
        return token not in stop_words and len(token) > 1 and token.isalpha()

    def _normalise(raw_text):
        # Join-then-split reproduces the whitespace tokenisation that each
        # str.split() based stage performs on the joined text.
        tokens = ' '.join(text_to_word_sequence(raw_text)).split()
        return ' '.join(token for token in tokens if _keep(token))

    train_tweets['clean_text'] = doc.apply(_normalise)
    train_tweets['review_length'] = train_tweets['clean_text'].apply(lambda cleaned : len(cleaned.split()))

# Clean the raw tweet text; the function writes the new columns into train_tweets itself.
train_tweets_cleaning(train_tweets['text'])
train_tweets.head()

Using TensorFlow backend.


Unnamed: 0,id,keyword,location,text,target,clean_text,review_length
2808,4039,disaster,"Alexandria, VA",Four Technologies That Could Let Humans Surviv...,0,four technologies could let humans survive env...,10
7142,10234,volcano,"Ted&Qz Inc, Ireland, Europe",@songhey89 well I'm also gay but girls like so...,0,well also gay girls like predict tsunami amp v...,14
5030,7173,mudslide,plymouth,@brobread looks like mudslide????,1,brobread looks like mudslide,4
1001,1453,body%20bagging,,I'm not a Drake fan but I enjoy seeing him bod...,0,drake fan enjoy seeing body bagging people gre...,10
2601,3735,destroyed,"Boise, Idaho",70 years after #ABomb destroyd #HiroshimaÛÓ#B...,1,years abomb destroyd bbc looks wht survived ht...,17


In [4]:
# Model input is the cleaned tweet text; the label is the `target` column.
X_independent = train_tweets['clean_text']
Y_target = train_tweets['target']

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; random_state fixes the split for a given input row order.
X_train, X_test, y_train, y_test = train_test_split(X_independent, Y_target, test_size=0.33, random_state=42)

In [6]:
# Largest token count of any cleaned tweet in train_tweets (all rows, not only X_train);
# used later as the padding length for every encoded sequence.
length = max(train_tweets['review_length'])
print(length)

23


In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def create_tokenizer(X_train_lines):
    """Return a new `Tokenizer` that has been fitted on `X_train_lines`.

    Parameters:
        X_train_lines: the texts passed to `Tokenizer.fit_on_texts`.

    Returns:
        The fitted tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(X_train_lines)
    return fitted
# encode a list of lines
def encode_text(tokenizer,X_train_lines,length):
    """Integer-encode texts with `tokenizer` and pad the sequences to `length`.

    Parameters:
        tokenizer: object providing `texts_to_sequences`.
        X_train_lines: the texts to encode.
        length: value passed to `pad_sequences` as `maxlen`.

    Returns:
        The result of `pad_sequences(..., maxlen=length, padding='post')`.
    """
    sequences = tokenizer.texts_to_sequences(X_train_lines)
    # 'post' places the padding after the tokens rather than before them.
    return pad_sequences(sequences, maxlen = length, padding = 'post')

# Fit the vocabulary on the training split only, then encode and pad that same split.
tokenizer = create_tokenizer(X_train)
X_train_padded = encode_text(tokenizer,X_train,length)
print(X_train_padded)

[[ 2308  4739  2309 ...     0     0     0]
 [  580   144   161 ...     0     0     0]
 [ 4741    86   187 ...     0     0     0]
 ...
 [  178    52 12585 ...     0     0     0]
 [  922   394   364 ...     0     0     0]
 [12588  4717  1225 ...     0     0     0]]


In [12]:
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

# define the model
def define_model(length, vocab_size, kernel_sizes = (4, 4, 4), embedding_dim = 100, filters = 32, dropout_rate = 0.5):
    """Build, compile and summarise a multi-channel 1-D CNN for binary classification.

    One channel is created per entry of `kernel_sizes`. Every channel has its own
    input of `length` token ids and applies
    Embedding -> Conv1D (relu) -> Dropout -> MaxPooling1D(pool_size=2) -> Flatten.
    The flattened channels are concatenated and fed through Dense(10, relu) and a
    single sigmoid output unit.

    Parameters:
        length: number of token ids per input sequence.
        vocab_size: input dimension of every Embedding layer.
        kernel_sizes: Conv1D kernel size of each channel. The default (4, 4, 4)
            builds three architecturally identical channels; pass distinct
            values, e.g. (4, 6, 8), to give each channel a different window width.
        embedding_dim: output dimension of every Embedding layer.
        filters: number of Conv1D filters per channel.
        dropout_rate: rate of the Dropout layer in each channel.

    Returns:
        The compiled Model (binary cross-entropy loss, adam optimizer, accuracy
        metric). It expects a list with one input array per channel. The model
        summary is printed as a side effect.

    Raises:
        ValueError: if `kernel_sizes` is empty.
    """
    kernel_sizes = tuple(kernel_sizes)
    if not kernel_sizes:
        raise ValueError('kernel_sizes must contain at least one kernel size')

    channel_inputs = []
    channel_features = []
    for kernel_size in kernel_sizes:
        # one channel: embed -> convolve -> regularise -> downsample -> flatten
        channel_input = Input(shape = (length,))
        embedded = Embedding(vocab_size, embedding_dim)(channel_input)
        convolved = Conv1D(filters = filters, kernel_size = kernel_size, activation = 'relu')(embedded)
        dropped = Dropout(dropout_rate)(convolved)
        pooled = MaxPooling1D(pool_size = 2)(dropped)
        channel_inputs.append(channel_input)
        channel_features.append(Flatten()(pooled))

    # merge (a single channel is used as-is; there is nothing to concatenate)
    if len(channel_features) == 1:
        merged = channel_features[0]
    else:
        merged = concatenate(channel_features)

    # interpretation
    dense1 = Dense(10, activation = 'relu')(merged)
    outputs = Dense(1, activation = 'sigmoid')(dense1)
    model = Model(inputs = channel_inputs, outputs = outputs)
    # compile
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    # summarize
    model.summary()
    return model

# +1 presumably reserves index 0 for padding (the padded arrays are zero-filled) — TODO confirm
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# define model
model = define_model(length,vocab_size)

# fit model: the same padded array is passed once per model input (three inputs)
model.fit([X_train_padded,X_train_padded,X_train_padded],y_train, epochs = 3, verbose = 2)

Vocabulary Size: 12589
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 23)           0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 23)           0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            (None, 23)           0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 23, 100)      1258900     input_7[0][0]                    
_____________________________________________________________________

<keras.callbacks.callbacks.History at 0x1953fc1adc8>

In [13]:
# evaluate the model on the training split and on the held-out split
# (the held-out texts are encoded with the tokenizer fitted on the training split)
X_test_padded = encode_text(tokenizer,X_test,length)

# evaluate returns two values here; the first is discarded and the second is reported as accuracy
_,acc = model.evaluate([X_train_padded,X_train_padded,X_train_padded],y_train,verbose = 0)
print('Train Accuracy: %.2f' %(acc*100))
_,acc = model.evaluate([X_test_padded,X_test_padded,X_test_padded],y_test,verbose = 0)
print('Test Accuracy: %.2f' %(acc*100))

Train Accuracy: 98.27
Test Accuracy: 78.39


# Predictions on Test Data

In [14]:
# Load the tweets to be labelled; the preview below shows no `target` column.
test_tweets = pd.read_csv('test.csv')
test_tweets.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [15]:
def test_tweets_cleaning(doc_test_data):
    test_tweets['clean_text'] = doc_test_data.apply(lambda x : ' '.join(text_to_word_sequence(x)))
    test_tweets['clean_text'] = test_tweets['clean_text'].apply(lambda y : ' '.join([y for y in y.split() if not y in stop_words]))
    test_tweets['clean_text'] = test_tweets['clean_text'].apply(lambda z : ' '.join([z for z in z.split() if len(z) > 1]))
    test_tweets['clean_text'] = test_tweets['clean_text'].apply(lambda w : ' '.join([w for w in w.split() if w.isalpha()]))
    test_tweets['review_length'] = test_tweets['clean_text'].apply(lambda v : len([v for v in v.split()]))

# Clean the raw test tweets; the function writes the new columns into test_tweets itself.
test_tweets_cleaning(test_tweets['text'])
test_tweets.head()

Unnamed: 0,id,keyword,location,text,clean_text,review_length
0,0,,,Just happened a terrible car crash,happened terrible car crash,4
1,2,,,"Heard about #earthquake is different cities, s...",heard earthquake different cities stay safe ev...,7
2,3,,,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese fleeing across str...,10
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfires,4
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills china taiwan,5


In [16]:
# Encode with the tokenizer fitted on the training split and pad to the same length used for training.
test_padded = encode_text(tokenizer,test_tweets['clean_text'],length)
print(test_padded)

[[ 744 2975   53 ...    0    0    0]
 [ 235  125 1027 ...    0    0    0]
 [  85    6  660 ...    0    0    0]
 ...
 [ 909  590  203 ...    0    0    0]
 [6654  310  229 ...    0    0    0]
 [4129 1699 1072 ...    0    0    0]]


In [17]:
# The same padded array is supplied once per model input.
yhat = model.predict([test_padded,test_padded,test_padded],verbose = 0)
# look at the model output for the first test tweet only (row 0, column 0)
percent_pos = yhat[0,0]
print(percent_pos)

0.7139897


In [18]:
# Display the raw model outputs, one row per test tweet.
yhat

array([[0.7139897 ],
       [0.92849684],
       [0.9702349 ],
       ...,
       [0.75614667],
       [0.907746  ],
       [0.8372651 ]], dtype=float32)

In [19]:
# Round every predicted value in column 0 of yhat to a 0.0 / 1.0 class label in a
# single vectorised call. This replaces the per-row Python loop and no longer
# overwrites `percent_pos` (previously a probability) with a rounded label.
labels = list(np.round(yhat[:, 0]))

In [23]:
# Attach the rounded predictions as `target` and store them as integers.
# NOTE(review): the preview below also shows a `label` column that no visible cell
# creates — looks like leftover state from cells no longer in the notebook (the
# execution counts jump from 19 to 23); confirm with a fresh Restart & Run All.
test_tweets['target'] = labels
test_tweets['target'] = test_tweets['target'].astype('int')
test_tweets.head()

Unnamed: 0,id,keyword,location,text,clean_text,review_length,label,target
0,0,,,Just happened a terrible car crash,happened terrible car crash,4,1,1
1,2,,,"Heard about #earthquake is different cities, s...",heard earthquake different cities stay safe ev...,7,1,1
2,3,,,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese fleeing across str...,10,1,1
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfires,4,0,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills china taiwan,5,1,1


In [24]:
# Keep only the tweet id and the predicted label for the submission.
sample_submission = test_tweets[['id','target']]
sample_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


In [25]:
# Write the submission file without the DataFrame index column.
sample_submission.to_csv('submission.csv',index = False)