In [2]:
import re
import pandas as pd 
pd.set_option("display.max_colwidth", 200)
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [3]:
train  = pd.read_csv("project_train.csv")
test = pd.read_csv("project_test.csv")

In [4]:
train.shape , test.shape

((7613, 5), (3263, 4))

In [5]:
train.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1


In [6]:
test.head(3)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"


In [7]:
train["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [8]:
# Combined Training and Testing Dataset
combi = train.append(test, ignore_index=True)
combi.shape

(10876, 5)

In [9]:
combi.tail(3)

Unnamed: 0,id,keyword,location,text,target
10873,10868,,,Green Line derailment in Chicago http://t.co/UtbXLcBIuY,
10874,10874,,,MEG issues Hazardous Weather Outlook (HWO) http://t.co/3X6RBQJHn3,
10875,10875,,,#CityofCalgary has activated its Municipal Emergency Plan. #yycstorm,


In [10]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt

In [11]:
combi['tidy_tweet'] = np.vectorize(remove_pattern)(combi['text'], "@[\w]*") 
combi.head(50)

Unnamed: 0,id,keyword,location,text,target,tidy_tweet
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1.0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
1,4,,,Forest fire near La Ronge Sask. Canada,1.0,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1.0,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1.0,"13,000 people receive #wildfires evacuation orders in California"
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1.0,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,1.0,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1.0,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas"
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1.0,I'm on top of the hill and I can see a fire in the woods...
8,14,,,There's an emergency evacuation happening now in the building across the street,1.0,There's an emergency evacuation happening now in the building across the street
9,15,,,I'm afraid that the tornado is coming to our area...,1.0,I'm afraid that the tornado is coming to our area...


In [12]:
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")
combi.head(10)

Unnamed: 0,id,keyword,location,text,target,tidy_tweet
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1.0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
1,4,,,Forest fire near La Ronge Sask. Canada,1.0,Forest fire near La Ronge Sask Canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1.0,All residents asked to shelter in place are being notified by officers No other evacuation or shelter in place orders are expected
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1.0,people receive #wildfires evacuation orders in California
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1.0,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,1.0,#RockyFire Update California Hwy closed in both directions due to Lake County fire #CAfire #wildfires
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1.0,#flood #disaster Heavy rain causes flash flooding of streets in Manitou Colorado Springs areas
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1.0,I m on top of the hill and I can see a fire in the woods
8,14,,,There's an emergency evacuation happening now in the building across the street,1.0,There s an emergency evacuation happening now in the building across the street
9,15,,,I'm afraid that the tornado is coming to our area...,1.0,I m afraid that the tornado is coming to our area


In [13]:
combi['tidy_tweet'] = combi['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [14]:
combi.head()

Unnamed: 0,id,keyword,location,text,target,tidy_tweet
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1.0,Deeds Reason this #earthquake ALLAH Forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1.0,Forest fire near Ronge Sask Canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1.0,residents asked shelter place being notified officers other evacuation shelter place orders expected
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1.0,people receive #wildfires evacuation orders California
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1.0,Just sent this photo from Ruby #Alaska smoke from #wildfires pours into school


In [15]:
tokenized_tweet = combi['tidy_tweet'].apply(lambda x: x.split()) # tokenizing

In [16]:
tokenized_tweet.head(10)

0                                                                    [Deeds, Reason, this, #earthquake, ALLAH, Forgive]
1                                                                             [Forest, fire, near, Ronge, Sask, Canada]
2    [residents, asked, shelter, place, being, notified, officers, other, evacuation, shelter, place, orders, expected]
3                                                         [people, receive, #wildfires, evacuation, orders, California]
4                          [Just, sent, this, photo, from, Ruby, #Alaska, smoke, from, #wildfires, pours, into, school]
5                   [#RockyFire, Update, California, closed, both, directions, Lake, County, fire, #CAfire, #wildfires]
6                 [#flood, #disaster, Heavy, rain, causes, flash, flooding, streets, Manitou, Colorado, Springs, areas]
7                                                                                                   [hill, fire, woods]
8                                       

In [17]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming

In [18]:
stemmer.stem('turning')

'turn'

In [19]:
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
    
combi['tidy_tweet'] = tokenized_tweet

In [20]:
combi['tidy_tweet'].head(10)

0                                           deed reason thi #earthquak allah forgiv
1                                                 forest fire near rong sask canada
2    resid ask shelter place be notifi offic other evacu shelter place order expect
3                                      peopl receiv #wildfir evacu order california
4        just sent thi photo from rubi #alaska smoke from #wildfir pour into school
5     #rockyfir updat california close both direct lake counti fire #cafir #wildfir
6      #flood #disast heavi rain caus flash flood street manit colorado spring area
7                                                                    hill fire wood
8                                      there emerg evacu happen build across street
9                                                     afraid that tornado come area
Name: tidy_tweet, dtype: object

In [21]:
from gensim.models import Word2Vec

In [22]:
tokenized_tweet = combi['tidy_tweet'].apply(lambda x: x.split()) # tokenizing

model_w2v = Word2Vec(tokenized_tweet, size=200,  window=5)

model_w2v.train(tokenized_tweet, total_examples= len(combi['tidy_tweet']), epochs=20)

(1508509, 2136220)

In [23]:
model_w2v.wv.most_similar(positive="fire")

[('grove', 0.7859987020492554),
 ('apart', 0.7441520690917969),
 ('hinton', 0.7425417900085449),
 ('bush', 0.7374840974807739),
 ('contain', 0.7318223714828491),
 ('alarm', 0.7290016412734985),
 ('complex', 0.7261528968811035),
 ('close', 0.7253020405769348),
 ('forest', 0.7159005999565125),
 ('ridg', 0.7118695974349976)]

In [24]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size))
            count += 1.
        except KeyError: # handling the case where the token is not in vocabulary
                         
            continue
    if count != 0:
        vec /= count
    return vec

In [25]:
wordvec_arrays = np.zeros((len(tokenized_tweet), 200))

for i in range(len(tokenized_tweet)):
    wordvec_arrays[i,:] = word_vector(tokenized_tweet[i], 200)
    
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

(10876, 200)

In [26]:
from sklearn.model_selection import train_test_split


train_w2v = wordvec_df.iloc[:7613,:]
test_w2v = wordvec_df.iloc[7613:,:]

_, _, ytrain, yvalid = train_test_split(train_w2v, train['target'],  random_state=42, test_size=0.3)



print(train_w2v.shape, train['target'].shape)

xtrain_w2v = train_w2v.iloc[ytrain.index,:]
xvalid_w2v = train_w2v.iloc[yvalid.index,:]

(7613, 200) (7613,)


In [27]:
import tensorflow as tf
import keras.layers as layers
from keras.models import Model

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Input,Embedding,Dense,Flatten
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import f1_score

epochs = 25
batch_size = 1024
loss = "binary_crossentropy"
optimizer = "adam"
metrics = ["accuracy"]

from keras import models

callbacks = [EarlyStopping(monitor='val_loss', patience=2),
            ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

model = models.Sequential()
model.add(Dense(512, activation='relu', input_shape=(200,)))
model.add(Dense(512, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss=loss,optimizer=optimizer,metrics= metrics)
model.fit(xtrain_w2v,ytrain,epochs=epochs,batch_size=batch_size,callbacks=callbacks,validation_data=(xvalid_w2v,yvalid))


predictions = model.predict(xvalid_w2v)
predictions = [0 if i<0.5 else 1 for i in predictions]

f1_score(yvalid, predictions)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25


0.726457399103139

In [28]:
test_pred = model.predict(test_w2v)
test_pred = [0 if i<0.5 else 1 for i in test_pred]

test['target'] = test_pred
submission = test[['id','target']]
submission.to_csv('submission6.csv', index=False)

In [29]:
submission

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


# ACCURACY = 0.78179