# Import librairies

In [2]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import SimpleRNN,GRU,LSTM

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import re
#from sense2vec import Sense2Vec

# Exploring Data

- **Train Dataset**

In [3]:
dataset_train = pd.read_csv('train.csv')

In [4]:
dataset_train.describe(include='all')

Unnamed: 0,id,keyword,location,text,target
count,7613.0,7552,5080,7613,7613.0
unique,,221,3341,7503,
top,,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...,
freq,,45,104,10,
mean,5441.934848,,,,0.42966
std,3137.11609,,,,0.49506
min,1.0,,,,0.0
25%,2734.0,,,,0.0
50%,5408.0,,,,0.0
75%,8146.0,,,,1.0


Let's drop *id*

In [5]:
dataset_train=dataset_train.drop(columns='id')

- **Test dataset**

In [35]:
sample_submission=pd.read_csv("sample_submission.csv")

In [7]:
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [8]:
test_db1=pd.read_csv("test.csv")

In [9]:
test_db1.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Preprocessing train data

In [10]:
train_db=dataset_train

In [11]:
nlp = spacy.load("en_core_web_lg")

In [12]:
 # Remove all non alphanumeric characters except whitespaces
train_db["text_clean"] = train_db["text"].apply(lambda x:''.join(ch for ch in x if ch.isalnum() or ch==" "))

# remove double spaces and spaces at the beginning and end of strings
train_db["text_clean"] = train_db["text_clean"].apply(lambda x: x.replace(" +"," ").lower().strip())

# remove stop words and replace everyword with their lemma
train_db["text_clean"] = train_db["text_clean"].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x))]) #if (token.lemma_ not in STOP_WORDS) & (token.text not in STOP_WORDS)])

# Extracting hashtags
pat=re.compile(r"#(\w+)")
train_db['hashtags']=train_db['text'].apply(lambda x : pat.findall(x))

# Stadardization of keywords
train_db.keyword=train_db.keyword.apply(lambda x : ['nan'] if (isinstance(x, float) or pd.isna(x)) else [x])

#Creating a sequence with (<start> sentence <eos> keyword <htg> hashtags)
train_db['text_clean']=train_db['text_clean'].apply(lambda x : ["<start>"]+x+["<eos>"])
train_db['text_clean']=train_db.apply(lambda x: x['text_clean']+x['keyword']+["<htg>"]+x['hashtags'], axis =1)

# Delete all words containing a digit and less than 2 letters
train_db['text_clean']=train_db['text_clean'].apply(lambda x : [word for word in x if (not any(i.isdigit() for i in word) and len(word)>2)])

train_db.head()

Unnamed: 0,keyword,location,text,target,text_clean,hashtags
0,[nan],,Our Deeds are the Reason of this #earthquake M...,1,"[<start>, our, deed, the, reason, this, earthq...",[earthquake]
1,[nan],,Forest fire near La Ronge Sask. Canada,1,"[<start>, forest, fire, near, ronge, sask, can...",[]
2,[nan],,All residents asked to 'shelter in place' are ...,1,"[<start>, all, resident, ask, shelter, place, ...",[]
3,[nan],,"13,000 people receive #wildfires evacuation or...",1,"[<start>, people, receive, wildfire, evacuatio...",[wildfires]
4,[nan],,Just got sent this photo from Ruby #Alaska as ...,1,"[<start>, just, got, send, this, photo, from, ...","[Alaska, wildfires]"


In [13]:
train_db.loc[train_db['hashtags'].apply(lambda x : 'wildfires' in x)]

Unnamed: 0,keyword,location,text,target,text_clean,hashtags
3,[nan],,"13,000 people receive #wildfires evacuation or...",1,"[<start>, people, receive, wildfire, evacuatio...",[wildfires]
4,[nan],,Just got sent this photo from Ruby #Alaska as ...,1,"[<start>, just, got, send, this, photo, from, ...","[Alaska, wildfires]"
5,[nan],,#RockyFire Update => California Hwy. 20 closed...,1,"[<start>, rockyfire, update, california, hwy, ...","[RockyFire, CAfire, wildfires]"
3371,[evacuation],"Bend, Oregon",Evacuation Advisory for Swayback Ridge Area..v...,1,"[<start>, evacuation, advisory, for, swayback,...","[wildfires, calfires]"
5572,[rainstorm],"North Vancouver, BC",Yay I can feel the wind gearing up for a rains...,1,"[<start>, yay, can, feel, the, wind, gear, for...","[Vancouver, drought, deadgrassandflowers, wild..."


In [62]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=2500) # instanciate the tokenizer
# num_words indicates the number of words to keep in the tokenization
# keeps only the most common words

tokenizer.fit_on_texts(train_db.text_clean) # fit the tokenizer on the texts
# in this step the tokenizer will list all unique tokens in the text
# and associate them with a specific integer.

# This step will effectively transform the texts into sequences of indices
train_db["text_encoded"] = tokenizer.texts_to_sequences(train_db.text_clean)



#tokenizer_htg = tf.keras.preprocessing.text.Tokenizer(num_words=2000)
#tokenizer_htg.fit_on_texts(train_db.hashtags)# fit the tokenizer on the texts
#train_db["hashtags_encoded"] = tokenizer_htg.texts_to_sequences(train_db.hashtags)

# Sometimes the preprocessing removes all the words in a string (because they contain
# only stopwords for example) so we calculate the length in order to filter out
# those records
train_db["len_text"] = train_db["text_encoded"].apply(lambda x: len(x))
train_db = train_db[train_db["len_text"]!=0]

In [63]:
tokenizer.texts_to_sequences(['wildfires'])

[[2121]]

In [64]:
train_db

Unnamed: 0,keyword,location,text,target,text_clean,hashtags,text_encoded,len_text
0,[nan],,Our Deeds are the Reason of this #earthquake M...,1,"[<start>, our, deed, the, reason, this, earthq...",[earthquake],"[1, 129, 4, 592, 12, 104, 150, 1145, 2120, 25,...",14
1,[nan],,Forest fire near La Ronge Sask. Canada,1,"[<start>, forest, fire, near, ronge, sask, can...",[],"[1, 235, 14, 306, 1146, 2, 267, 3]",8
2,[nan],,All residents asked to 'shelter in place' are ...,1,"[<start>, all, resident, ask, shelter, place, ...",[],"[1, 25, 1550, 623, 1886, 500, 465, 307, 147, 1...",16
3,[nan],,"13,000 people receive #wildfires evacuation or...",1,"[<start>, people, receive, wildfire, evacuatio...",[wildfires],"[1, 37, 2478, 89, 147, 466, 86, 2, 267, 3, 2121]",11
4,[nan],,Just got sent this photo from Ruby #Alaska as ...,1,"[<start>, just, got, send, this, photo, from, ...","[Alaska, wildfires]","[1, 18, 354, 339, 12, 236, 13, 1551, 155, 13, ...",19
...,...,...,...,...,...,...,...,...
7608,[nan],,Two giant cranes holding a bridge collapse int...,1,"[<start>, two, giant, crane, hold, bridge, col...",[],"[1, 131, 761, 1119, 490, 411, 59, 52, 665, 69,...",13
7609,[nan],,@aria_ahrary @TheTawniest The out of control w...,1,"[<start>, ariaahrary, thetawniest, the, out, c...",[],"[1, 4, 24, 729, 376, 14, 86, 216, 4, 261, 440,...",17
7610,[nan],,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,"[<start>, volcano, hawaii, <eos>, nan, <htg>]",[],"[1, 278, 1421, 2, 267, 3]",6
7611,[nan],,Police investigating after an e-bike collided ...,1,"[<start>, police, investigate, after, ebike, c...",[],"[1, 49, 905, 28, 110, 11, 105, 375, 1710, 1318...",16


In [65]:
train_text_pad_pre = tf.keras.preprocessing.sequence.pad_sequences(train_db.text_encoded, padding="pre")
#train_htg_pad_pre = tf.keras.preprocessing.sequence.pad_sequences(train_db.hashtags_encoded, padding="pre")

In [66]:
text_len=train_text_pad_pre.shape[1]

In [67]:
#htg_len=train_htg_pad_pre.shape[1]

In [68]:
#train_ds = tf.data.Dataset.from_tensor_slices((np.concatenate((train_text_pad_pre, train_htg_pad_pre), axis=1), train_db.target))
train_ds = tf.data.Dataset.from_tensor_slices((train_text_pad_pre, train_db.target))

In [69]:
train_ds = tf.data.Dataset.from_tensor_slices((train_text_pad_pre, train_db.target))

# Train / Test / Val distribution with the same ratio of target value

nb_class = 2
TEST_SIZE = 0
VAL_SIZE = 0.3

for i in range(nb_class):
    classe_i_dataset = train_ds.filter(lambda x,y: y ==i)
    
    DATA_SIZE = len(list(classe_i_dataset))

    classe_i_dataset = classe_i_dataset.shuffle(DATA_SIZE)
    class_i_sample_len = int(DATA_SIZE * (1-TEST_SIZE))
    classe_i_train = classe_i_dataset.take(class_i_sample_len)
    
    classe_i_test = classe_i_dataset.skip(class_i_sample_len)
    
    class_i_val_len = int(DATA_SIZE *VAL_SIZE)
    classe_i_val=classe_i_train.take(class_i_val_len)


    classe_i_train=classe_i_train.skip(class_i_val_len)


    if i ==0 :
        train_dataset=classe_i_train
        test_dataset = classe_i_test
        val_dataset = classe_i_val
    else :
        train_dataset=train_dataset.concatenate(classe_i_train)
        test_dataset=test_dataset.concatenate(classe_i_test)
        val_dataset=val_dataset.concatenate(classe_i_val)

train_ds = train_dataset.shuffle(len(list(train_dataset))).batch(64)

In [70]:
print('Train dataset size = ', len(list(train_dataset)))
print('Test dataset size = ', len(list(test_dataset)))
print('val dataset size = ', len(list(val_dataset)))

Train dataset size =  5330
Test dataset size =  0
val dataset size =  2283


In [71]:
train_ds = train_dataset.shuffle(len(list(train_dataset))).batch(64)
val_ds = val_dataset.shuffle(len(list(val_dataset))).batch(64)

In [73]:
texts,scores = next(iter(train_ds))

In [79]:
embedding_dim=768 # the dimensionality of the representation space
vocab_size=2500

model = Sequential([
  Embedding(vocab_size, embedding_dim, name="embedding",input_length=len(texts[1])), # the embedding layer
  # the input dim needs to be equal to the size of the vocabulary
  LSTM(units=768, return_sequences=True), # maintains the sequential nature
  LSTM(units=256, return_sequences=True), # maintains the sequential nature
  LSTM(units=64, return_sequences=True), # maintains the sequential nature
  LSTM(units=32, return_sequences=False), # returns the last output
  Dense(16, activation='relu'), # a dense layer
  Dense(8, activation='relu'), # a dense layer
  Dense(2, activation="softmax") # the prediction layer
])

In [80]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 27, 256)           640000    
                                                                 
 lstm_6 (LSTM)               (None, 27, 256)           525312    
                                                                 
 lstm_7 (LSTM)               (None, 27, 128)           197120    
                                                                 
 lstm_8 (LSTM)               (None, 27, 64)            49408     
                                                                 
 lstm_9 (LSTM)               (None, 32)                12416     
                                                                 
 dense_6 (Dense)             (None, 16)                528       
                                                                 
 dense_7 (Dense)             (None, 8)                

In [81]:
# Let's create a learning rate schedule to decrease the learning rate as we train the model. 
initial_learning_rate = 0.001

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.97,
    staircase=True)

In [82]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = lr_schedule),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

In [86]:
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x2951e661b50>

In [87]:
def preprocessing(dataset):

    # Remove all non alphanumeric characters except whitespaces
    dataset["text_clean"] = dataset["text"].apply(lambda x:''.join(ch for ch in x if ch.isalnum() or ch==" "))

    # remove double spaces and spaces at the beginning and end of strings
    dataset["text_clean"] = dataset["text_clean"].apply(lambda x: x.replace(" +"," ").lower().strip())

    # remove stop words and replace everyword with their lemma
    dataset["text_clean"] = dataset["text_clean"].apply(lambda x: [token.lemma_ for token in nlp(x)]) #if (token.lemma_ not in STOP_WORDS) & (token.text not in STOP_WORDS)])

    # Extracting hashtags
    pat=re.compile(r"#(\w+)")
    dataset['hashtags']=dataset['text'].apply(lambda x : pat.findall(x))

    # Stadardization of keywords
    dataset.keyword=dataset.keyword.apply(lambda x : ['nan'] if (isinstance(x, float) or pd.isna(x)) else [x])

    #Creating a sequence with (<start> sentence <eos> keyword <htg> hashtags)
    dataset['text_clean']=dataset['text_clean'].apply(lambda x : ["<start>"]+x+["<eos>"])
    dataset['text_clean']=dataset.apply(lambda x: x['text_clean']+x['keyword']+["<htg>"]+x['hashtags'], axis =1)

    # Delete all words containing a digit and less than 2 letters
    dataset['text_clean']=dataset['text_clean'].apply(lambda x : [word for word in x if (not any(i.isdigit() for i in word) and len(word)>2)])

    
    # This step will effectively transform the texts into sequences of indices
    dataset["text_encoded"] = tokenizer.texts_to_sequences(dataset.text_clean)
    #dataset["hashtags_encoded"] = tokenizer_htg.texts_to_sequences(dataset.hashtags)

    set_text_pad_pre = tf.keras.preprocessing.sequence.pad_sequences(dataset.text_encoded, padding="pre", maxlen=text_len)
    #set_htg_pad_pre = tf.keras.preprocessing.sequence.pad_sequences(dataset.hashtags_encoded, padding="pre", maxlen=htg_len)

    #return tf.constant(np.concatenate((set_text_pad_pre, set_htg_pad_pre), axis=1))
    return tf.constant(set_text_pad_pre)

In [88]:
test_db=pd.read_csv("test.csv")

In [89]:
test_db.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [90]:
test_db=pd.read_csv("test.csv")
test_ds=preprocessing(test_db)

In [91]:
test_ds

<tf.Tensor: shape=(3263, 27), dtype=int32, numpy=
array([[   0,    0,    0, ...,    2,  267,    3],
       [   0,    0,    0, ...,  267,    3,  104],
       [   0,    0,    0, ...,    2,  267,    3],
       ...,
       [   0,    0,    0, ...,    2,  267,    3],
       [   0,    0,    0, ...,    2,  267,    3],
       [   0,    0,    0, ...,  267,    3, 2448]])>

In [96]:
test_db['prediction']=np.argmax(model(test_ds),axis=1)

In [98]:
test_db

Unnamed: 0,id,keyword,location,text,text_clean,hashtags,prediction
0,0,[nan],,Just happened a terrible car crash,"[<start>, just, happen, terrible, car, crash, ...",[],0
1,2,[nan],,"Heard about #earthquake is different cities, s...","[<start>, hear, about, earthquake, different, ...",[earthquake],1
2,3,[nan],,"there is a forest fire at spot pond, geese are...","[<start>, there, forest, fire, spot, pond, goo...",[],1
3,9,[nan],,Apocalypse lighting. #Spokane #wildfires,"[<start>, apocalypse, lighting, spokane, wildf...","[Spokane, wildfires]",1
4,11,[nan],,Typhoon Soudelor kills 28 in China and Taiwan,"[<start>, typhoon, soudelor, kill, china, and,...",[],1
...,...,...,...,...,...,...,...
3258,10861,[nan],,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,"[<start>, earthquake, safety, los, angeles, sa...",[],1
3259,10865,[nan],,Storm in RI worse than last hurricane. My city...,"[<start>, storm, bad, than, last, hurricane, h...",[],1
3260,10868,[nan],,Green Line derailment in Chicago http://t.co/U...,"[<start>, green, line, derailment, chicago, ht...",[],1
3261,10874,[nan],,MEG issues Hazardous Weather Outlook (HWO) htt...,"[<start>, meg, issue, hazardous, weather, outl...",[],1


In [99]:
sample_submission['target']=test_db['prediction']
sample_submission.to_csv("submission.csv", index=False)

# With Attention Layers

In [207]:
dataset_train = pd.read_csv('train.csv')

In [208]:
train_db=dataset_train

In [209]:
# Remove all non alphanumeric characters except whitespaces
train_db["text_clean"] = train_db["text"].apply(lambda x:''.join(ch for ch in x if ch.isalnum() or ch==" "))
# remove double spaces and spaces at the beginning and end of strings
train_db["text_clean"] = train_db["text_clean"].apply(lambda x: x.replace(" +"," ").lower().strip())
# remove stop words and replace everyword with their lemma
train_db["text_clean"] = train_db["text_clean"].apply(lambda x: [token.lemma_ for token in nlp(x)])
train_db.head()

Unnamed: 0,id,keyword,location,text,target,text_clean
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[our, deed, be, the, reason, of, this, earthqu..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[all, resident, ask, to, shelter, in, place, b..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13000, people, receive, wildfire, evacuation,..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[just, got, send, this, photo, from, ruby, ala..."


In [210]:
train_db['text_clean']=train_db['text_clean'].apply(lambda x : x+["<eos>"])

In [211]:
train_db.iloc[0,5]

['our',
 'deed',
 'be',
 'the',
 'reason',
 'of',
 'this',
 'earthquake',
 'may',
 'allah',
 'forgive',
 'we',
 'all',
 '<eos>']

In [212]:
pat=re.compile(r"#(\w+)")
train_db['hashtags']=train_db['text'].apply(lambda x : pat.findall(x))

In [213]:
train_db.keyword=train_db.keyword.apply(lambda x : ['nan'] if (isinstance(x, float) or pd.isna(x)) else [x])

In [214]:
train_db.hashtags

0              [earthquake]
1                        []
2                        []
3               [wildfires]
4       [Alaska, wildfires]
               ...         
7608                     []
7609                     []
7610                     []
7611                     []
7612                     []
Name: hashtags, Length: 7613, dtype: object

In [215]:
train_db['text_clean']=train_db.apply(lambda x: x['text_clean']+x['keyword']+["<htg>"]+x['hashtags'], axis =1)

In [216]:
train_db.iloc[0,5]

['our',
 'deed',
 'be',
 'the',
 'reason',
 'of',
 'this',
 'earthquake',
 'may',
 'allah',
 'forgive',
 'we',
 'all',
 '<eos>',
 'nan',
 '<htg>',
 'earthquake']

In [217]:
# Delete all words containing a digit and less than 2 letters
train_db['text_clean']=train_db['text_clean'].apply(lambda x : [word for word in x if (not any(i.isdigit() for i in word) and len(word)>2)])

In [218]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=2500) # instanciate the tokenizer
# num_words indicates the number of words to keep in the tokenization
# keeps only the most common words

tokenizer.fit_on_texts(train_db.text_clean) # fit the tokenizer on the texts
# in this step the tokenizer will list all unique tokens in the text
# and associate them with a specific integer.

# This step will effectively transform the texts into sequences of indices
train_db["text_encoded"] = tokenizer.texts_to_sequences(train_db.text_clean)

# Sometimes the preprocessing removes all the words in a string (because they contain
# only stopwords for example) so we calculate the length in order to filter out
# those records
train_db["len_text"] = train_db["text_encoded"].apply(lambda x: len(x))
train_db = train_db[train_db["len_text"]!=0]

In [219]:
train_db

Unnamed: 0,id,keyword,location,text,target,text_clean,hashtags,text_encoded,len_text
0,1,[nan],,Our Deeds are the Reason of this #earthquake M...,1,"[our, deed, the, reason, this, earthquake, may...",[earthquake],"[128, 3, 591, 11, 103, 149, 1144, 2119, 24, 1,...",13
1,4,[nan],,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, ronge, sask, canada, <eos...",[],"[234, 13, 305, 1145, 1, 266, 2]",7
2,5,[nan],,All residents asked to 'shelter in place' are ...,1,"[all, resident, ask, shelter, place, notify, o...",[],"[24, 1549, 622, 1885, 499, 464, 306, 146, 1885...",15
3,6,[nan],,"13,000 people receive #wildfires evacuation or...",1,"[people, receive, wildfire, evacuation, order,...",[wildfires],"[36, 2477, 88, 146, 465, 85, 1, 266, 2, 2120]",10
4,7,[nan],,Just got sent this photo from Ruby #Alaska as ...,1,"[just, got, send, this, photo, from, ruby, ala...","[Alaska, wildfires]","[17, 353, 338, 11, 235, 12, 1550, 154, 12, 88,...",18
...,...,...,...,...,...,...,...,...,...
7608,10869,[nan],,Two giant cranes holding a bridge collapse int...,1,"[two, giant, crane, hold, bridge, collapse, in...",[],"[130, 760, 1118, 489, 410, 58, 51, 664, 68, 1,...",12
7609,10870,[nan],,@aria_ahrary @TheTawniest The out of control w...,1,"[ariaahrary, thetawniest, the, out, control, w...",[],"[3, 23, 728, 375, 13, 85, 215, 3, 260, 439, 3,...",16
7610,10871,[nan],,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,"[volcano, hawaii, <eos>, nan, <htg>]",[],"[277, 1420, 1, 266, 2]",5
7611,10872,[nan],,Police investigating after an e-bike collided ...,1,"[police, investigate, after, ebike, collide, w...",[],"[48, 904, 27, 109, 10, 104, 374, 1709, 1317, 1...",15


In [220]:
train_db.iloc[3,5]

['people',
 'receive',
 'wildfire',
 'evacuation',
 'order',
 'california',
 '<eos>',
 'nan',
 '<htg>',
 'wildfires']

In [238]:
train_db.iloc[0,7]

[128, 3, 591, 11, 103, 149, 1144, 2119, 24, 1, 266, 2, 103]

In [222]:
train_text_pad_pre = tf.keras.preprocessing.sequence.pad_sequences(train_db.text_encoded, padding="pre")

In [231]:
# let's add a start block in target
train_db.target=train_db.target.apply(lambda x: x[1])

In [232]:
train_db

Unnamed: 0,id,keyword,location,text,target,text_clean,hashtags,text_encoded,len_text
0,1,[nan],,Our Deeds are the Reason of this #earthquake M...,1,"[our, deed, the, reason, this, earthquake, may...",[earthquake],"[128, 3, 591, 11, 103, 149, 1144, 2119, 24, 1,...",13
1,4,[nan],,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, ronge, sask, canada, <eos...",[],"[234, 13, 305, 1145, 1, 266, 2]",7
2,5,[nan],,All residents asked to 'shelter in place' are ...,1,"[all, resident, ask, shelter, place, notify, o...",[],"[24, 1549, 622, 1885, 499, 464, 306, 146, 1885...",15
3,6,[nan],,"13,000 people receive #wildfires evacuation or...",1,"[people, receive, wildfire, evacuation, order,...",[wildfires],"[36, 2477, 88, 146, 465, 85, 1, 266, 2, 2120]",10
4,7,[nan],,Just got sent this photo from Ruby #Alaska as ...,1,"[just, got, send, this, photo, from, ruby, ala...","[Alaska, wildfires]","[17, 353, 338, 11, 235, 12, 1550, 154, 12, 88,...",18
...,...,...,...,...,...,...,...,...,...
7608,10869,[nan],,Two giant cranes holding a bridge collapse int...,1,"[two, giant, crane, hold, bridge, collapse, in...",[],"[130, 760, 1118, 489, 410, 58, 51, 664, 68, 1,...",12
7609,10870,[nan],,@aria_ahrary @TheTawniest The out of control w...,1,"[ariaahrary, thetawniest, the, out, control, w...",[],"[3, 23, 728, 375, 13, 85, 215, 3, 260, 439, 3,...",16
7610,10871,[nan],,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,"[volcano, hawaii, <eos>, nan, <htg>]",[],"[277, 1420, 1, 266, 2]",5
7611,10872,[nan],,Police investigating after an e-bike collided ...,1,"[police, investigate, after, ebike, collide, w...",[],"[48, 904, 27, 109, 10, 104, 374, 1709, 1317, 1...",15


In [239]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_text_pad_pre,train_db['target'].values, test_size=0.3)

In [240]:
X_train

array([[  0,   0,   0, ...,   1,  74,   2],
       [  0,   0,   0, ..., 103,   2, 831],
       [  0,   0,   0, ...,   1, 323,   2],
       ...,
       [  0,   0,   0, ...,   1, 264,   2],
       [  0,   0,   0, ...,   1, 441,   2],
       [  0,   0,   0, ...,   1, 204,   2]])

In [241]:
y_train=np.array([[1,i] for i in y_train])
y_val=np.array([[1,i] for i in y_val])

In [242]:
y_train

array([[1, 0],
       [1, 1],
       [1, 0],
       ...,
       [1, 1],
       [1, 0],
       [1, 1]])

In [255]:
BATCH_SIZE=64

train_batch = tf.data.Dataset.from_tensor_slices((X_train,y_train)).shuffle(len(X_train)).batch(BATCH_SIZE)
val_batch=tf.data.Dataset.from_tensor_slices((X_val,y_val)).shuffle(len(X_val)).batch(BATCH_SIZE)

2- Modeling

In [312]:
n_embed = 128
n_gru = 64
vocab_size_fr=2500
vocab_imp_size = vocab_size_fr+1
vocab_tar_size = 3

In [313]:
class Encoder(tf.keras.Model):
    def __init__(self, in_vocab_size, embed_dim, n_units):
        super().__init__()
        # instanciate an embedding layer
        self.n_units = n_units
        self.embed = tf.keras.layers.Embedding(input_dim=in_vocab_size,
                                               output_dim=embed_dim)
        # instantiate GRU layer
        self.gru = tf.keras.layers.GRU(units=n_units,
                                       return_sequences=True,
                                       return_state=True)

    def call(self, input_batch):
        # each output will be saved as a class attribute so we can easily access
        # them to control the shapes throughout the demo
        embed_out = self.embed(input_batch)
        gru_out, gru_state = self.gru(embed_out)

        return gru_out, gru_state

In [314]:
encoder = Encoder(vocab_imp_size, n_embed, n_gru)

In [315]:
encoder_output,encoder_state = encoder(tf.expand_dims(train_text_pad_pre[0], 0))

In [316]:
encoder_output

<tf.Tensor: shape=(1, 26, 64), dtype=float32, numpy=
array([[[ 0.00510759, -0.00452322, -0.02219347, ...,  0.00268356,
          0.01137654,  0.01860691],
        [ 0.00862886, -0.00896753, -0.03306883, ...,  0.0067078 ,
          0.01800991,  0.0254958 ],
        [ 0.01097634, -0.01218579, -0.03820089, ...,  0.00975473,
          0.02190971,  0.02796363],
        ...,
        [-0.0232943 ,  0.00368995, -0.0047837 , ...,  0.01361852,
          0.00804966, -0.01761423],
        [-0.02499404,  0.0169581 , -0.01824763, ...,  0.00387126,
         -0.01066962,  0.0094427 ],
        [-0.01262574, -0.01064528, -0.03226487, ...,  0.01118626,
          0.00317481, -0.00624221]]], dtype=float32)>

In [317]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, attention_units):
        super().__init__()

        # The attention layer contains three dense layers
        self.W1 = tf.keras.layers.Dense(units=attention_units)
        self.W2 = tf.keras.layers.Dense(units=attention_units)
        self.V = tf.keras.layers.Dense(units=1)

    def call(self, enc_out, state):
        # the choice of name of the arguments here is not random, enc_out
        # will represent the encoder output which will be used to create
        # the attention weights and then used to create the context vector once we
        # apply the attention weights
        # the state will be a hidden state from a recurrent unit coming either
        # from the encoder at first, and from the decoder as we make further 
        # predictions
        W1_out = self.W1(enc_out)  # shape (1, 27, attention_units)

        # If you have taken a close look the model's schema you would have noticed
        # that we are going to sum the outputs from W1 and W2, though the shapes
        # are incompatible
        # the enc_out is (batch_size, 27, 256) -> W1 -> (batch_size, 27, attention_units)
        # the state is (batch_size, 256) -> W2 -> (batch_size, attention_units)
        # thus we need to artificially add a dimension to the state along axis 1
        state = tf.expand_dims(state, axis=1)
        W2_out = self.W2(state)  # shape (batch_size, 1, attention_units)

        Wsum = W1_out + W2_out  # shape (batch_size, 27, attention_units)
        scaled_Wsum = tf.nn.tanh(Wsum)  # shape (batch_size, 27, attention_units)

        score = self.V(scaled_Wsum)  # shape (batch_size, 27, 1)

        attention_weights = tf.nn.softmax(score, axis=1)  # shape (batch_size, 27, 1)

        weighted_enc_out = enc_out * attention_weights  # shape (batch_size, 27, 256)

        context_vector = tf.reduce_sum(weighted_enc_out, axis=1)  # shape (batch_size, 256)

        return context_vector, attention_weights

In [318]:
text_len = train_text_pad_pre.shape[1]
attention_layer = BahdanauAttention(text_len)

In [319]:
context_vector, attention_weights=attention_layer(encoder_output,encoder_state)

In [320]:
class Decoder(tf.keras.Model):
    def __init__(self, tar_vocab_size, embed_dim, n_units):
        super().__init__()
        # The decoder contains an embedding layer to play with the teacher forcing
        # input, which comes from the target data
        # A GRU layer
        # A dense layer to make the predictions
        # And an attention layer
        self.embed = tf.keras.layers.Embedding(input_dim=tar_vocab_size, 
                                               output_dim=embed_dim)
        self.gru = tf.keras.layers.GRU(units=n_units,
                                       return_sequences=True,
                                       return_state=True)
        self.pred = tf.keras.layers.Dense(units=tar_vocab_size,
                                          activation="softmax")
        self.attention = BahdanauAttention(attention_units=n_units)

    def call(self, dec_in, enc_out, state):
        # first let's apply the attention layer
        context_vector, attention_weights = self.attention(enc_out, state)

        # now the decoder will ingest one sequence element from the teacher forcing
        # this will be of shape (bacth_size, 1)
        embed_out = self.embed(dec_in)  # shape (batch_size, 1, embed_dim)

        # then we need to concatenate the embedding output and the context vector
        # though their shapes are incompatible
        # embed out (batch_size, 1, embed_dim)
        # context vector (batch_size, n_units) where n_units was defined in the encoder
        # so we need to add one dimension along axis 1
        context_vector_expanded = tf.expand_dims(context_vector, axis=1)
        # shape (batch_size, 1, n_units)
        concat = tf.keras.layers.concatenate([embed_out, context_vector_expanded])
        # shape (bacth_size, 1, embed_dim + n_units)
    
        # now we get to apply the GRU layer
        gru_out, gru_state = self.gru(concat) 
        # shapes (batch_size, 1, n_units) and (batch_size, n_units)

        # let's reshape the gru output before feeding it to the dense layer
        gru_out_reshape = tf.reshape(gru_out,
                                     shape=(-1, gru_out.shape[2]))

        # now let's make a prediction
        pred_out = self.pred(gru_out_reshape)
        # shape (batch_size, 1, tar_vocab_size)

        return pred_out, gru_state, attention_weights

In [321]:
y_train.shape[1] - 1

1

In [322]:
decoder = Decoder(tar_vocab_size=vocab_tar_size, embed_dim=n_embed, n_units=n_gru)

In [323]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [324]:
inp, targ = next(iter(train_batch))

In [325]:
def train_step(inp, targ):
    loss = 0

    # we use the gradient tape to track all
    # the different operations happening in the network in order to be able
    # to compute the gradients later
    with tf.GradientTape() as tape:

        # the input sequence is fed to the encoder
        # to produce the encoder output and the encoder state
        enc_output, enc_state = encoder(inp)
        # the initial state used in the decoder is the encoder state
        dec_state = enc_state

        # the first decoder input is the first sequence element of the target batch,
        # which in our case represents the <start> token for each sequence in the batch.
        # This is what we call the teacher forcing!
        dec_input = tf.expand_dims(targ[:, 0], axis=1)

        # Everything is set up for the first step, now we need to loop over the
        # teacher forcing sequence to produce the predictions, we already have 
        # defined the first step (element 0) so we will loop from 1 to targ.shape[1]
        # which is the target sequence length
        for t in range(1, targ.shape[1]):     
            # passing dec_input, dec_state and enc_output to the decoder
            # in order to produce the prediction, the new state,
            # and the attention weights which we will not need explicitely here
            pred, dec_state, _ = decoder(dec_input, enc_output, dec_state)

            # we compare the prediction produced by teacher forcing
            # with the next element of the target and increment the loss
            loss += loss_function(targ[:, t], pred)

            # The new decoder input becomes the next element of the target sequence
            # which we just attempted to predict (teacher forcing)
            dec_input = tf.expand_dims(targ[:, t], 1)

    # we divide the loss by the target sequence's length to get the average loss across the sequence
    batch_loss = loss

    # here we concatenate the lists of trainable variables
    # for the encoder and the decoder
    variables = encoder.trainable_variables + decoder.trainable_variables

    # compute the gradient based on the loss and the trainable variables
    gradients = tape.gradient(loss, variables)

    # then update the model's parameters
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
import time
EPOCHS = 20

for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(train_batch):
        batch_loss = train_step(inp, targ)
        total_loss += batch_loss

        if batch % 10 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
  
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss))
    print('Time taken for 1 epoch {} sec'.format(time.time() - start))

    # classic encoder input
    enc_input = X_val

    # the first decoder input is the special token 0
    dec_input = tf.ones(shape=(len(X_val), 1))

    # we compute once and for all the encoder output and the encoder h state and c state
    enc_out, enc_state = encoder(enc_input)

    # The encoder h state and c state will serve as initial states for the decoder
    dec_state = enc_state

    pred = []  # we'll store the predictions in here

    # we loop over the expected length of the target, but actually the loop can run
    # for as many steps as we wish, which is the advantage of the encoder decoder
    # architecture
    for i in range(1):
        # the decoder state is updated and we get the first prediction probability vector
        dec_out, dec_state, attention_w = decoder(dec_input, enc_out, dec_state)

        # we decode the softmax vector into and index
        decoded_out = tf.expand_dims(tf.argmax(dec_out, axis=-1), axis=1)

        # update the prediction list
        pred.append(tf.expand_dims(dec_out,axis=1))

        # the previous pred will be used as the new input
        dec_input = decoded_out

    pred = tf.concat(pred, axis=1).numpy()
    print("\n val loss:", loss_function(y_val[:, 1:], pred), "\n")

In [340]:
# classic encoder input
enc_input = X_val

# the first decoder input is the special token 0
dec_input = tf.ones(shape=(len(enc_input), 1))

# we compute once and for all the encoder output and the encoder h state and c state
enc_out, enc_state = encoder(enc_input)

# The encoder h state and c state will serve as initial states for the decoder
dec_state = enc_state

# we'll store the predictions in here
pred = []

# we loop over the expected length of the target, but actually the loop can run
# for as many steps as we wish, which is the advantage of the encoder decoder
# architecture

# the decoder state is updated and we get the first prediction probability vector
dec_out, dec_state, attention_w = decoder(dec_input, enc_out, dec_state)

print(dec_input.shape,enc_out.shape,dec_state.shape)

# we decode the softmax vector into and index
decoded_out = tf.expand_dims(tf.argmax(dec_out, axis=-1), axis=1)

# update the prediction list
pred.append(decoded_out)

# the previous pred will be used as the new input
dec_input = decoded_out

pred = tf.concat(pred, axis=1).numpy()


(2284, 1) (2284, 26, 64) (2284, 64)


In [342]:
pred = pred[:,0]

In [343]:
(pred==y_val[:, 1]).sum()/y_val.shape[0]

0.7530647985989493

In [681]:
def preprocessing(dataset):

    # Remove all non alphanumeric characters except whitespaces
    dataset["text_clean"] = dataset["text"].apply(lambda x:''.join(ch for ch in x if ch.isalnum() or ch==" "))

    # remove double spaces and spaces at the beginning and end of strings
    dataset["text_clean"] = dataset["text_clean"].apply(lambda x: x.replace(" +"," ").lower().strip())

    # remove stop words and replace everyword with their lemma
    dataset["text_clean"] = dataset["text_clean"].apply(lambda x: [token.lemma_ for token in nlp(x)]) #if (token.lemma_ not in STOP_WORDS) & (token.text not in STOP_WORDS)])
    
    # Extracting hashtags
    pat=re.compile(r"#(\w+)")
    dataset['hashtags']=dataset['text'].apply(lambda x : pat.findall(x))

    # Stadardization of keywords
    dataset.keyword=dataset.keyword.apply(lambda x : ['nan'] if (isinstance(x, float) or pd.isna(x)) else [x])

    #Creating a sequence with (<start> sentence <eos> keyword <htg> hashtags)
    dataset['text_clean']=dataset['text_clean'].apply(lambda x : ["<start>"]+x+["<eos>"])
    dataset['text_clean']=dataset.apply(lambda x: x['text_clean']+x['keyword']+["<htg>"]+x['hashtags'], axis =1)

    # Delete all words containing a digit and less than 2 letters
    dataset['text_clean']=dataset['text_clean'].apply(lambda x : [word for word in x if (not any(i.isdigit() for i in word) and len(word)>2)])

    
    # This step will effectively transform the texts into sequences of indices
    dataset["text_encoded"] = tokenizer.texts_to_sequences(dataset.text_clean)

    set_text_pad_pre = tf.keras.preprocessing.sequence.pad_sequences(dataset.text_encoded, padding="pre", maxlen=text_len)

    return set_text_pad_pre

In [682]:
test_db=pd.read_csv("test.csv")
test_ds=preprocessing(test_db)

In [684]:
test_ds.shape

(3263, 26)

In [685]:
# classic encoder input
enc_input = test_ds

# the first decoder input is the special token 0
dec_input = tf.ones(shape=(len(enc_input), 1))

# we compute once and for all the encoder output and the encoder h state and c state
enc_out, enc_state = encoder(enc_input)

# The encoder h state and c state will serve as initial states for the decoder
dec_state = enc_state

# we'll store the predictions in here
pred = []

# we loop over the expected length of the target, but actually the loop can run
# for as many steps as we wish, which is the advantage of the encoder decoder
# architecture
print(dec_input.shape,enc_out.shape,dec_state.shape)
# the decoder state is updated and we get the first prediction probability vector
dec_out, dec_state, attention_w = decoder(dec_input, enc_out, dec_state)

# we decode the softmax vector into and index
decoded_out = tf.expand_dims(tf.argmax(dec_out, axis=-1), axis=1)

# update the prediction list
pred.append(decoded_out)

# the previous pred will be used as the new input
dec_input = decoded_out

pred = tf.concat(pred, axis=-1).numpy()
pred = pred[:,0]

(3263, 1) (3263, 26, 64) (3263, 64)


In [686]:
test_db['prediction']=pred

In [687]:
test_db

Unnamed: 0,id,keyword,location,text,text_clean,hashtags,text_encoded,prediction
0,0,[nan],,Just happened a terrible car crash,"[<start>, just, happen, terrible, car, crash, ...",[],"[1, 18, 340, 105, 41, 2, 267, 3]",1
1,2,[nan],,"Heard about #earthquake is different cities, s...","[<start>, hear, about, earthquake, different, ...",[earthquake],"[1, 252, 35, 104, 256, 546, 312, 2, 267, 3, 104]",1
2,3,[nan],,"there is a forest fire at spot pond, geese are...","[<start>, there, forest, fire, spot, pond, goo...",[],"[1, 42, 235, 14, 694, 800, 4, 562, 23, 9, 203,...",1
3,9,[nan],,Apocalypse lighting. #Spokane #wildfires,"[<start>, apocalypse, lighting, spokane, wildf...","[Spokane, wildfires]","[1, 241, 89, 2, 267, 3]",0
4,11,[nan],,Typhoon Soudelor kills 28 in China and Taiwan,"[<start>, typhoon, soudelor, kill, china, and,...",[],"[1, 222, 696, 54, 678, 5, 2, 267, 3]",1
...,...,...,...,...,...,...,...,...
3258,10861,[nan],,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,"[<start>, earthquake, safety, los, angeles, sa...",[],"[1, 104, 977, 977, 2, 267, 3]",1
3259,10865,[nan],,Storm in RI worse than last hurricane. My city...,"[<start>, storm, bad, than, last, hurricane, h...",[],"[1, 57, 180, 65, 158, 163, 745, 259, 63, 15, 4...",1
3260,10868,[nan],,Green Line derailment in Chicago http://t.co/U...,"[<start>, green, line, derailment, chicago, ht...",[],"[1, 853, 637, 183, 2, 267, 3]",1
3261,10874,[nan],,MEG issues Hazardous Weather Outlook (HWO) htt...,"[<start>, meg, issue, hazardous, weather, outl...",[],"[1, 295, 227, 318, 2, 267, 3]",0


In [689]:
sample_submission['target']=test_db['prediction']
sample_submission.to_csv("submission.csv", index=False)

# Using Bert Embedding

In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # Required by bert_en_uncased_preprocess/3
import numpy as np

text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder = hub.KerasLayer("https://tfhub.dev/google/tn_bert/1",trainable=False)

encoder_inputs = preprocessor(text_input)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]
sequence_output = outputs["sequence_output"]
lstm_input=tf.expand_dims(pooled_output, axis=1)

embeder_model = tf.keras.Model(inputs=text_input, outputs=pooled_output)


In [5]:
model = Sequential([
  #embeder_model, # the embedding layer
  # the input dim needs to be equal to the size of the vocabulary
  Dense(512, activation='relu'), # a dense layer
  Dense(128, activation='relu'), # a dense layer
  Dense(64, activation='relu'), # a dense layer
  Dense(16, activation='relu'), # a dense layer
  Dense(8, activation='relu'), # a dense layer
  Dense(2, activation="softmax") # the prediction layer
])
model.build([None,768])

In [72]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 512)               393728    
                                                                 
 dense_25 (Dense)            (None, 128)               65664     
                                                                 
 dense_26 (Dense)            (None, 64)                8256      
                                                                 
 dense_27 (Dense)            (None, 16)                1040      
                                                                 
 dense_28 (Dense)            (None, 8)                 136       
                                                                 
 dense_29 (Dense)            (None, 2)                 18        
                                                                 
Total params: 468,842
Trainable params: 468,842
Non-tr

In [27]:
train_db["text_cleaned"]=train_db["text_clean"].apply(lambda x: ' '.join(x))

In [None]:
np.array([text])

In [60]:
titi=[texts[0].numpy().tolist()[0],texts[1].numpy().tolist()[0]]

In [62]:
tf.constant(titi)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[ 0.39643735,  0.3066105 , -0.73066515, ...,  0.05066866,
        -0.22604758, -0.27400368],
       [ 0.5651825 ,  0.503533  , -0.23281331, ..., -0.16912988,
        -0.46560633, -0.18402806]], dtype=float32)>

In [63]:
from tqdm import tqdm

texts=[]

for text in tqdm(train_db["text_cleaned"]):
    toto=embeder_model(np.array([text]))
    texts.append(toto.numpy().tolist()[0])

100%|██████████| 7613/7613 [27:09<00:00,  4.67it/s]


In [66]:
train_db['enbeded_text']=texts

In [68]:
train_db.to_csv('train_modified.csv')

In [3]:
train_db=pd.read_csv('train_modified.csv')

In [None]:
texts = train_db.enbeded_text.str.replace('[','')
texts = texts.str.replace(']','')
texts = texts.str.split(",")
texts = texts.apply(lambda x : [float(i) for i in x])

texts.to_list()

In [24]:
train_ds = tf.data.Dataset.from_tensor_slices((texts.to_list(), train_db.target))

# Train / Test / Val distribution with the same ratio of target value

nb_class = 2
TEST_SIZE = 0
VAL_SIZE = 0.3

for i in range(nb_class):
    classe_i_dataset = train_ds.filter(lambda x,y: y ==i)
    
    DATA_SIZE = len(list(classe_i_dataset))

    classe_i_dataset = classe_i_dataset.shuffle(DATA_SIZE)
    class_i_sample_len = int(DATA_SIZE * (1-TEST_SIZE))
    classe_i_train = classe_i_dataset.take(class_i_sample_len)
    
    classe_i_test = classe_i_dataset.skip(class_i_sample_len)
    
    class_i_val_len = int(DATA_SIZE *VAL_SIZE)
    classe_i_val=classe_i_train.take(class_i_val_len)


    classe_i_train=classe_i_train.skip(class_i_val_len)


    if i ==0 :
        train_dataset=classe_i_train
        test_dataset = classe_i_test
        val_dataset = classe_i_val
    else :
        train_dataset=train_dataset.concatenate(classe_i_train)
        test_dataset=test_dataset.concatenate(classe_i_test)
        val_dataset=val_dataset.concatenate(classe_i_val)

train_ds = train_dataset.shuffle(len(list(train_dataset))).batch(64)
val_ds = val_dataset.shuffle(len(list(val_dataset))).batch(64)

In [27]:
# Let's create a learning rate schedule to decrease the learning rate as we train the model. 
initial_learning_rate = 0.0001

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.97,
    staircase=True)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = lr_schedule),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])


model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x223dd47d4f0>

In [81]:
def preprocessing(dataset):

    # Remove all non alphanumeric characters except whitespaces
    dataset["text_clean"] = dataset["text"].apply(lambda x:''.join(ch for ch in x if ch.isalnum() or ch==" "))

    # remove double spaces and spaces at the beginning and end of strings
    dataset["text_clean"] = dataset["text_clean"].apply(lambda x: x.replace(" +"," ").lower().strip())

    # remove stop words and replace everyword with their lemma
    dataset["text_clean"] = dataset["text_clean"].apply(lambda x: [token.lemma_ for token in nlp(x)])# if (token.lemma_ not in STOP_WORDS) & (token.text not in STOP_WORDS)])
    
    # Extracting hashtags
    pat=re.compile(r"#(\w+)")
    dataset['hashtags']=dataset['text'].apply(lambda x : pat.findall(x))

    # Stadardization of keywords
    dataset.keyword=dataset.keyword.apply(lambda x : ['nan'] if (isinstance(x, float) or pd.isna(x)) else [x])

    #Creating a sequence with (<start> sentence <eos> keyword <htg> hashtags)
    dataset['text_clean']=dataset['text_clean'].apply(lambda x : ["<start>"]+x+["<eos>"])
    dataset['text_clean']=dataset.apply(lambda x: x['text_clean']+x['keyword']+["<htg>"]+x['hashtags'], axis =1)

    # Delete all words containing a digit and less than 2 letters
    dataset['text_clean']=dataset['text_clean'].apply(lambda x : [word for word in x if (not any(i.isdigit() for i in word) and len(word)>2)])
    dataset["text_cleaned"]=dataset["text_clean"].apply(lambda x: ' '.join(x))
    
    # Embedding text
    texts=[]

    for text in tqdm(dataset["text_cleaned"]):
        toto=embeder_model(np.array([text]))
        texts.append(toto.numpy().tolist()[0])


    return texts

In [82]:
test_db=pd.read_csv("test_modifiedt.csv")
test_ds=preprocessing(test_db)

100%|██████████| 3263/3263 [10:56<00:00,  4.97it/s]


In [83]:
test_db['enbeded_text']=test_ds
test_db.to_csv('test_modified.csv')

In [28]:
liste = [[1,2,3],[4,5,6]]

tf.constant(liste)

<tf.Tensor: shape=(2, 3), dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6]])>

In [29]:
test_db=pd.read_csv('test_modified.csv')

texts = test_db.enbeded_text.str.replace('[','')
texts = texts.str.replace(']','')
texts = texts.str.split(",")
texts = texts.apply(lambda x : [float(i) for i in x])

test_ds=tf.constant(np.array(texts.to_list()))


  texts = test_db.enbeded_text.str.replace('[','')
  texts = texts.str.replace(']','')


In [None]:
test_ds

In [31]:
test_db['prediction']=np.argmax(model(test_ds),axis=1)

In [33]:
test_db

Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,text_clean,hashtags,text_cleaned,enbeded_text,prediction
0,0,0,['nan'],,Just happened a terrible car crash,"['<start>', 'just', 'happen', 'terrible', 'car...",[],<start> just happen terrible car crash <eos> n...,"[0.44229334592819214, 0.6518906354904175, -0.4...",1
1,1,2,['nan'],,"Heard about #earthquake is different cities, s...","['<start>', 'hear', 'about', 'earthquake', 'di...",['earthquake'],<start> hear about earthquake different city s...,"[0.3072817325592041, 0.5369487404823303, -0.56...",1
2,2,3,['nan'],,"there is a forest fire at spot pond, geese are...","['<start>', 'there', 'forest', 'fire', 'spot',...",[],<start> there forest fire spot pond goose flee...,"[0.24782955646514893, 0.5383267402648926, -0.5...",1
3,3,9,['nan'],,Apocalypse lighting. #Spokane #wildfires,"['<start>', 'apocalypse', 'lighting', 'spokane...","['Spokane', 'wildfires']",<start> apocalypse lighting spokane wildfire <...,"[0.25921839475631714, 0.5270876288414001, -0.4...",1
4,4,11,['nan'],,Typhoon Soudelor kills 28 in China and Taiwan,"['<start>', 'typhoon', 'soudelor', 'kill', 'ch...",[],<start> typhoon soudelor kill china and taiwan...,"[0.33549031615257263, 0.41376328468322754, -0....",1
...,...,...,...,...,...,...,...,...,...,...
3258,3258,10861,['nan'],,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,"['<start>', 'earthquake', 'safety', 'los', 'an...",[],<start> earthquake safety los angeles safety f...,"[0.44013434648513794, 0.33875393867492676, -0....",0
3259,3259,10865,['nan'],,Storm in RI worse than last hurricane. My city...,"['<start>', 'storm', 'bad', 'than', 'last', 'h...",[],<start> storm bad than last hurricane hard hit...,"[0.2696795165538788, 0.28528931736946106, -0.2...",1
3260,3260,10868,['nan'],,Green Line derailment in Chicago http://t.co/U...,"['<start>', 'green', 'line', 'derailment', 'ch...",[],<start> green line derailment chicago httptcou...,"[0.42827627062797546, 0.45181402564048767, -0....",1
3261,3261,10874,['nan'],,MEG issues Hazardous Weather Outlook (HWO) htt...,"['<start>', 'meg', 'issue', 'hazardous', 'weat...",[],<start> meg issue hazardous weather outlook hw...,"[0.3431391716003418, 0.6408101916313171, -0.42...",1


In [36]:
sample_submission['target']=test_db['prediction']
sample_submission.to_csv("submission.csv", index=False)