In [1]:
import pandas as pd
import numpy as np
import re
import string
from textblob import TextBlob
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 


from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [2]:
stemmer = SnowballStemmer("english")

In [3]:
pd.set_option('display.max_colwidth', 50)

In [4]:
train = pd.read_csv("train.csv",encoding = "ISO-8859-1")
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
test = pd.read_csv("test.csv",encoding = "ISO-8859-1")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
id          3263 non-null int64
keyword     3237 non-null object
location    2158 non-null object
text        3263 non-null object
dtypes: int64(1), object(3)
memory usage: 102.0+ KB


In [6]:
def remove_punctuation(text):
    '''a function for removing punctuation'''
    # replacing the punctuations with no space.
    # which in effect deletes the punctuation marks.
    translator = str.maketrans('', '', string.punctuation)
    # return the text stripped of punctuation marks.
    return text.translate(translator)


In [7]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()  #quito_mayusculas
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  #quito caracteres inusuales
    text = re.sub('â' , '', text)  #quito caracteres inusuales
    text = re.sub('ª' , '', text)  #quito caracteres inusuales
    text = re.sub('ã' , '', text)  #quito caracteres inusuales
    text = re.sub('¼' , '', text)  #quito caracteres inusuales
    text = re.sub('target' , '', text)  #quito caracteres inusuales
    text = re.sub('\w*\d\w*', '', text)  # quito numeros
    return text

In [8]:
train_limpio = train.copy()
test_limpio = test.copy()

In [9]:
test_limpio['text'] = test_limpio['text'].apply(lambda x: clean_text(x))
train_limpio['text'] = train_limpio['text'].apply(lambda x: clean_text(x))

In [10]:
test_limpio.head()

Unnamed: 0,id,keyword,location,text
0,0,,,just happened a terrible car crash
1,2,,,heard about earthquake is different cities sta...
2,3,,,there is a forest fire at spot pond geese are ...
3,9,,,apocalypse lighting spokane wildfires
4,11,,,typhoon soudelor kills in china and taiwan


In [11]:
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,people receive wildfires evacuation orders in...,1
4,7,,,just got sent this photo from ruby alaska as s...,1


In [12]:
train_limpio["text"] = train_limpio["text"].apply(lambda x: word_tokenize(x))
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[our, deeds, are, the, reason, of, this, earth...",1
1,4,,,"[forest, fire, near, la, ronge, sask, canada]",1
2,5,,,"[all, residents, asked, to, shelter, in, place...",1
3,6,,,"[people, receive, wildfires, evacuation, order...",1
4,7,,,"[just, got, sent, this, photo, from, ruby, ala...",1


In [13]:
test_limpio["text"] = test_limpio["text"].apply(lambda x: word_tokenize(x))
test_limpio.head()

Unnamed: 0,id,keyword,location,text
0,0,,,"[just, happened, a, terrible, car, crash]"
1,2,,,"[heard, about, earthquake, is, different, citi..."
2,3,,,"[there, is, a, forest, fire, at, spot, pond, g..."
3,9,,,"[apocalypse, lighting, spokane, wildfires]"
4,11,,,"[typhoon, soudelor, kills, in, china, and, tai..."


In [14]:
def remove_stopwords(word_tokens):
    stop_words = set(stopwords.words('english'))
    
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
    
    return filtered_sentence
    

In [15]:
train_limpio["text"] = train_limpio["text"].apply(lambda x: remove_stopwords(x))
train_limpio["text"] = train_limpio["text"].apply(lambda x: ' '.join(x))
train_limpio.head()
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,people receive wildfires evacuation orders cal...,1
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1


In [16]:
test_limpio["text"] = test_limpio["text"].apply(lambda x: remove_stopwords(x))
test_limpio["text"] = test_limpio["text"].apply(lambda x: ' '.join(x))
test_limpio.head()
test_limpio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
id          3263 non-null int64
keyword     3237 non-null object
location    2158 non-null object
text        3263 non-null object
dtypes: int64(1), object(3)
memory usage: 102.0+ KB


In [17]:
def stemming(text):    
    '''a function which stems each word in the given text'''
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text) 

In [18]:
train_limpio["text"] = train_limpio["text"].apply(lambda x: stemming(x))
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquak may allah forgiv us,1
1,4,,,forest fire near la rong sask canada,1
2,5,,,resid ask shelter place notifi offic evacu she...,1
3,6,,,peopl receiv wildfir evacu order california,1
4,7,,,got sent photo rubi alaska smoke wildfir pour ...,1


In [19]:
test_limpio["text"] = test_limpio["text"].apply(lambda x: stemming(x))
test_limpio.head()

Unnamed: 0,id,keyword,location,text
0,0,,,happen terribl car crash
1,2,,,heard earthquak differ citi stay safe everyon
2,3,,,forest fire spot pond gees flee across street ...
3,9,,,apocalyps light spokan wildfir
4,11,,,typhoon soudelor kill china taiwan


In [20]:
palabras_por_separado = train_limpio.copy()
palabras_por_separado.set_index(['id','target'],inplace=True)
palabras_por_separado = palabras_por_separado['text'].str.split(expand=True).stack().to_frame()
palabras_por_separado.columns = ['word']
palabras_por_separado.reset_index(inplace=True)
del palabras_por_separado['level_2']
palabras_por_separado.head()

Unnamed: 0,id,target,word
0,1,1,deed
1,1,1,reason
2,1,1,earthquak
3,1,1,may
4,1,1,allah


In [21]:
top = palabras_por_separado.groupby(['word']).agg({'target':'count'})
top_2 =top.loc[top["target"] < 5 ]
top_2.reset_index(inplace=True)
rare_words = top_2["word"].to_list()
len(rare_words)

11870

In [22]:
def remove_rare_words(texto):
    texto_spliteado = texto.split()
    filtered_sentence = [w for w in texto_spliteado if not w in rare_words]
    return ' '.join(filtered_sentence)

In [23]:
prueba = "deed hola que the como estas"
print(remove_rare_words(prueba))

hola the estas


In [24]:
train_limpio["text"] = train_limpio["text"].apply(lambda x: remove_rare_words(x))
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,reason earthquak may allah us,1
1,4,,,forest fire near la canada,1
2,5,,,resid ask shelter place offic evacu shelter pl...,1
3,6,,,peopl wildfir evacu order california,1
4,7,,,got sent photo alaska smoke wildfir school,1


train_limpio_true = train_limpio.loc[train_limpio["target"] == 1]
train_limpio_false = train_limpio.loc[train_limpio["target"] == 0]
print("Tenemos {cant_verdaderos: .2f} verdaderos y {cant_falsos: .2f} falsos".format(cant_verdaderos=len(train_limpio_true), cant_falsos=len(train_limpio_false)))

train = pd.concat([train_limpio_true.iloc[:2616], train_limpio_false.iloc[:3473]])
val = pd.concat([train_limpio_true.iloc[2616:3000], train_limpio_false.iloc[3473:3850]])
test = pd.concat([train_limpio_true.iloc[3000:], train_limpio_false.iloc[3850:]])

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(train_limpio['text'].to_list())
dense = vectors.todense()
feature_names = vectorizer.get_feature_names()
new_train = pd.DataFrame(dense,columns = feature_names)
new_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Columns: 2394 entries, aba to zone
dtypes: float64(2394)
memory usage: 139.0 MB


In [26]:
vectors2 = vectorizer.transform(test_limpio['text'].to_list())
dense2 = vectors2.todense()
new_test = pd.DataFrame(dense2,columns= feature_names)
new_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Columns: 2394 entries, aba to zone
dtypes: float64(2394)
memory usage: 59.6 MB


In [27]:
new_test = new_test.drop(columns=['id'])

In [28]:
for i in train_limpio.columns:
    new_train[i]=train_limpio[i]


In [29]:
for i in test_limpio.columns:
    new_test[i]=test_limpio[i]

In [30]:
new_train = new_train.drop(columns = ['id','location','keyword','text'])
new_test = new_test.drop(columns=['location','keyword','text'])

In [31]:
new_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Columns: 2393 entries, aba to target
dtypes: float64(2392), int64(1)
memory usage: 139.0 MB


In [32]:
new_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Columns: 2393 entries, aba to id
dtypes: float64(2392), int64(1)
memory usage: 59.6 MB


In [33]:
train_limpio_true = new_train.loc[train_limpio["target"] == 1]
train_limpio_false = new_train.loc[train_limpio["target"] == 0]
print("Tenemos {cant_verdaderos: .2f} verdaderos y {cant_falsos: .2f} falsos".format(cant_verdaderos=len(train_limpio_true), cant_falsos=len(train_limpio_false)))

Tenemos  3271.00 verdaderos y  4342.00 falsos


In [34]:
train = pd.concat([train_limpio_true.iloc[:3000], train_limpio_false.iloc[:3850]])
val = pd.concat([train_limpio_true.iloc[3000:], train_limpio_false.iloc[3850:]])

In [35]:
X_train = train.drop(columns=['target'])
Y_train = train['target']
X_val = val.drop(columns=['target'])
Y_val = val['target']

In [36]:
model1 = Sequential()
model1.add(layers.Embedding(2392,32))
model1.add(layers.GlobalMaxPool1D())
model1.add(layers.Dense(10, activation='relu'))
model1.add(layers.Dense(1, activation='sigmoid'))

model1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          76544     
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 10)                330       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 76,885
Trainable params: 76,885
Non-trainable params: 0
_________________________________________________________________


In [37]:
first_history = model1.fit(X_train.to_numpy(), Y_train.to_numpy(),
                    epochs=5,
                    verbose=2,
                    validation_data=(X_val.to_numpy(), Y_val.to_numpy()),
                    batch_size=10)

Epoch 1/5
685/685 - 1s - loss: 0.6850 - acc: 0.5620 - val_loss: 0.6551 - val_acc: 0.6448
Epoch 2/5
685/685 - 1s - loss: 0.6839 - acc: 0.5620 - val_loss: 0.6574 - val_acc: 0.6448
Epoch 3/5
685/685 - 1s - loss: 0.6832 - acc: 0.5620 - val_loss: 0.6588 - val_acc: 0.6448
Epoch 4/5
685/685 - 1s - loss: 0.6836 - acc: 0.5620 - val_loss: 0.6592 - val_acc: 0.6448
Epoch 5/5
685/685 - 1s - loss: 0.6835 - acc: 0.5620 - val_loss: 0.6659 - val_acc: 0.6448


In [38]:
input_dim = 2392  # Number of features

model2 = Sequential()
model2.add(layers.Dense(2, input_dim=input_dim, activation='relu'))
model2.add(layers.Dense(1, activation='sigmoid'))

model2.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['acc'])
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 2)                 4786      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 3         
Total params: 4,789
Trainable params: 4,789
Non-trainable params: 0
_________________________________________________________________


In [39]:
first_history = model2.fit(X_train.to_numpy(), Y_train.to_numpy(),
                    epochs=2,
                    verbose=2,
                    validation_data=(X_val.to_numpy(), Y_val.to_numpy()),
                    batch_size=10)

Epoch 1/2
685/685 - 1s - loss: 0.6526 - acc: 0.6498 - val_loss: 0.5982 - val_acc: 0.8218
Epoch 2/2
685/685 - 0s - loss: 0.5313 - acc: 0.8042 - val_loss: 0.5359 - val_acc: 0.7864


In [40]:
loss, accuracy = model2.evaluate(X_val.to_numpy(), Y_val.to_numpy(), verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Testing Accuracy:  0.7864


In [41]:
firstPredictions = list(map(lambda x: x[0], model2.predict_classes(new_test.drop(columns=['id']).to_numpy())))
len(firstPredictions)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


3263

In [42]:
firstPredictionsDF = pd.DataFrame(data={'id': new_test['id'], 'target': firstPredictions})
firstPredictionsDF.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


new_train.to_csv('train_limpio_con_Tf-Idf.csv', index=False)
new_test.to_csv('test_limpio_con_Tf-Idf.csv', index=False)


In [43]:
firstPredictionsDF.to_csv('Tfidf_firstPredictions.csv', index=False)

In [44]:
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,reason earthquak may allah us,1
1,4,,,forest fire near la canada,1
2,5,,,resid ask shelter place offic evacu shelter pl...,1
3,6,,,peopl wildfir evacu order california,1
4,7,,,got sent photo alaska smoke wildfir school,1


In [45]:
test_limpio.head()

Unnamed: 0,id,keyword,location,text
0,0,,,happen terribl car crash
1,2,,,heard earthquak differ citi stay safe everyon
2,3,,,forest fire spot pond gees flee across street ...
3,9,,,apocalyps light spokan wildfir
4,11,,,typhoon soudelor kill china taiwan


In [46]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(train_limpio['text'].to_list())
dense = vectors.todense()
feature_names = vectorizer.get_feature_names()
train_embedding = pd.DataFrame(dense,columns = feature_names)
train_embedding.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Columns: 2394 entries, aba to zone
dtypes: int64(2394)
memory usage: 139.0 MB


In [47]:
vectors2 = vectorizer.transform(test_limpio['text'].to_list())
dense2 = vectors2.todense()
test_embedding = pd.DataFrame(dense2,columns= feature_names)
test_embedding.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Columns: 2394 entries, aba to zone
dtypes: int64(2394)
memory usage: 59.6 MB


In [48]:
test_embedding = test_embedding.drop(columns=['id'])

In [49]:
for i in train_limpio.columns:
    train_embedding[i]=train_limpio[i]

In [50]:
for i in test_limpio.columns:
    test_embedding[i]=test_limpio[i]

In [51]:
train_embedding = train_embedding.drop(columns = ['id','location','keyword','text'])
test_embedding = test_embedding.drop(columns=['location','keyword','text'])

In [52]:
train_limpio_true = train_embedding.loc[train_limpio["target"] == 1]
train_limpio_false = train_embedding.loc[train_limpio["target"] == 0]
print("Tenemos {cant_verdaderos: .2f} verdaderos y {cant_falsos: .2f} falsos".format(cant_verdaderos=len(train_limpio_true), cant_falsos=len(train_limpio_false)))

Tenemos  3271.00 verdaderos y  4342.00 falsos


In [53]:
train = pd.concat([train_limpio_true.iloc[:3000], train_limpio_false.iloc[:3850]])
val = pd.concat([train_limpio_true.iloc[3000:], train_limpio_false.iloc[3850:]])

In [54]:
X_train = train.drop(columns=['target'])
Y_train = train['target']
X_val = val.drop(columns=['target'])
Y_val = val['target']

In [62]:

model3 = Sequential()
model3.add(layers.Embedding(2392,40))
model3.add(layers.GlobalMaxPool1D())
model3.add(layers.Dense(10, activation='sigmoid'))
model3.add(layers.Dense(1, activation='sigmoid'))

model3.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])
model3.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 40)          95680     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 40)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 10)                410       
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 11        
Total params: 96,101
Trainable params: 96,101
Non-trainable params: 0
_________________________________________________________________


In [63]:
first_history = model3.fit(X_train.to_numpy(), Y_train.to_numpy(),
                    epochs=20,
                    verbose=2,
                    validation_data=(X_val.to_numpy(), Y_val.to_numpy()),
                    batch_size=10)

Epoch 1/20
685/685 - 2s - loss: 0.7096 - acc: 0.5384 - val_loss: 0.6575 - val_acc: 0.6435
Epoch 2/20
685/685 - 1s - loss: 0.6860 - acc: 0.5613 - val_loss: 0.6686 - val_acc: 0.6448
Epoch 3/20
685/685 - 1s - loss: 0.6861 - acc: 0.5615 - val_loss: 0.6696 - val_acc: 0.6448
Epoch 4/20
685/685 - 1s - loss: 0.6852 - acc: 0.5625 - val_loss: 0.6577 - val_acc: 0.6448
Epoch 5/20
685/685 - 2s - loss: 0.6856 - acc: 0.5616 - val_loss: 0.6826 - val_acc: 0.6107
Epoch 6/20
685/685 - 1s - loss: 0.6857 - acc: 0.5616 - val_loss: 0.6699 - val_acc: 0.6448
Epoch 7/20
685/685 - 2s - loss: 0.6856 - acc: 0.5600 - val_loss: 0.6607 - val_acc: 0.6448
Epoch 8/20
685/685 - 1s - loss: 0.6853 - acc: 0.5609 - val_loss: 0.6597 - val_acc: 0.6448
Epoch 9/20
685/685 - 1s - loss: 0.6854 - acc: 0.5613 - val_loss: 0.6578 - val_acc: 0.6448
Epoch 10/20
685/685 - 1s - loss: 0.6853 - acc: 0.5625 - val_loss: 0.6703 - val_acc: 0.6448
Epoch 11/20
685/685 - 2s - loss: 0.6852 - acc: 0.5635 - val_loss: 0.6580 - val_acc: 0.6448
Epoch 12