In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import string
from textblob import TextBlob
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
import nltk

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [40]:
stemmer = SnowballStemmer("english")

In [41]:
pd.set_option('display.max_colwidth', 50)

In [42]:
train = pd.read_csv("train.csv",encoding = "ISO-8859-1")
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [43]:
test = pd.read_csv("test.csv",encoding = "ISO-8859-1")
test.head(5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [44]:
def remove_punctuation(text):
    '''a function for removing punctuation'''
    # replacing the punctuations with no space.
    # which in effect deletes the punctuation marks.
    translator = str.maketrans('', '', string.punctuation)
    # return the text stripped of punctuation marks.
    return text.translate(translator)


In [45]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()  #quito_mayusculas
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  #quito caracteres inusuales
    text = re.sub('â' , '', text)  #quito caracteres inusuales
    text = re.sub('ª' , '', text)  #quito caracteres inusuales
    text = re.sub('ã' , '', text)  #quito caracteres inusuales
    text = re.sub('¼' , '', text)  #quito caracteres inusuales
    text = re.sub('target' , '', text)  #quito caracteres inusuales
    text = re.sub('\w*\d\w*', '', text)  # quito numeros
    return text

In [46]:
train_limpio = train.copy()
test_limpio = test.copy()

In [47]:
test_limpio['text'] = test_limpio['text'].apply(lambda x: clean_text(x))
train_limpio['text'] = train_limpio['text'].apply(lambda x: clean_text(x))

In [48]:
test_limpio.head()

Unnamed: 0,id,keyword,location,text
0,0,,,just happened a terrible car crash
1,2,,,heard about earthquake is different cities sta...
2,3,,,there is a forest fire at spot pond geese are ...
3,9,,,apocalypse lighting spokane wildfires
4,11,,,typhoon soudelor kills in china and taiwan


In [49]:
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,people receive wildfires evacuation orders in...,1
4,7,,,just got sent this photo from ruby alaska as s...,1


In [50]:
train_limpio["text"] = train_limpio["text"].apply(lambda x: word_tokenize(x))
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[our, deeds, are, the, reason, of, this, earth...",1
1,4,,,"[forest, fire, near, la, ronge, sask, canada]",1
2,5,,,"[all, residents, asked, to, shelter, in, place...",1
3,6,,,"[people, receive, wildfires, evacuation, order...",1
4,7,,,"[just, got, sent, this, photo, from, ruby, ala...",1


In [51]:
test_limpio["text"] = test_limpio["text"].apply(lambda x: word_tokenize(x))
test_limpio.head()

Unnamed: 0,id,keyword,location,text
0,0,,,"[just, happened, a, terrible, car, crash]"
1,2,,,"[heard, about, earthquake, is, different, citi..."
2,3,,,"[there, is, a, forest, fire, at, spot, pond, g..."
3,9,,,"[apocalypse, lighting, spokane, wildfires]"
4,11,,,"[typhoon, soudelor, kills, in, china, and, tai..."


In [52]:
def remove_stopwords(word_tokens):
    stop_words = set(stopwords.words('english'))
    
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
    
    return filtered_sentence
    

In [53]:
train_limpio["text"] = train_limpio["text"].apply(lambda x: remove_stopwords(x))
train_limpio["text"] = train_limpio["text"].apply(lambda x: ' '.join(x))
train_limpio.head()
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,people receive wildfires evacuation orders cal...,1
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1


In [54]:
test_limpio["text"] = test_limpio["text"].apply(lambda x: remove_stopwords(x))
test_limpio["text"] = test_limpio["text"].apply(lambda x: ' '.join(x))
test_limpio.head()
test_limpio.head()

Unnamed: 0,id,keyword,location,text
0,0,,,happened terrible car crash
1,2,,,heard earthquake different cities stay safe ev...
2,3,,,forest fire spot pond geese fleeing across str...
3,9,,,apocalypse lighting spokane wildfires
4,11,,,typhoon soudelor kills china taiwan


In [55]:
def stemming(text):    
    '''a function which stems each word in the given text'''
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text) 

In [56]:
train_limpio["text"] = train_limpio["text"].apply(lambda x: stemming(x))
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquak may allah forgiv us,1
1,4,,,forest fire near la rong sask canada,1
2,5,,,resid ask shelter place notifi offic evacu she...,1
3,6,,,peopl receiv wildfir evacu order california,1
4,7,,,got sent photo rubi alaska smoke wildfir pour ...,1


In [57]:
test_limpio["text"] = test_limpio["text"].apply(lambda x: stemming(x))
test_limpio.head()

Unnamed: 0,id,keyword,location,text
0,0,,,happen terribl car crash
1,2,,,heard earthquak differ citi stay safe everyon
2,3,,,forest fire spot pond gees flee across street ...
3,9,,,apocalyps light spokan wildfir
4,11,,,typhoon soudelor kill china taiwan


In [58]:
palabras_por_separado = train_limpio.copy()
palabras_por_separado.set_index(['id','target'],inplace=True)
palabras_por_separado = palabras_por_separado['text'].str.split(expand=True).stack().to_frame()
palabras_por_separado.columns = ['word']
palabras_por_separado.reset_index(inplace=True)
del palabras_por_separado['level_2']
palabras_por_separado.head()

Unnamed: 0,id,target,word
0,1,1,deed
1,1,1,reason
2,1,1,earthquak
3,1,1,may
4,1,1,allah


In [59]:
top = palabras_por_separado.groupby(['word']).agg({'target':'count'})
top_2 =top.loc[top["target"] < 5 ]
top_2.reset_index(inplace=True)
rare_words = top_2["word"].to_list()
len(rare_words)

11870

In [60]:
def remove_rare_words(texto):
    texto_spliteado = texto.split()
    filtered_sentence = [w for w in texto_spliteado if not w in rare_words]
    return ' '.join(filtered_sentence)

In [61]:
prueba = "deed hola que the como estas"
print(remove_rare_words(prueba))

hola the estas


In [62]:
train_limpio["text"] = train_limpio["text"].apply(lambda x: remove_rare_words(x))
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,reason earthquak may allah us,1
1,4,,,forest fire near la canada,1
2,5,,,resid ask shelter place offic evacu shelter pl...,1
3,6,,,peopl wildfir evacu order california,1
4,7,,,got sent photo alaska smoke wildfir school,1


In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(train_limpio['text'].to_list())
dense = vectors.todense()
feature_names = vectorizer.get_feature_names()
new_train = pd.DataFrame(dense,columns = feature_names)
new_train.head()

Unnamed: 0,aba,abandon,abc,abcnew,abl,ablaz,absolut,abstorm,abus,access,...,your,youth,youtub,youv,yr,yrs,yyc,zionist,zombi,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
vectors2 = vectorizer.transform(test_limpio['text'].to_list())
dense2 = vectors.todense()
new_test = pd.DataFrame(dense2,columns= feature_names)
new_test

Unnamed: 0,aba,abandon,abc,abcnew,abl,ablaz,absolut,abstorm,abus,access,...,your,youth,youtub,youv,yr,yrs,yyc,zionist,zombi,zone
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7609,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7610,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7611,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
for i in train_limpio.columns:
    new_train[i]=train_limpio[i]


In [66]:
new_train = new_train.drop(columns=['text','keyword','location'])

In [67]:
new_test = new_test.drop(columns=['text'])

In [68]:
new_train

Unnamed: 0,aba,abandon,abc,abcnew,abl,ablaz,absolut,abstorm,abus,access,...,youth,youtub,youv,yr,yrs,yyc,zionist,zombi,zone,target
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7609,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7610,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7611,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [69]:
new_train.to_csv('train_limpio_con_Tf-Idf.csv', index=False)
new_test.to_csv('test_limpio_con_Tf-Idf.csv', index=False)
