In [4]:
import pandas as pd
import numpy as np
import re
import string
from textblob import TextBlob
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 


from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [5]:
# Shared English Snowball stemmer; stemming() calls its .stem() on every word.
stemmer = SnowballStemmer("english")

In [6]:
# Limit the displayed width of each DataFrame cell to 50 characters.
pd.set_option('display.max_colwidth', 50)

In [7]:
# Load the training set from the working directory and preview the first 5 rows.
# NOTE(review): the file is decoded as ISO-8859-1, and clean_text() later strips
# characters such as 'â' and 'ã'. That would be consistent with UTF-8 text being
# mis-decoded here — confirm the real encoding of the CSV.
train = pd.read_csv("train.csv",encoding = "ISO-8859-1")
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# Load the test set with the same encoding as the training set and print its
# column dtypes and non-null counts.
test = pd.read_csv("test.csv",encoding = "ISO-8859-1")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
id          3263 non-null int64
keyword     3237 non-null object
location    2158 non-null object
text        3263 non-null object
dtypes: int64(1), object(3)
memory usage: 102.0+ KB


COMENZAMOS LA LIMPIEZA DE LOS DATOS

In [9]:
def remove_punctuation(text):
    '''Return `text` with every character of `string.punctuation` deleted.'''
    # A translation table that maps a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)


In [10]:
# ASCII punctuation, as listed in string.punctuation.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
# Non-ASCII characters that the cleaning step strips from the text.
_UNUSUAL_CHARS_RE = re.compile('[âªã¼]')
# Any run of word characters containing at least one digit. The raw string avoids
# the invalid-escape warning that the plain literal '\w*\d\w*' triggers.
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')


def clean_text(text):
    '''Normalise a tweet before tokenisation.

    Steps, applied in this order:
      1. lowercase the text;
      2. delete ASCII punctuation;
      3. delete the characters â, ª, ã and ¼;
      4. delete every occurrence of the substring "target", including inside
         longer words;
      5. delete every word that contains a digit.

    Whitespace is never touched, so a removed word can leave consecutive or
    leading/trailing spaces behind.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    '''
    text = text.lower()
    text = _PUNCTUATION_RE.sub('', text)
    text = _UNUSUAL_CHARS_RE.sub('', text)
    # NOTE(review): presumably done so that no token is later named like the
    # `target` column — confirm; it also mangles words such as "targeted".
    text = text.replace('target', '')
    text = _WORD_WITH_DIGIT_RE.sub('', text)
    return text

In [11]:
# Work on copies so the raw `train` / `test` frames are not modified by the cleaning steps.
train_limpio = train.copy()
test_limpio = test.copy()

In [12]:
# Normalise the tweet text of both frames with clean_text().
train_limpio['text'] = train_limpio['text'].apply(clean_text)
test_limpio['text'] = test_limpio['text'].apply(clean_text)

In [13]:
# Preview the test rows after text cleaning.
test_limpio.head()

Unnamed: 0,id,keyword,location,text
0,0,,,just happened a terrible car crash
1,2,,,heard about earthquake is different cities sta...
2,3,,,there is a forest fire at spot pond geese are ...
3,9,,,apocalypse lighting spokane wildfires
4,11,,,typhoon soudelor kills in china and taiwan


In [14]:
# Preview the training rows after text cleaning.
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,people receive wildfires evacuation orders in...,1
4,7,,,just got sent this photo from ruby alaska as s...,1


TOKENIZAMOS EL TEXTO (SEPARAMOS POR PALABRAS)

In [15]:
# Replace each cleaned training tweet by its list of word tokens.
train_limpio["text"] = train_limpio["text"].apply(word_tokenize)
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[our, deeds, are, the, reason, of, this, earth...",1
1,4,,,"[forest, fire, near, la, ronge, sask, canada]",1
2,5,,,"[all, residents, asked, to, shelter, in, place...",1
3,6,,,"[people, receive, wildfires, evacuation, order...",1
4,7,,,"[just, got, sent, this, photo, from, ruby, ala...",1


In [16]:
# Replace each cleaned test tweet by its list of word tokens.
test_limpio["text"] = test_limpio["text"].apply(word_tokenize)
test_limpio.head()

Unnamed: 0,id,keyword,location,text
0,0,,,"[just, happened, a, terrible, car, crash]"
1,2,,,"[heard, about, earthquake, is, different, citi..."
2,3,,,"[there, is, a, forest, fire, at, spot, pond, g..."
3,9,,,"[apocalypse, lighting, spokane, wildfires]"
4,11,,,"[typhoon, soudelor, kills, in, china, and, tai..."


REMOVEMOS LAS PALABRAS HABITUALES DEL LENGUAJE INGLÉS (STOPWORDS)

In [17]:
# Stop-word sets already loaded from the NLTK corpus, keyed by language name.
STOP_WORDS_CACHE = {}


def remove_stopwords(word_tokens, language='english'):
    '''Return the tokens of `word_tokens` that are not stop words.

    The stop-word list is fetched with `stopwords.words(language)` only the
    first time a language is requested. Later calls reuse the cached set, so
    applying this function row by row no longer reloads the list for every row.

    Args:
        word_tokens: iterable of tokens (strings).
        language: name passed to `stopwords.words()`; defaults to 'english'.

    Returns:
        A new list with the remaining tokens in their original order.
    '''
    stop_words = STOP_WORDS_CACHE.get(language)
    if stop_words is None:
        stop_words = frozenset(stopwords.words(language))
        STOP_WORDS_CACHE[language] = stop_words
    return [w for w in word_tokens if w not in stop_words]
    

In [18]:
# Drop stop words from each training token list, then join the remaining tokens
# back into one space-separated string per tweet.
train_limpio["text"] = train_limpio["text"].apply(remove_stopwords).apply(' '.join)
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,people receive wildfires evacuation orders cal...,1
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1


In [19]:
# Drop stop words from each test token list, then join the remaining tokens
# back into one space-separated string per tweet.
test_limpio["text"] = test_limpio["text"].apply(remove_stopwords).apply(' '.join)
# Confirm that no row lost its text value (non-null counts per column).
test_limpio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
id          3263 non-null int64
keyword     3237 non-null object
location    2158 non-null object
text        3263 non-null object
dtypes: int64(1), object(3)
memory usage: 102.0+ KB


APLICAMOS STEMMING AL TEXTO PARA REDUCIR EL TAMAÑO DE LAS PALABRAS Y OBTENER MÁS REPETICIONES DE LAS PALABRAS

In [20]:
def stemming(text):
    '''Stem every whitespace-separated word of `text` with the shared `stemmer`
    and return the stems joined by single spaces.'''
    return " ".join(map(stemmer.stem, text.split()))

In [21]:
# Reduce every word of the training text to its stem.
train_limpio["text"] = train_limpio["text"].apply(stemming)
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquak may allah forgiv us,1
1,4,,,forest fire near la rong sask canada,1
2,5,,,resid ask shelter place notifi offic evacu she...,1
3,6,,,peopl receiv wildfir evacu order california,1
4,7,,,got sent photo rubi alaska smoke wildfir pour ...,1


In [22]:
# Reduce every word of the test text to its stem.
test_limpio["text"] = test_limpio["text"].apply(stemming)
test_limpio.head()

Unnamed: 0,id,keyword,location,text
0,0,,,happen terribl car crash
1,2,,,heard earthquak differ citi stay safe everyon
2,3,,,forest fire spot pond gees flee across street ...
3,9,,,apocalyps light spokan wildfir
4,11,,,typhoon soudelor kill china taiwan


BUSCO LAS PALABRAS MÁS RARAS PARA ELIMINARLAS, DADO QUE SI TIENEN MUY POCAS REPETICIONES NO NOS SERVIRÁN PARA NUESTRO MODELO

In [23]:
# Build a long table with one row per (id, target, word) from the training text.
palabras_por_separado = (
    train_limpio
    .set_index(['id', 'target'])['text']
    .str.split(expand=True)   # one column per word position
    .stack()                  # one row per word
    .to_frame(name='word')
    .reset_index()
)
# `level_2` is the extra index level produced by stack(); it is not needed.
palabras_por_separado = palabras_por_separado.drop(columns='level_2')
palabras_por_separado.head()

Unnamed: 0,id,target,word
0,1,1,deed
1,1,1,reason
2,1,1,earthquak
3,1,1,may
4,1,1,allah


In [24]:
# Number of occurrences of each stemmed word in the training text.
top = palabras_por_separado.groupby(['word']).agg({'target':'count'})
# Words that occur fewer than 5 times are treated as rare.
top_2 = top.loc[top["target"] < 5].reset_index()
# Stored as a set: `len()` and `in` behave as with the list, but membership tests
# no longer scan the whole collection for every word of every tweet.
rare_words = set(top_2["word"])
len(rare_words)

11870

ELIMINO LAS PALABRAS MÁS RARAS, CONSIDERANDO QUE UNA PALABRA RARA ES AQUELLA QUE APARECE MENOS DE 5 VECES

In [25]:
def remove_rare_words(texto):
    '''Return `texto` without the words contained in the global `rare_words`.

    The text is split on whitespace and the kept words are re-joined with
    single spaces.
    '''
    kept = (word for word in texto.split() if word not in rare_words)
    return ' '.join(kept)

In [26]:
# Quick manual check of remove_rare_words() on a made-up sentence.
prueba = "deed hola que the como estas"
print(remove_rare_words(prueba))

hola the estas


In [27]:
# Strip the rare words from the training text.
train_limpio["text"] = train_limpio["text"].apply(remove_rare_words)
train_limpio.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,reason earthquak may allah us,1
1,4,,,forest fire near la canada,1
2,5,,,resid ask shelter place offic evacu shelter pl...,1
3,6,,,peopl wildfir evacu order california,1
4,7,,,got sent photo alaska smoke wildfir school,1


# NOTE(review): this cell carries no "In [n]" execution marker in this export, so
# it was apparently not run — confirm before relying on it.
# Split the cleaned training rows by class and report how many there are of each.
train_limpio_true = train_limpio.loc[train_limpio["target"] == 1]
train_limpio_false = train_limpio.loc[train_limpio["target"] == 0]
# The counts are integers but are formatted with " .2f" (two decimals).
print("Tenemos {cant_verdaderos: .2f} verdaderos y {cant_falsos: .2f} falsos".format(cant_verdaderos=len(train_limpio_true), cant_falsos=len(train_limpio_false)))

# Hard-coded positional ranges per class: the first slice of each class goes to
# `train`, the next to `val`, and whatever remains to `test`.
# NOTE(review): this rebinds `train` and `test`, which until here held the raw CSV frames.
train = pd.concat([train_limpio_true.iloc[:2616], train_limpio_false.iloc[:3473]])
val = pd.concat([train_limpio_true.iloc[2616:3000], train_limpio_false.iloc[3473:3850]])
test = pd.concat([train_limpio_true.iloc[3000:], train_limpio_false.iloc[3850:]])

APLICO TFIDF AL TEXTO 

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary and weights on the training text only.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(train_limpio['text'].to_list())
# toarray() yields a plain ndarray instead of the np.matrix returned by todense().
dense = vectors.toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
# One row per training tweet, one column per vocabulary term.
new_train = pd.DataFrame(dense, columns=feature_names)
new_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Columns: 2394 entries, aba to zone
dtypes: float64(2394)
memory usage: 139.0 MB


In [29]:
# Vectorise the test text with the vectorizer fitted on the training text
# (transform only, no refit), reusing the same feature names as column labels.
vectors2 = vectorizer.transform(test_limpio['text'].to_list())
dense2 = vectors2.todense()
new_test = pd.DataFrame(dense2,columns= feature_names)
new_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Columns: 2394 entries, aba to zone
dtypes: float64(2394)
memory usage: 59.6 MB


In [30]:
# Remove a TF-IDF feature column literally named 'id', if the vocabulary produced
# one — presumably so it does not clash with the real `id` column copied in from
# test_limpio afterwards (confirm). errors='ignore' keeps this cell from raising
# KeyError when the vocabulary contains no such term.
new_test = new_test.drop(columns=['id'], errors='ignore')

In [31]:
# Copy every column of train_limpio into the TF-IDF frame, one column at a time.
# NOTE(review): a TF-IDF feature whose name equals one of these column names
# (e.g. 'id' or 'text') is overwritten by this assignment — confirm that is intended.
for i in train_limpio.columns:
    new_train[i]=train_limpio[i]


In [32]:
# Copy every column of test_limpio into the TF-IDF frame, one column at a time.
# NOTE(review): a TF-IDF feature whose name equals one of these column names
# is overwritten by this assignment — confirm that is intended.
for i in test_limpio.columns:
    new_test[i]=test_limpio[i]

In [33]:
# Drop the metadata columns: the training frame keeps `target` (id is removed),
# the test frame keeps `id`.
new_train = new_train.drop(columns = ['id','location','keyword','text'])
new_test = new_test.drop(columns=['location','keyword','text'])

In [34]:
# Preview the final training frame.
new_train.head()

Unnamed: 0,aba,abandon,abc,abcnew,abl,ablaz,absolut,abstorm,abus,access,...,youth,youtub,youv,yr,yrs,yyc,zionist,zombi,zone,target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [35]:
# Preview the final test frame.
new_test.head()

Unnamed: 0,aba,abandon,abc,abcnew,abl,ablaz,absolut,abstorm,abus,access,...,youth,youtub,youv,yr,yrs,yyc,zionist,zombi,zone,id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11


In [None]:
# Write both frames to CSV files in the working directory, without the index column.
new_train.to_csv('train_limpio_con_Tf-Idf.csv', index=False)
new_test.to_csv('test_limpio_con_Tf-Idf.csv', index=False)