In [1]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [42]:
from tensorflow import keras

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout

In [3]:
from sklearn import metrics


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
lemmatizer = WordNetLemmatizer()

In [6]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [7]:
sample_submission = pd.read_csv("sample_submission.csv")

# Limpeza dos dados de entrada

In [8]:
sample_tweets_train=train_df['text']
sample_tweets_test=test_df['text']

In [9]:
def limpeza(tweets):
    tweets = tweets.str.replace(r'@[a-zA-Z0-9_!.]{0,25}','')
    tweets = tweets.str.replace('(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/(?:[-\w.]|(?:%[\da-fA-F]{2}))+','')
    tweets = tweets.str.replace(r'[0-9]','')
    tweets = tweets.replace("\\x89Û_",'')
    tweets = tweets.replace('\x89ÛÓ','')
    tweets = tweets.replace('\x89ÛÓ','')
    tweets = tweets.replace('\x89ÛÒ','')
    tweets = tweets.replace('\x89Û','')
    tweets = tweets.replace('\x89Û÷','')
    tweets = tweets.replace('\x89ûï','')
    #retirada de símbolos não alfanuméticos 
    tweets = tweets.str.replace(r"[#,.;:_?!()\[\]]","")
    tweets = tweets.str.lower()
    return tweets

In [10]:
sample_tweets_train = limpeza(sample_tweets_train)
sample_tweets_test = limpeza(sample_tweets_test)

  tweets = tweets.str.replace(r'@[a-zA-Z0-9_!.]{0,25}','')
  tweets = tweets.str.replace('(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/(?:[-\w.]|(?:%[\da-fA-F]{2}))+','')
  tweets = tweets.str.replace(r'[0-9]','')
  tweets = tweets.str.replace(r"[#,.;:_?!()\[\]]","")


In [11]:
stop_words = set(stopwords.words("english"))
def lematizar(tweets):
    sample_tweets= []
    for tweet in tweets:
        filtered_list = []
        words_lemmatized = []
        words = []
        words = word_tokenize(tweet) 
        words_lemmatized = [lemmatizer.lemmatize(word) for word in words] 
        for word in words_lemmatized:
            if word not in stop_words:
                filtered_list.append(word)
        sample_tweets.append(filtered_list)
    return sample_tweets 

In [12]:
sample_tweets_train = lematizar(sample_tweets_train)
sample_tweets_test = lematizar(sample_tweets_test)

In [13]:
train_df['words']= sample_tweets_train
test_df['words']= sample_tweets_test
test_df = test_df.drop(columns=['location', 'keyword','id','text'])
train_df = train_df.drop(columns=['location', 'keyword','id','text'])
train_df['text'] = train_df['words'].apply(lambda x: ' '.join([str(elem) for elem in x]))
test_df['text'] = test_df['words'].apply(lambda x: ' '.join([str(elem) for elem in x]))

In [14]:
train_df.head()

Unnamed: 0,target,words,text
0,1,"[deed, reason, earthquake, may, allah, forgive...",deed reason earthquake may allah forgive u
1,1,"[forest, fire, near, la, ronge, sask, canada]",forest fire near la ronge sask canada
2,1,"[resident, asked, 'shelter, place, ', notified...",resident asked 'shelter place ' notified offic...
3,1,"[people, receive, wildfire, evacuation, order,...",people receive wildfire evacuation order calif...
4,1,"[got, sent, photo, ruby, alaska, smoke, wildfi...",got sent photo ruby alaska smoke wildfire pour...


In [15]:
test_df.head()

Unnamed: 0,words,text
0,"[happened, terrible, car, crash]",happened terrible car crash
1,"[heard, earthquake, different, city, stay, saf...",heard earthquake different city stay safe ever...
2,"[forest, fire, spot, pond, goose, fleeing, acr...",forest fire spot pond goose fleeing across str...
3,"[apocalypse, lighting, spokane, wildfire]",apocalypse lighting spokane wildfire
4,"[typhoon, soudelor, kill, china, taiwan]",typhoon soudelor kill china taiwan


# Tokenizar: Pad_sequence()

In [16]:
tfidf_vectorizer = TfidfVectorizer()
count_treino_tfidf = tfidf_vectorizer.fit_transform(train_df['text'])
count_teste_tfidf = tfidf_vectorizer.transform(test_df['text'])

In [28]:
max_features=5000
tokenizer_train=Tokenizer(num_words=max_features,split=' ')
tokenizer_test=Tokenizer(num_words=max_features,split=' ')
tokenizer_train.fit_on_texts(train_df['text'])
tokenizer_test.fit_on_texts(test_df['text'])
X = tokenizer_train.texts_to_sequences(train_df['text'])
X_test = tokenizer_test.texts_to_sequences(test_df['text'])
X_test = pad_sequences(X_test)
X = pad_sequences(X)

In [29]:
y = train_df['target']

In [38]:
X.shape


(7613, 22)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state =41)

# LSTM

In [31]:
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

In [40]:
embed_dim = 100
lstm_out = 100
model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(Dropout(0.2))
model.add(LSTM(lstm_out, dropout=0.2, return_sequences=True,recurrent_dropout=0.4))
model.add(Dropout(0.2))
model.add(LSTM(lstm_out,dropout=0.2, recurrent_dropout=0.2))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))
adam = optimizers.Adam(learning_rate=2e-3)
model.compile(loss = 'binary_crossentropy', optimizer=adam ,metrics = ['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 22, 100)           500000    
                                                                 
 dropout_4 (Dropout)         (None, 22, 100)           0         
                                                                 
 lstm_4 (LSTM)               (None, 22, 100)           80400     
                                                                 
 dropout_5 (Dropout)         (None, 22, 100)           0         
                                                                 
 lstm_5 (LSTM)               (None, 100)               80400     
                                                                 
 dropout_6 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                

In [43]:
es_callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [44]:
model.fit(X,y, epochs = 10,validation_split = 0.2 ,callbacks=[es_callback], batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x165857f21c0>

In [45]:
y_pred = model.predict(X_test).round()



# Avaliação

In [47]:
train_accuracy = round(metrics.accuracy_score(y_train,model.predict(X_train).round())*100)
train_accuracy



90

In [48]:
print('Accuracy  is  : ', (metrics.accuracy_score(y_test, y_pred)))
print('Recall  is    : ', (metrics.recall_score(y_test, y_pred)))
print('Precision  is : ', (metrics.precision_score(y_test, y_pred)))

Accuracy  is  :  0.9093893630991464
Recall  is    :  0.8628048780487805
Precision  is :  0.9218241042345277


# Submissão 

In [27]:
#sample_submission.to_csv("submission.csv", index=False)

# Notas:

Agora que eu tenho um modelo funcional de LSTM, tenho que modificar e entender melhor algumas coisas.

Entender:
- Como achar o tamanho certo de cada vetor de entrada e saída.
- Como saber o tamanho do vetor
- Como saber a quantidade ideal de camadas de LSTM
- Dropout

Coisas para o Vinicios explicar:
- Optimizers
- Camadas de uma rede neural (Dropout, Dense, input, output, camada oculta)
- Treinamento
- Gradiente...