In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import spacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import GaussianNB

from sklearn import model_selection

import io
import string


In [2]:
# Load the labelled training set and report how the two target classes are balanced.

metrics_results = {}

training_df = pd.read_csv("train.csv")
size_of_training = training_df.shape[0]

classlabel_count = training_df["target"].value_counts()
class0_count, class1_count = classlabel_count[0], classlabel_count[1]
print('Class 0:', class0_count)
print('Class 1:', class1_count)
print('Proportion:', round(class0_count / class1_count, 2), ': 1')

Class 0: 4342
Class 1: 3271
Proportion: 1.33 : 1


In [3]:
# Load the unlabelled test set; the cell displays its number of rows.
test_df = pd.read_csv("test.csv")

test_df.shape[0]

3263

In [4]:
# Load the small English spaCy pipeline once; the text-preparation helpers
# defined below all share this instance.
nlp_lemma = spacy.load("en_core_web_sm")

# Run the text through the spaCy pipeline and return its tokens as a list
def convert_text_to_array(text):
    """Return the list of tokens that ``nlp_lemma`` produces for ``text``."""
    return list(nlp_lemma(text))

# Map every token to its lemma
def get_token_and_lemma(text):
    """Return the ``lemma_`` attribute of every item in ``text``, in order."""
    lemmas = []
    for token in text:
        lemmas.append(token.lemma_)
    return lemmas

# Drop stop words, non-alphabetic tokens and user mentions ("@...")
def remove_stop_words(tokens):
    """Return the entries of ``tokens`` that pass all three filters.

    A token is kept only when its vocabulary entry in ``nlp_lemma`` is not a
    stop word, is alphabetic, and the token does not start with "@".
    """
    kept = []
    for candidate in tokens:
        lexeme = nlp_lemma.vocab[candidate]
        if lexeme.is_stop or not lexeme.is_alpha:
            continue
        if candidate[:1] == "@":
            continue
        kept.append(candidate)
    return kept

def fit_and_make_submission_file(train_vectors, test_vectors, target, clf, filename):
    """Fit ``clf`` on the training data and write its test predictions as a submission CSV.

    Parameters
    ----------
    train_vectors : feature matrix passed to ``clf.fit``.
    test_vectors : feature matrix passed to ``clf.predict``.
    target : training labels passed to ``clf.fit``.
    clf : estimator exposing ``fit`` and ``predict``; it is fitted in place.
    filename : name of the CSV file created inside ``submission_files/``.

    The submission template is read from ``sample_submission.csv`` in the
    current working directory; its ``target`` column is overwritten with the
    predictions.
    """
    import os  # local import keeps the helper self-contained

    clf.fit(train_vectors, target)
    sample_submission = pd.read_csv("sample_submission.csv")
    sample_submission["target"] = clf.predict(test_vectors)

    # to_csv does not create missing folders, so make sure the output folder exists.
    os.makedirs("submission_files", exist_ok=True)
    sample_submission.to_csv("submission_files/" + filename, index=False)

In [5]:
# Tokenise, lemmatise and filter the text of both data sets; the resulting
# token list of every row ends up in a 'tokens' column.
for frame in (training_df, test_df):
    frame['tokens'] = (
        frame['text']
        .apply(convert_text_to_array)
        .apply(get_token_and_lemma)
        .apply(remove_stop_words)
    )


In [6]:
# Preview the first rows, including the new 'tokens' column.
training_df.head(5)

Unnamed: 0,id,keyword,location,text,target,tokens
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deed, Reason, earthquake, ALLAH, forgive]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, La, Ronge, Sask, Canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[resident, ask, shelter, place, notify, office..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[people, receive, wildfire, evacuation, order,..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, send, photo, Ruby, Alaska, smoke, wildfi..."


In [7]:
# Vectorize using a bag of words of single words (unigrams)

def dummy(doc):
    """Identity function: hand ``doc`` back unchanged.

    Passed to the vectorizers as both tokenizer and preprocessor because the
    documents are already lists of tokens.
    """
    return doc

# Registry of cross-validation results; each model cell below stores one entry.
metrics_results = {}

# Count unigrams only. The identity function is used for both the tokenizer and
# the preprocessor because every document is already a list of tokens.
vectorizer = CountVectorizer(
    ngram_range=(1, 1),
    preprocessor=dummy,
    tokenizer=dummy,
)

# Learn the vocabulary on the training tokens, then reuse it for the test tokens.
train_vectors = vectorizer.fit_transform(training_df["tokens"])
test_vectors = vectorizer.transform(test_df["tokens"])

In [8]:
# Ridge classifier on the unigram bag-of-words counts.
# Hand-recorded results (they differ slightly from the summary table further
# down, so they appear to come from an earlier run):
#   Kaggle score:              0.77903
#   Cross-validation F1:       0.5671622153365451 +/- 0.06033706267666568
#   Cross-validation F1 (max): 0.68197474

clf = RidgeClassifier()

# F1 score over 7 cross-validation folds of the training set.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=training_df["target"],
    scoring="f1",
    cv=7,
)

metrics_results["bow_1x1_ridge"] = dict(mean=scores.mean(), std=scores.std(), max=scores.max())

# Refit on the whole training set and write the test predictions.
fit_and_make_submission_file(
    train_vectors=train_vectors,
    test_vectors=test_vectors,
    target=training_df["target"],
    clf=clf,
    filename="bow_1x1_ridge.csv",
)

In [9]:
# SGD-trained linear classifier on the unigram bag-of-words counts.
# random_state pins the stochastic training so the score is reproducible from
# run to run; 40 matches the seed used for LogisticRegression in this notebook.
# NOTE(review): tol=10 is kept from the original. It looks far larger than a
# realistic per-epoch loss improvement, so the stopping criterion may trigger
# almost immediately — confirm this value is intended.
clf = SGDClassifier(max_iter=1000, tol=10, random_state=40)

# F1 score over 7 cross-validation folds of the training set.
scores = model_selection.cross_val_score(clf, train_vectors, training_df["target"], cv=7, scoring="f1")

metrics_results["bow_1x1_sgdc"] = {"mean": scores.mean(), "std": scores.std(), "max" : scores.max()}

# Refit on the whole training set and write the test predictions.
fit_and_make_submission_file(train_vectors, test_vectors, training_df["target"], clf, "bow_1x1_sgdc.csv")
print(scores.mean())

0.5804462053066946


In [10]:
# Support vector classifier on the unigram bag-of-words counts.
# Hand-recorded result (slightly different from the summary table further down,
# so it appears to come from an earlier run):
#   Cross-validation F1: 0.5930793845835153 +/- 0.056694945830674255

clf = SVC(C=10, gamma=0.01)

# F1 score over 7 cross-validation folds of the training set.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=training_df["target"],
    scoring="f1",
    cv=7,
)

metrics_results["bow_1x1_svc"] = dict(mean=scores.mean(), std=scores.std(), max=scores.max())

# Refit on the whole training set and write the test predictions.
fit_and_make_submission_file(
    train_vectors=train_vectors,
    test_vectors=test_vectors,
    target=training_df["target"],
    clf=clf,
    filename="bow_1x1_svc.csv",
)


In [11]:
# Vectorize using TF-IDF weights instead of raw counts.
# The identity function is again used as tokenizer and preprocessor because the
# documents are already token lists.

vectorizer = TfidfVectorizer(
    preprocessor=dummy,
    tokenizer=dummy,
)

# From here on train_vectors / test_vectors hold the TF-IDF features.
train_vectors = vectorizer.fit_transform(training_df["tokens"])
test_vectors = vectorizer.transform(test_df["tokens"])


In [12]:
# Ridge classifier on the TF-IDF features.
# Hand-recorded results (they differ slightly from the output printed by this
# cell, so they appear to come from an earlier run):
#   Cross-validation F1:       0.5767244438742305 +/- 0.06304273823463757
#   Cross-validation F1 (max): 0.70842825

clf = RidgeClassifier()

# F1 score over 7 cross-validation folds of the training set.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=training_df["target"],
    scoring="f1",
    cv=7,
)
print(scores)
print(" +/- ".join([str(scores.mean()), str(scores.std())]))

metrics_results["tfidf_ridge"] = dict(mean=scores.mean(), std=scores.std(), max=scores.max())

# Refit on the whole training set and write the test predictions.
fit_and_make_submission_file(
    train_vectors=train_vectors,
    test_vectors=test_vectors,
    target=training_df["target"],
    clf=clf,
    filename="tfidf_ridge.csv",
)

[0.63878327 0.53802497 0.53488372 0.55529954 0.54681648 0.54100367
 0.69634703]
0.5787369550029594 +/- 0.058566557673596066


In [13]:
# SGD-trained linear classifier on the TF-IDF features.
# random_state pins the stochastic training so the score is reproducible from
# run to run; 40 matches the seed used for LogisticRegression in this notebook.
# NOTE(review): tol=10 is kept from the original. It looks far larger than a
# realistic per-epoch loss improvement, so the stopping criterion may trigger
# almost immediately — confirm this value is intended.
clf = SGDClassifier(max_iter=1000, tol=10, random_state=40)

# F1 score over 7 cross-validation folds of the training set.
scores = model_selection.cross_val_score(clf, train_vectors, training_df["target"], cv=7, scoring="f1")

metrics_results["tfidf_sgdc"] = {"mean": scores.mean(), "std": scores.std(), "max" : scores.max()}

# Refit on the whole training set and write the test predictions.
fit_and_make_submission_file(train_vectors, test_vectors, training_df["target"], clf, "tfidf_sgdc.csv")
print(scores.mean())

0.5907146866067939


In [14]:
# Support vector classifier on the TF-IDF features.
clf = SVC(C=10, gamma=0.01)

# F1 score over 7 cross-validation folds of the training set.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=training_df["target"],
    scoring="f1",
    cv=7,
)

metrics_results["tfidf_svc"] = dict(mean=scores.mean(), std=scores.std(), max=scores.max())

# Refit on the whole training set and write the test predictions.
fit_and_make_submission_file(
    train_vectors=train_vectors,
    test_vectors=test_vectors,
    target=training_df["target"],
    clf=clf,
    filename="tfidf_svc.csv",
)

In [15]:

# Class-weighted logistic regression on the TF-IDF features.
clf = LogisticRegression(
    C=30.0,
    class_weight='balanced',
    solver='newton-cg',
    multi_class='multinomial',
    n_jobs=-1,
    random_state=40,
)

# F1 score over 7 cross-validation folds of the training set.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=training_df["target"],
    scoring="f1",
    cv=7,
)

metrics_results["tfidf_logistic"] = dict(mean=scores.mean(), std=scores.std(), max=scores.max())

# Refit on the whole training set and write the test predictions.
fit_and_make_submission_file(
    train_vectors=train_vectors,
    test_vectors=test_vectors,
    target=training_df["target"],
    clf=clf,
    filename="tfidf_logistic.csv",
)
print(scores.mean())

0.5934339539170913


In [16]:

# Linear support vector classifier (default settings) on the TF-IDF features.
clf = LinearSVC()

# F1 score over 7 cross-validation folds of the training set.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=training_df["target"],
    scoring="f1",
    cv=7,
)

metrics_results["tfidf_linearsvc"] = dict(mean=scores.mean(), std=scores.std(), max=scores.max())

# Refit on the whole training set and write the test predictions.
fit_and_make_submission_file(
    train_vectors=train_vectors,
    test_vectors=test_vectors,
    target=training_df["target"],
    clf=clf,
    filename="tfidf_linearsvc.csv",
)

In [17]:
# Bag of words again, this time counting unigrams and bigrams (ngram_range 1 to 2).
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    preprocessor=dummy,
    tokenizer=dummy,
)

# From here on train_vectors / test_vectors hold the 1-2-gram counts.
train_vectors = vectorizer.fit_transform(training_df["tokens"])
test_vectors = vectorizer.transform(test_df["tokens"])

In [18]:

# Ridge classifier on the 1-2-gram bag-of-words counts.
clf = RidgeClassifier()

# F1 score over 7 cross-validation folds of the training set.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=training_df["target"],
    scoring="f1",
    cv=7,
)

metrics_results["bow_1x2_ridge"] = dict(mean=scores.mean(), std=scores.std(), max=scores.max())

# Refit on the whole training set and write the test predictions.
fit_and_make_submission_file(
    train_vectors=train_vectors,
    test_vectors=test_vectors,
    target=training_df["target"],
    clf=clf,
    filename="bow_1x2_ridge.csv",
)

In [19]:

# Class-weighted logistic regression on the 1-2-gram bag-of-words counts.
clf = LogisticRegression(
    C=30.0,
    class_weight='balanced',
    solver='newton-cg',
    multi_class='multinomial',
    n_jobs=-1,
    random_state=40,
)

# F1 score over 7 cross-validation folds of the training set.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=training_df["target"],
    scoring="f1",
    cv=7,
)

metrics_results["bow_1x2_logistic"] = dict(mean=scores.mean(), std=scores.std(), max=scores.max())

# Refit on the whole training set and write the test predictions.
fit_and_make_submission_file(
    train_vectors=train_vectors,
    test_vectors=test_vectors,
    target=training_df["target"],
    clf=clf,
    filename="bow_1x2_logistic.csv",
)

In [20]:

# Linear support vector classifier (default settings) on the 1-2-gram counts.
clf = LinearSVC()

# F1 score over 7 cross-validation folds of the training set.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=training_df["target"],
    scoring="f1",
    cv=7,
)

metrics_results["bow_1x2_linearsvc"] = dict(mean=scores.mean(), std=scores.std(), max=scores.max())

# Refit on the whole training set and write the test predictions.
fit_and_make_submission_file(
    train_vectors=train_vectors,
    test_vectors=test_vectors,
    target=training_df["target"],
    clf=clf,
    filename="bow_1x2_linearsvc.csv",
)

In [21]:
# Nu-support vector classifier (default settings) on the 1-2-gram counts.
clf = NuSVC()

# F1 score over 7 cross-validation folds of the training set.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=training_df["target"],
    scoring="f1",
    cv=7,
)

metrics_results["bow_1x2_nusvc"] = dict(mean=scores.mean(), std=scores.std(), max=scores.max())

# Refit on the whole training set and write the test predictions.
fit_and_make_submission_file(
    train_vectors=train_vectors,
    test_vectors=test_vectors,
    target=training_df["target"],
    clf=clf,
    filename="bow_1x2_nusvc.csv",
)

In [22]:

# SGD-trained linear classifier on the 1-2-gram bag-of-words counts.
# random_state pins the stochastic training so the score is reproducible from
# run to run; 40 matches the seed used for LogisticRegression in this notebook.
# NOTE(review): tol=1000000 is kept from the original. It looks far larger than
# any realistic per-epoch loss improvement (and differs from the tol=10 used
# for the other SGD models) — confirm this value is intended.
clf = SGDClassifier(max_iter=1000, tol=1000000, random_state=40)

# F1 score over 7 cross-validation folds of the training set.
scores = model_selection.cross_val_score(clf, train_vectors, training_df["target"], cv=7, scoring="f1")

metrics_results["bow_1x2_sgdc"] = {"mean": scores.mean(), "std": scores.std(), "max" : scores.max()}

# Refit on the whole training set and write the test predictions.
fit_and_make_submission_file(train_vectors, test_vectors, training_df["target"], clf, "bow_1x2_sgdc.csv")


In [23]:
# One row per evaluated model, ranked by mean cross-validated F1 (best first).
metrics_table = pd.DataFrame.from_records(metrics_results).T
metrics_table.sort_values(by="mean", ascending=False)

Unnamed: 0,mean,std,max
tfidf_logistic,0.593434,0.05036,0.687169
bow_1x1_svc,0.592198,0.058246,0.709898
tfidf_sgdc,0.590715,0.055751,0.701604
tfidf_linearsvc,0.585003,0.055227,0.698554
bow_1x2_logistic,0.582876,0.058849,0.695556
bow_1x1_sgdc,0.580446,0.056624,0.693092
tfidf_ridge,0.578737,0.058567,0.696347
bow_1x2_sgdc,0.576534,0.07112,0.69863
bow_1x2_linearsvc,0.566132,0.057814,0.668224
bow_1x1_ridge,0.565406,0.061901,0.665148


In [24]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

import gensim.downloader as api

# Fetch (or load from the local gensim cache) Google's pre-trained Word2Vec vectors.
word2vec = api.load("word2vec-google-news-300")

# Collect every training token and the token count of every text in one pass.
all_words = []
sentence_lengths = []
for tokens in training_df["tokens"]:
    all_words.extend(tokens)
    sentence_lengths.append(len(tokens))

# Sorted list of the distinct training tokens.
VOCAB = sorted(set(all_words))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
print("Max sentence length is %s" % max(sentence_lengths))

59230 words total, with a vocabulary size of 14387
Max sentence length is 21


In [25]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 35
# NOTE(review): VOCAB was built from the spaCy-processed tokens, whereas the
# Keras tokenizer below is fitted on the raw text and finds more distinct words
# (see the printed count). Confirm that capping num_words at this size is intended.
VOCAB_SIZE = len(VOCAB)

VALIDATION_SPLIT=.2

# Dedicated, seeded generator so that the shuffle (and therefore the
# train/validation split) and the random vectors for out-of-vocabulary words
# are identical on every run.
rng = np.random.RandomState(40)

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(training_df["text"].tolist())
sequences = tokenizer.texts_to_sequences(training_df["text"].tolist())

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Fixed-length integer sequences and one-hot encoded labels.
cnn_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(training_df["target"]))

# Shuffle data and labels with the same permutation before the split.
indices = np.arange(cnn_data.shape[0])
rng.shuffle(indices)
cnn_data = cnn_data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * cnn_data.shape[0])

# Row i holds the vector of the word with tokenizer index i; row 0 stays zero
# because word_index never assigns index 0 in this loop.
embedding_weights = np.zeros((len(word_index)+1, EMBEDDING_DIM))
for word,index in word_index.items():
    # Words missing from word2vec get a (seeded) random vector instead.
    embedding_weights[index,:] = word2vec[word] if word in word2vec else rng.rand(EMBEDDING_DIM)
print(embedding_weights.shape)

Found 22700 unique tokens.
(22701, 300)


In [26]:
from keras.layers import Dense, Input, Flatten, Dropout, Concatenate
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import LSTM, Bidirectional
from keras.models import Model

def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index, trainable=False, extra_conv=True):
    """Build and compile a 1D-convolutional text classifier on top of a pre-initialised embedding.

    Parameters
    ----------
    embeddings : weight matrix used to initialise the embedding layer
        (passed as ``weights=[embeddings]``).
    max_sequence_length : length of the integer sequences fed to the model.
    num_words : first argument of the ``Embedding`` layer (number of embedding rows).
    embedding_dim : size of each embedding vector.
    labels_index : number of units of the final softmax layer.
    trainable : whether the embedding weights are updated during training.
    extra_conv : if true (default), use three parallel convolutions with kernel
        sizes 3, 4 and 5 whose pooled outputs are concatenated; otherwise use a
        single convolution with kernel size 3.

    Returns
    -------
    A Keras ``Model`` compiled with categorical cross-entropy loss, the Adam
    optimizer and the accuracy metric.
    """
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=trainable)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # Only the layers of the selected branch are created.
    if extra_conv:
        # Parallel convolutions of several kernel sizes, in the spirit of the
        # Yoon Kim model (https://arxiv.org/abs/1408.5882). Note that the
        # pooling here uses a window of 3 rather than global max pooling.
        convs = []
        filter_sizes = [3,4,5]

        for filter_size in filter_sizes:
            l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
            l_pool = MaxPooling1D(pool_size=3)(l_conv)
            convs.append(l_pool)

        # Join the pooled outputs of the three convolutions along axis 1.
        features = Concatenate(axis=1)(convs)
    else:
        # Simpler alternative: a single convolution (kernel size 3) followed by
        # max pooling with a window of 3.
        conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
        features = MaxPooling1D(pool_size=3)(conv)

    x = Dropout(0.5)(features)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)

    preds = Dense(labels_index, activation='softmax')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])

    return model

In [27]:
# The arrays were shuffled when they were built, so the last
# num_validation_samples rows serve as the validation set and the rest is used
# for training.
x_train, x_val = cnn_data[:-num_validation_samples], cnn_data[-num_validation_samples:]
y_train, y_val = labels[:-num_validation_samples], labels[-num_validation_samples:]

# Embedding weights stay frozen (trainable=False); one output unit per distinct target value.
model = ConvNet(
    embeddings=embedding_weights,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    num_words=len(word_index) + 1,
    embedding_dim=EMBEDDING_DIM,
    labels_index=len(training_df["target"].unique()),
    trainable=False,
)

In [28]:
# Train for 3 epochs in batches of 128, evaluating on the held-out validation rows after each epoch.
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=3,
    validation_data=(x_val, y_val),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x252b05d50>

In [29]:
# Use test-specific names here: rebinding `sequences` / `cnn_data` to the test
# data would silently replace the training arrays, so re-running the
# train/validation split cell afterwards would slice the test set instead.
test_sequences = tokenizer.texts_to_sequences(test_df["text"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One pair of softmax outputs (class 0, class 1) per test text.
results = model.predict(test_cnn_data).tolist()

sample_submission = pd.read_csv("sample_submission.csv")
# Predict class 1 unless the class-0 output is strictly larger.
sample_submission["target"] = [0 if score_0 > score_1 else 1 for score_0, score_1 in results]
sample_submission.to_csv("submission_files/cnn_results.csv", index=False)
