In [1]:
# General-purpose library imports used throughout the notebook.
import pandas as pd
import numpy as np
import random
import re
import nltk
import operator

pd.options.display.float_format = '{:20,.2f}'.format # suppress scientific notation in displayed outputs

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the train and test tables from CSV files located under the current
# user's home directory (`~`).
train_data = pd.read_csv('~/Documents/Datos/DataSets/TP2/train_super_featured.csv')
test_data = pd.read_csv('~/Documents/Datos/DataSets/TP2/test_super_featured.csv')


In [3]:
# Replace missing keyword/location values with explicit sentinel strings.
# The filled column is assigned back to the frame instead of calling
# `fillna(inplace=True)` on a column selection: with pandas Copy-on-Write the
# in-place call only modifies a temporary object and leaves the frame untouched.
train_data['keyword_original'] = train_data['keyword_original'].fillna('no_keyword')
test_data['keyword_original'] = test_data['keyword_original'].fillna('no_keyword')
train_data['location_original'] = train_data['location_original'].fillna('no_location')
test_data['location_original'] = test_data['location_original'].fillna('no_location')

In [4]:
# Every text variant gets '' in place of missing values so that later string
# operations (split, tokenisation) never receive NaN.
# The filled columns are assigned back to the frame instead of calling
# `fillna(inplace=True)` on a column selection: with pandas Copy-on-Write the
# in-place call only modifies a temporary object and leaves the frame untouched.
TEXT_COLUMNS = [
    'text_original',
    'clean_text',
    'super_clean_text',
    'kaggle_text',
    'semi_cleaned_text',
]
train_data[TEXT_COLUMNS] = train_data[TEXT_COLUMNS].fillna('')
test_data[TEXT_COLUMNS] = test_data[TEXT_COLUMNS].fillna('')

In [5]:
import gensim, logging

# Configure the root logger: INFO level, each record prefixed with timestamp and level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import time

from gensim.models.word2vec import Word2Vec
from glob import glob

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Reshape, Flatten, Dropout, Concatenate
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.constraints import max_norm

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

In [8]:
# Build the word -> integer index from the training set's 'super_clean_text'
# column only (the test set is not used to fit the vocabulary).
tokenizer = Tokenizer()            
tokenizer.fit_on_texts(train_data['super_clean_text'].values.tolist())
# Bare expression: shows the tokenizer's repr as the cell output.
tokenizer

<keras_preprocessing.text.Tokenizer at 0x12cdbae48>

In [9]:
def load_glove_embeddings(path, embedding_dim=300):
    """Parse a GloVe text file into a ``{token: vector}`` dictionary.

    Each line is expected to hold a token followed by ``embedding_dim``
    space-separated floats.

    Parameters
    ----------
    path : str
        Location of the GloVe ``.txt`` file.
    embedding_dim : int, default 300
        Number of float components stored on every line.

    Returns
    -------
    dict
        Maps each token (str) to a ``float32`` numpy vector.
    """
    embeddings = {}
    with open(path, encoding='UTF-8') as handle:
        for line in handle:
            line = line.rstrip('\n')
            if not line:
                continue  # ignore blank lines (e.g. a trailing newline at EOF)
            # Split from the right so that a token which itself contains
            # spaces stays in one piece instead of leaking into the vector.
            fields = line.rsplit(' ', embedding_dim)
            embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return embeddings


glove_embeddings = load_glove_embeddings('../../../DataSets/TP2/glove.840B.300d.txt')

In [10]:
def build_vocab(X):
    """Count how often each whitespace-separated token occurs in a text Series.

    Parameters
    ----------
    X : pandas.Series
        Series of strings; every entry is tokenised with ``str.split()``.

    Returns
    -------
    dict
        Maps each token to its total number of occurrences across all
        entries, in order of first appearance.
    """
    counts = {}
    for tokens in X.apply(lambda text: text.split()):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts


def check_embeddings_coverage(X, embeddings):
    """Measure how much of a text column is covered by an embedding lookup.

    Parameters
    ----------
    X : pandas.Series
        Series of strings; tokenised by ``build_vocab``.
    embeddings : mapping
        Token -> vector lookup that raises ``KeyError`` for unknown tokens.

    Returns
    -------
    sorted_oov : list of (token, count)
        Out-of-vocabulary tokens, most frequent first.
    vocab_coverage : float
        Fraction of distinct tokens found in ``embeddings``.
    text_coverage : float
        Fraction of token occurrences found in ``embeddings``.

    An empty vocabulary yields ``([], 0.0, 0.0)`` instead of a division by zero.
    """
    vocab = build_vocab(X)
    if not vocab:
        return [], 0.0, 0.0

    oov = {}
    covered_words = 0
    n_covered = 0
    n_oov = 0

    for word, count in vocab.items():
        try:
            embeddings[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error
            # (e.g. KeyboardInterrupt) must propagate.
            oov[word] = count
            n_oov += count
        else:
            covered_words += 1
            n_covered += count

    vocab_coverage = covered_words / len(vocab)
    text_coverage = n_covered / (n_covered + n_oov)

    # Ascending sort followed by a reversal (rather than reverse=True) keeps
    # the original ordering among tokens with equal counts.
    sorted_oov = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    return sorted_oov, vocab_coverage, text_coverage

In [11]:
# GloVe coverage of the 'text_original' column, computed separately for train and test.
# The result variables are plain globals and hold the figures for this column only.
train_glove_oov, train_glove_vocab_coverage, train_glove_text_coverage = check_embeddings_coverage(train_data['text_original'], glove_embeddings)
test_glove_oov, test_glove_vocab_coverage, test_glove_text_coverage = check_embeddings_coverage(test_data['text_original'], glove_embeddings)
print('GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Training Set text_original'.format(train_glove_vocab_coverage, train_glove_text_coverage))
print('GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Test Set text_original'.format(test_glove_vocab_coverage, test_glove_text_coverage))

GloVe Embeddings cover 51.90% of vocabulary and 81.71% of text in Training Set text_original
GloVe Embeddings cover 57.12% of vocabulary and 81.00% of text in Test Set text_original


In [12]:
# GloVe coverage of the 'clean_text' column, computed separately for train and test.
# Reassigns the same result variables, so they now describe this column.
train_glove_oov, train_glove_vocab_coverage, train_glove_text_coverage = check_embeddings_coverage(train_data['clean_text'], glove_embeddings)
test_glove_oov, test_glove_vocab_coverage, test_glove_text_coverage = check_embeddings_coverage(test_data['clean_text'], glove_embeddings)
print('GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Training Set clean_text'.format(train_glove_vocab_coverage, train_glove_text_coverage))
print('GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Test Set clean_text'.format(test_glove_vocab_coverage, test_glove_text_coverage))

GloVe Embeddings cover 85.76% of vocabulary and 95.89% of text in Training Set clean_text
GloVe Embeddings cover 88.06% of vocabulary and 95.66% of text in Test Set clean_text


In [13]:
# GloVe coverage of the 'kaggle_text' column, computed separately for train and test.
# Reassigns the same result variables, so they now describe this column.
train_glove_oov, train_glove_vocab_coverage, train_glove_text_coverage = check_embeddings_coverage(train_data['kaggle_text'], glove_embeddings)
test_glove_oov, test_glove_vocab_coverage, test_glove_text_coverage = check_embeddings_coverage(test_data['kaggle_text'], glove_embeddings)
print('GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Training Set kaggle_text'.format(train_glove_vocab_coverage, train_glove_text_coverage))
print('GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Test Set kaggle_text'.format(test_glove_vocab_coverage, test_glove_text_coverage))

GloVe Embeddings cover 67.95% of vocabulary and 94.84% of text in Training Set kaggle_text
GloVe Embeddings cover 75.66% of vocabulary and 94.99% of text in Test Set kaggle_text


In [14]:
# GloVe coverage of the 'super_clean_text' column (the column the tokenizer was fitted on),
# computed separately for train and test.
# Reassigns the same result variables, so they now describe this column.
train_glove_oov, train_glove_vocab_coverage, train_glove_text_coverage = check_embeddings_coverage(train_data['super_clean_text'], glove_embeddings)
test_glove_oov, test_glove_vocab_coverage, test_glove_text_coverage = check_embeddings_coverage(test_data['super_clean_text'], glove_embeddings)
print('GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Training Set super_clean_text'.format(train_glove_vocab_coverage, train_glove_text_coverage))
print('GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Test Set super_clean_text'.format(test_glove_vocab_coverage, test_glove_text_coverage))

GloVe Embeddings cover 86.11% of vocabulary and 96.09% of text in Training Set super_clean_text
GloVe Embeddings cover 89.24% of vocabulary and 96.12% of text in Test Set super_clean_text


In [15]:
# One embedding row per tokenizer index; the extra row accounts for index 0,
# which the tokenizer's word_index never assigns to a word.
num_words = len(tokenizer.word_index) + 1
embedding_size = 300

# Rows start from a random uniform draw in [-1, 1); words that have a GloVe
# vector are overwritten below, the rest keep the random initialisation.
# NOTE(review): the draw is unseeded, so the matrix differs between runs.
embedding_matrix = np.random.uniform(-1, 1, (num_words, embedding_size))
for word, i in tokenizer.word_index.items():
    # Every index in word_index is < num_words by construction, so no bounds
    # check is needed here.
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [16]:
# Cross-validation setup: K shuffled, stratified folds with a fixed seed so the
# fold assignment is the same on every run.
K = 2
SEED = 1337
skf = StratifiedKFold(n_splits=K, random_state=SEED, shuffle=True)

In [49]:
class TF_Glove:
    """Ensemble of small 1-D CNN text classifiers on top of a pre-trained embedding matrix.

    ``train`` fits one Keras model per cross-validation fold and stores it in
    ``self.trained_model``; ``predict`` averages the class probabilities of
    every stored model.

    Parameters
    ----------
    epochs : int
        Maximum number of epochs per fold (early stopping may end sooner).
    batch_size : int
        Mini-batch size passed to ``model.fit``.
    num_filters : int
        Number of filters in each ``Conv1D`` layer.
    tokenizer
        Fitted tokenizer exposing ``texts_to_sequences`` and ``word_index``.
    embedding_matrix
        Initial weights for the embedding layer; expected shape is
        ``(len(tokenizer.word_index) + 1, embedding_size)``.
    embedding_size : int
        Dimensionality of each embedding vector.
    """

    # Number of output classes; shared by the softmax layer and the
    # prediction buffer so the two cannot drift apart.
    NUM_CLASSES = 2

    def __init__(self, epochs, batch_size, num_filters, tokenizer, embedding_matrix, embedding_size):
        self.epochs = epochs
        self.batch_size = batch_size
        self.num_filters = num_filters
        self.tokenizer = tokenizer
        self.embedding_matrix = embedding_matrix
        self.embedding_size = embedding_size
        self.trained_model = []  # one fitted Keras model per fold

    def encode(self, train_text, test_text):
        """Tokenise and pad two text collections to a common length.

        The padded length is ``int(mean + 2 * std)`` of the token counts of
        both collections together and is stored in ``self.max_tokens``.

        Returns
        -------
        tuple
            ``(x_train_pad, x_test_pad)`` as returned by ``pad_sequences``.
        """
        # Use the instance's tokenizer, not whatever global happens to be
        # called `tokenizer` in the notebook.
        x_train_tokens = self.tokenizer.texts_to_sequences(train_text)
        x_test_tokens = self.tokenizer.texts_to_sequences(test_text)
        num_tokens = np.array([len(tokens) for tokens in x_train_tokens + x_test_tokens])
        self.max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
        x_train_pad = pad_sequences(x_train_tokens, maxlen=self.max_tokens)
        x_test_pad = pad_sequences(x_test_tokens, maxlen=self.max_tokens)
        return x_train_pad, x_test_pad

    def encode_fit(self, text):
        """Tokenise and pad ``text`` to the length fixed by the last ``encode`` call."""
        x_final_test_tokens = self.tokenizer.texts_to_sequences(text)
        x_final_test_pad = pad_sequences(x_final_test_tokens, maxlen=self.max_tokens)
        return x_final_test_pad

    def build_model(self):
        """Build and compile the CNN; requires ``self.max_tokens`` (set by ``encode``).

        Returns
        -------
        A compiled Keras ``Sequential`` model.
        """
        sequence_length = self.max_tokens
        # Derived from the instance's own tokenizer/embedding size instead of
        # the notebook globals `num_words` / `embedding_size`.
        vocabulary_size = len(self.tokenizer.word_index) + 1
        embedding_dim = self.embedding_size
        drop = 0.7
        weight_decay = 1e-2

        model = Sequential()
        model.add(Embedding(input_dim=vocabulary_size,
                            output_dim=embedding_dim,
                            weights=[self.embedding_matrix],
                            input_length=sequence_length,
                            trainable=True,  # embeddings are updated during training
                            name='embedding_layer'))
        model.add(Conv1D(self.num_filters, 7, activation='sigmoid', padding='same', kernel_constraint=max_norm(3), bias_constraint=max_norm(3)))
        model.add(MaxPooling1D(2))
        model.add(Conv1D(self.num_filters, 7, activation='sigmoid', padding='same', kernel_constraint=max_norm(3), bias_constraint=max_norm(3)))
        model.add(GlobalMaxPooling1D())
        model.add(Dropout(drop))
        model.add(Dense(8, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
        # Softmax over mutually exclusive classes; the integer labels are
        # consumed directly by sparse categorical cross-entropy.
        model.add(Dense(self.NUM_CLASSES, activation='softmax'))
        # NOTE(review): `lr` and `decay` are legacy argument names; confirm
        # they are still accepted by the installed Keras version.
        adam = Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0005)
        model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
        model.summary()
        return model

    def train(self, X, splitter=None):
        """Fit one model per cross-validation fold and append it to ``self.trained_model``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``'super_clean_text'`` (model input),
            ``'keyword_original'`` (used to stratify the folds) and
            ``'target_relabeled'`` (integer class labels).
        splitter : optional
            Object with a scikit-learn style ``split(X, y)`` method. Defaults
            to the module-level ``skf``.
        """
        if splitter is None:
            splitter = skf

        texts = X['super_clean_text'].str.lower()
        labels = X['target_relabeled']

        for fold, (trn_idx, val_idx) in enumerate(splitter.split(X['super_clean_text'], X['keyword_original'])):

            print('\nFold {}\n'.format(fold))

            # split() yields positional indices, so rows are selected with
            # .iloc; .loc would only be correct for a default RangeIndex.
            X_trn_encoded, X_val_encoded = self.encode(texts.iloc[trn_idx], texts.iloc[val_idx])
            y_trn = labels.iloc[trn_idx]
            y_val = labels.iloc[val_idx]

            model = self.build_model()

            # Stop once the validation loss has not improved by at least
            # 0.001 for 8 consecutive epochs.
            early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=8, verbose=1)
            callbacks_list = [early_stopping]

            model.fit(X_trn_encoded, y_trn, validation_data=(X_val_encoded, y_val), batch_size=self.batch_size, epochs=self.epochs, callbacks=callbacks_list, verbose=2)

            self.trained_model.append(model)

    def predict(self, X):
        """Return the mean class probabilities of all trained fold models.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the column ``'super_clean_text'``.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(X), NUM_CLASSES)``.

        Raises
        ------
        RuntimeError
            If no model has been trained yet.
        """
        if not self.trained_model:
            raise RuntimeError('predict() called before train(): no fitted models available')

        X_test_encoded = self.encode_fit(X['super_clean_text'].str.lower())
        y_pred = np.zeros((X_test_encoded.shape[0], self.NUM_CLASSES))

        for model in self.trained_model:
            y_pred += model.predict(X_test_encoded) / len(self.trained_model)

        return y_pred


        

In [52]:
# Up to 20 epochs per fold, mini-batches of 128 samples, 32 filters per Conv1D layer.
model = TF_Glove(
    epochs=20,
    batch_size=128,
    num_filters=32,
    tokenizer=tokenizer,
    embedding_matrix=embedding_matrix,
    embedding_size=embedding_size,
)

In [53]:
# train() returns nothing (the fitted models are stored on the instance), so
# its result is not bound to a name.
model.train(train_data)


Fold 0

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 16, 300)           4304100   
_________________________________________________________________
conv1d_52 (Conv1D)           (None, 16, 32)            67232     
_________________________________________________________________
max_pooling1d_26 (MaxPooling (None, 8, 32)             0         
_________________________________________________________________
conv1d_53 (Conv1D)           (None, 8, 32)             7200      
_________________________________________________________________
global_max_pooling1d_26 (Glo (None, 32)                0         
_________________________________________________________________
dropout_26 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_52 (Dense)             (None, 8)      

In [54]:
# Averaged per-class probabilities from the fold models, one row per test sample.
class_probabilities = model.predict(test_data)
# Most probable class index for each row.
predicted = np.argmax(class_probabilities, axis=1)

In [57]:
# Load the test set that carries a 'target' column; 'id' and 'target' are read
# with compact integer dtypes (int16 / int8).
df_liked = pd.read_csv('~/Documents/Datos/DataSets/TP2/test_with_targets.csv', dtype={'id': np.int16, 'target': np.int8})
df_liked.head()

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",1
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1


In [58]:
# `predicted` already holds integer class indices (argmax output), so the
# rounding below leaves the values unchanged.
y_pred = np.round(predicted).astype('int')
# Fraction of predictions equal to the labelled targets.
# NOTE(review): the comparison is element-by-element, which assumes the rows of
# df_liked are in the same order as the rows of test_data — confirm.
np.mean(y_pred.flatten() == df_liked.target)

0.8044744100520993

In [None]:
# `df_test` is not defined anywhere earlier in this notebook, so it is built
# here from the ids of the labelled test file; the accuracy cell above already
# relies on those rows being aligned with the predictions.
# NOTE(review): confirm this is the intended id source for the submission.
df_test = df_liked[['id']].copy()
df_test['target'] = np.round(y_pred.flatten()).astype('int')
df_test[['id', 'target']].to_csv('~/Documents/Datos/DataSets/TP2/res_', index=False)
df_test.head()