In [1]:
import numpy as np
import pandas as pd
import string
import xgboost as xgb
import io
import nltk
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('stopwords')
stopwords = stopwords.words('english')
stemmer = SnowballStemmer('english')

from textblob import TextBlob
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
sia = SentimentIntensityAnalyzer()
def return_sia_compound_values(text):
    return sia.polarity_scores(text)['compound']

In [3]:
def remove_stopword(text):
    new_text = []
    for e in text:
        if e not in stopwords and e.isalpha():
            new_text.append(e)
    text = new_text
    return " ".join(new_text)

def stemm(text):
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)

def contains_punctuation(text):
    punctuation = set(string.punctuation)
    for character in text:
        if character in punctuation:
            return True
    return False

def amount_of_punctuation(text):
    punctuation = set(string.punctuation)
    amount = 0
    for character in text:
        if character in punctuation: amount += 1
    return amount

def get_adjectives(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("JJ")])

def get_nouns(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("NN")])

def get_verbs(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("VB")])

def get_adverbs(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("RB")])

In [4]:
tweets = pd.read_csv("train.csv", usecols=['id','text', 'target'])
test = pd.read_csv("test.csv", usecols=['id','text'])

In [5]:
tweets.drop_duplicates(subset = 'text', keep = False, inplace = True)
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7434 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7434 non-null   int64 
 1   text    7434 non-null   object
 2   target  7434 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 232.3+ KB


# Fichur Inginierin


In [6]:
tweets_metrics = tweets[['id','text','target']]
tweets_metrics['text_without_stopwords'] = tweets_metrics['text'].str.split()
tweets_metrics['text_without_stopwords'] = tweets_metrics['text_without_stopwords'].apply(remove_stopword)

tweets_metrics['length'] = tweets_metrics['text'].apply(lambda x: len(x))
tweets_metrics['avg_word_length'] = tweets_metrics['text'].str.split().apply(lambda x: [len(y) for y in x]).transform(lambda x: np.mean(x))
tweets_metrics['amount_of_words'] = tweets_metrics['text'].str.split().transform(lambda x: len(x))
unique_words_by_tweet = tweets_metrics['text'].transform(lambda x: x.split()).transform(lambda x: pd.Series(x).unique()).transform(lambda x: len(x))
tweets_metrics['amount_of_unique_words'] = unique_words_by_tweet
tweets_metrics['sentiment'] = tweets_metrics['text'].apply(lambda x: return_sia_compound_values(x))
tweets_metrics['stopwords_count'] = tweets_metrics['text'].apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords]))
tweets_metrics['punctuation_count'] = tweets_metrics['text'].apply(lambda x: amount_of_punctuation(x))
mentions = tweets_metrics['text'].str.findall(r'@.\S*?(?=\s|[:]|$)').to_frame()
tweets_metrics['mentions_count'] = mentions['text'].apply(lambda x: len(x))
hashtags = tweets_metrics['text'].str.findall(r'#[^?\s].*?(?=\s|$)')
tweets_metrics['hashtags_count'] = hashtags.apply(lambda x: len(x))
tweets_metrics['longest_word_length_without_stopwords'] = tweets_metrics['text_without_stopwords'].apply(lambda x: ([len(word) for word in str(x).lower().split() if not word.startswith('http')])).apply(lambda x: max(x) if len(x) > 0 else 0)
tweets_metrics['stopword_word_ratio'] = tweets_metrics['stopwords_count'] / tweets_metrics['amount_of_words']

tweets_metrics['adjectives_count'] = tweets_metrics['text'].apply(get_adjectives)
tweets_metrics['nouns_count'] = tweets_metrics['text'].apply(get_nouns)
tweets_metrics['verbs_count'] = tweets_metrics['text'].apply(get_verbs)
tweets_metrics['adverbs_count'] = tweets_metrics['text'].apply(get_adverbs)

tweets_metrics.head()

Unnamed: 0,id,text,target,text_without_stopwords,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count,mentions_count,hashtags_count,longest_word_length_without_stopwords,stopword_word_ratio,adjectives_count,nouns_count,verbs_count,adverbs_count
0,1,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds Reason May ALLAH Forgive us,69,4.384615,13,13,0.2732,6,1,0,1,7,0.461538,0,6,1,0
1,4,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Canada,38,4.571429,7,7,-0.34,0,1,0,0,6,0.0,0,6,0,0
2,5,All residents asked to 'shelter in place' are ...,1,All residents asked notified No evacuation she...,133,5.090909,22,20,-0.296,11,3,0,0,10,0.5,1,7,7,0
3,6,"13,000 people receive #wildfires evacuation or...",1,people receive evacuation orders California,65,7.125,8,8,0.0,1,2,0,1,10,0.125,1,4,1,0
4,7,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent photo Ruby smoke pours school,88,4.5,16,15,0.0,7,2,0,2,6,0.4375,0,6,3,1


In [7]:
test = test[['id','text']]
test['text_without_stopwords'] = test['text'].str.split()
test['text_without_stopwords'] = test['text_without_stopwords'].apply(remove_stopword)

test['length'] = test['text'].apply(lambda x: len(x))
test['avg_word_length'] = test['text'].str.split().apply(lambda x: [len(y) for y in x]).transform(lambda x: np.mean(x))
test['amount_of_words'] = test['text'].str.split().transform(lambda x: len(x))
unique_words_by_tweet = test['text'].transform(lambda x: x.split()).transform(lambda x: pd.Series(x).unique()).transform(lambda x: len(x))
test['amount_of_unique_words'] = unique_words_by_tweet
test['sentiment'] = test['text'].apply(lambda x: return_sia_compound_values(x))
test['stopwords_count'] = test['text'].apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords]))
test['punctuation_count'] = test['text'].apply(lambda x: amount_of_punctuation(x))
mentions = test['text'].str.findall(r'@.\S*?(?=\s|[:]|$)').to_frame()
test['mentions_count'] = mentions['text'].apply(lambda x: len(x))
hashtags = test['text'].str.findall(r'#[^?\s].*?(?=\s|$)')
test['hashtags_count'] = hashtags.apply(lambda x: len(x))
test['longest_word_length_without_stopwords'] = test['text_without_stopwords'].apply(lambda x: ([len(word) for word in str(x).lower().split() if not word.startswith('http')])).apply(lambda x: max(x) if len(x) > 0 else 0)
test['stopword_word_ratio'] = test['stopwords_count'] / test['amount_of_words']

test['adjectives_count'] = test['text'].apply(get_adjectives)
test['nouns_count'] = test['text'].apply(get_nouns)
test['verbs_count'] = test['text'].apply(get_verbs)
test['adverbs_count'] = test['text'].apply(get_adverbs)

test.head()

Unnamed: 0,id,text,text_without_stopwords,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count,mentions_count,hashtags_count,longest_word_length_without_stopwords,stopword_word_ratio,adjectives_count,nouns_count,verbs_count,adverbs_count
0,0,Just happened a terrible car crash,Just happened terrible car crash,34,4.833333,6,6,-0.7003,2,0,0,0,8,0.333333,1,2,1,1
1,2,"Heard about #earthquake is different cities, s...",Heard different stay safe,64,6.222222,9,9,0.4404,2,3,0,1,9,0.222222,2,4,2,0
2,3,"there is a forest fire at spot pond, geese are...",forest fire spot geese fleeing across I cannot...,96,4.105263,19,19,-0.6159,9,2,0,0,7,0.473684,2,4,4,1
3,9,Apocalypse lighting. #Spokane #wildfires,Apocalypse,40,9.25,4,4,0.0,0,3,0,2,10,0.0,0,4,0,0
4,11,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills China Taiwan,45,4.75,8,8,-0.5423,2,0,0,0,8,0.25,0,4,1,0


In [8]:
basic_features = tweets_metrics[['length','avg_word_length','amount_of_words','amount_of_unique_words','sentiment','stopwords_count','punctuation_count','mentions_count','hashtags_count','longest_word_length_without_stopwords','stopword_word_ratio','adjectives_count','nouns_count','verbs_count','adverbs_count']]
#basic_features = tweets_metrics[['length','avg_word_length','amount_of_words','sentiment', 'stopwords_count', 'punctuation_count', 'longest_word_length_without_stopwords', 'amount_of_unique_words', 'hashtags_count', 'mentions_count']]
basic_features.head()

Unnamed: 0,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count,mentions_count,hashtags_count,longest_word_length_without_stopwords,stopword_word_ratio,adjectives_count,nouns_count,verbs_count,adverbs_count
0,69,4.384615,13,13,0.2732,6,1,0,1,7,0.461538,0,6,1,0
1,38,4.571429,7,7,-0.34,0,1,0,0,6,0.0,0,6,0,0
2,133,5.090909,22,20,-0.296,11,3,0,0,10,0.5,1,7,7,0
3,65,7.125,8,8,0.0,1,2,0,1,10,0.125,1,4,1,0
4,88,4.5,16,15,0.0,7,2,0,2,6,0.4375,0,6,3,1


In [9]:
basic_features_test = test[['length','avg_word_length','amount_of_words','amount_of_unique_words','sentiment','stopwords_count','punctuation_count','mentions_count','hashtags_count','longest_word_length_without_stopwords','stopword_word_ratio','adjectives_count','nouns_count','verbs_count','adverbs_count']]
#basic_features_test = test[['length','avg_word_length','amount_of_words','sentiment', 'stopwords_count', 'punctuation_count', 'longest_word_length_without_stopwords', 'amount_of_unique_words', 'hashtags_count', 'mentions_count']]
basic_features_test.head()

Unnamed: 0,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count,mentions_count,hashtags_count,longest_word_length_without_stopwords,stopword_word_ratio,adjectives_count,nouns_count,verbs_count,adverbs_count
0,34,4.833333,6,6,-0.7003,2,0,0,0,8,0.333333,1,2,1,1
1,64,6.222222,9,9,0.4404,2,3,0,1,9,0.222222,2,4,2,0
2,96,4.105263,19,19,-0.6159,9,2,0,0,7,0.473684,2,4,4,1
3,40,9.25,4,4,0.0,0,3,0,2,10,0.0,0,4,0,0
4,45,4.75,8,8,-0.5423,2,0,0,0,8,0.25,0,4,1,0


In [10]:
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

In [11]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

# Modelos

## Redes neuronales

### Keras: CNN

In [51]:
from keras.preprocessing.text import Tokenizer

test = pd.read_csv('test.csv')
test = test[['id','text']]

x_test_kagle = test['text'].values

x_train = tweets_metrics['text'].values
y_train = tweets_metrics['target'].values
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 123)

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
#x_test = tokenizer.texts_to_sequences(x_test)
x_test_kagle = tokenizer.texts_to_sequences(x_test_kagle)
vocab_size = len(tokenizer.word_index) + 1

from keras.preprocessing.sequence import pad_sequences

maxlen = 100

x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
#x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)
x_test_kagle = pad_sequences(x_test_kagle, padding='post', maxlen=maxlen)

In [52]:
from keras.models import Sequential
from keras import layers
embedding_dim = 100

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 100, 100)          2281100   
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 96, 128)           64128     
_________________________________________________________________
global_max_pooling1d_9 (Glob (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 11        
Total params: 2,346,529
Trainable params: 2,346,529
Non-trainable params: 0
_________________________________________________________________


In [94]:
history = model.fit(x_train, y_train,
                    epochs=2,
                    verbose=1,
                    #validation_data=(x_test, y_test),
                    batch_size=65)
#loss, accuracy = model.evaluate(x_train, y_train, verbose=False)
#print("Training Accuracy: {:.4f}".format(accuracy))
#loss, accuracy = model.evaluate(x_test, y_test, verbose=False)
#print("Testing Accuracy:  {:.4f}".format(accuracy))


Epoch 1/2
Epoch 2/2


In [95]:
submission1 = pd.DataFrame()
submission1['id'] = test['id']
submission1['prob'] = model.predict(x_test_kagle)
submission1['target'] = submission1['prob'].apply(lambda x: 0 if x < .5 else 1)
submission1.head(10)

Unnamed: 0,id,prob,target
0,0,0.879785,1
1,2,0.90873,1
2,3,0.743716,1
3,9,0.566755,1
4,11,0.961872,1
5,12,0.941275,1
6,21,0.066874,0
7,22,0.047537,0
8,27,0.05339,0
9,29,0.135765,0


In [96]:
del submission1['prob']

In [97]:
submission1.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [98]:
submission1.to_csv("submit_prueba_3.csv", index=False)

Una mierda (al final no xd)

## Luego del submit, mas pruebas con esto (probando con Glove) (dio 0,80570)

In [9]:
from keras.preprocessing.text import Tokenizer

#test = pd.read_csv('test.csv')
#test = test[['id','text']]

#x_test_kagle = test['text'].values

#x = tweets_metrics['text'].values
#y = tweets_metrics['target'].values
x_train = tweets_metrics['text'].values
y_train = tweets_metrics['target'].values
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.23, random_state = 123)

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
#x_test = tokenizer.texts_to_sequences(x_test)
#x_test_kagle = tokenizer.texts_to_sequences(x_test_kagle)
vocab_size = len(tokenizer.word_index) + 1

from keras.preprocessing.sequence import pad_sequences

maxlen = 140

x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
#x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)
#x_test_kagle = pad_sequences(x_test_kagle, padding='post', maxlen=maxlen)

Using TensorFlow backend.


In [10]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf-8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

embedding_dim = 100
#embedding_matrix = create_embedding_matrix('glove.6B.200d.txt',tokenizer.word_index, embedding_dim)
embedding_matrix = create_embedding_matrix('glove.twitter.27B.100d.txt',tokenizer.word_index, embedding_dim)

In [13]:
from keras.models import Sequential
from keras.layers import Dropout, Flatten
from keras import layers
#embedding_dim = 100

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=True))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.Conv1D(32, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(12, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 140, 100)          2281100   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 134, 128)          89728     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 12)                1548      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 13        
Total params: 2,372,389
Trainable params: 2,372,389
Non-trainable params: 0
_________________________________________________________________


In [14]:
from keras.callbacks import EarlyStopping
callback = EarlyStopping(monitor = 'val_loss', patience = 1)

history = model.fit(x_train, y_train,
                    epochs=15,
                    verbose=1,
                    #validation_data=(x_test, y_test),
                    validation_split=0.1,
                    batch_size=88,
                    callbacks = [callback])


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [162]:
loss, accuracy = model.evaluate(x_train, y_train, verbose=1)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(x_test, y_test, verbose=1)
print("Testing Accuracy:  {:.4f}, Loss  {:.4f}".format(accuracy,loss))

Training Accuracy: 0.9415
Testing Accuracy:  0.8158, Loss  0.4759


In [None]:
plot_history(history)

In [15]:
test = pd.read_csv('test.csv')
test = test[['id','text']]

x_test_kagle = test['text'].values
x_test_kagle = tokenizer.texts_to_sequences(x_test_kagle)
x_test_kagle = pad_sequences(x_test_kagle, padding='post', maxlen=maxlen)

In [16]:
submission1 = pd.DataFrame()
submission1['id'] = test['id']
submission1['prob'] = model.predict(x_test_kagle)
submission1['target'] = submission1['prob'].apply(lambda x: 0 if x < .5 else 1)
submission1.head(10)

Unnamed: 0,id,prob,target
0,0,0.806478,1
1,2,0.946599,1
2,3,0.87062,1
3,9,0.914281,1
4,11,0.977202,1
5,12,0.80577,1
6,21,0.018496,0
7,22,0.021431,0
8,27,0.010347,0
9,29,0.072302,0


In [17]:
del submission1['prob']
submission1.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [18]:
submission1.to_csv("submit_prueba_14.csv", index=False)

### Keras: K fold CV

In [13]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf-8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [27]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=True))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [51]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dropout, Flatten
from keras.preprocessing.sequence import pad_sequences
from keras import layers
from keras.preprocessing.text import Tokenizer
callback = EarlyStopping(monitor = 'val_loss', patience = 1, verbose=1)

x_train = tweets_metrics['text'].values
y_train = tweets_metrics['target'].values

# Main settings
epochs = 15
embedding_dim = 100
maxlen = 140

# Train-test split
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1, random_state=1000)

# Tokenize words
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
#x_test = tokenizer.texts_to_sequences(x_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
#x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

embedding_matrix = create_embedding_matrix('glove.twitter.27B.100d.txt',tokenizer.word_index, embedding_dim)

# Parameter grid for grid search
param_grid = dict(num_filters=[32, 128, 144],
                      kernel_size=[3, 5, 7],
                      vocab_size=[vocab_size],
                      embedding_dim=[embedding_dim],
                      maxlen=[maxlen],
                      batch_size = [45,65,76,88])

model = KerasClassifier(build_fn=create_model,
                            epochs=epochs, validation_split=0.1,
                            verbose=1)

grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                              cv=4, verbose=2, n_iter=5, n_jobs=1)

grid_result = grid.fit(x_train, y_train, callbacks=[callback])

# Evaluate testing set
#test_accuracy = grid.score(x_test, y_test)

# Save and evaluate results
s = ('Best Accuracy : {:.4f}\n{}\n\n\n')
output_string = s.format(
            grid_result.best_score_,
            grid_result.best_params_)
            
print(output_string)

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] vocab_size=22811, num_filters=128, maxlen=140, kernel_size=5, embedding_dim=100, batch_size=45 
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Train on 5017 samples, validate on 558 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 00003: early stopping
[CV]  vocab_size=22811, num_filters=128, maxlen=140, kernel_size=5, embedding_dim=100, batch_size=45, total=  29.7s
[CV] vocab_size=22811, num_filters=128, maxlen=140, kernel_size=5, embedding_dim=100, batch_size=45 
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   29.6s remaining:    0.0s
Train on 5017 samples, validate on 558 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 00003: early stopping
[CV]  vocab_size=22811, num_filters=128, maxlen=140, kernel_size=5, embedding_dim=100, batch_size=45, total=  29.8s
[CV] vocab_size=22811, num_filters=128, maxlen=140, kernel_size=5, embedding_dim=100, batch_size=45 
Train on 5018 samples, valid

### Submit del 0,81274 con lo extraido del random search

In [56]:
from keras.preprocessing.text import Tokenizer

#test = pd.read_csv('test.csv')
#test = test[['id','text']]

#x_test_kagle = test['text'].values

#x = tweets_metrics['text'].values
#y = tweets_metrics['target'].values
x_train = tweets_metrics['text'].values
y_train = tweets_metrics['target'].values
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 123)

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
#x_test = tokenizer.texts_to_sequences(x_test)
#x_test_kagle = tokenizer.texts_to_sequences(x_test_kagle)
vocab_size = len(tokenizer.word_index) + 1

from keras.preprocessing.sequence import pad_sequences

maxlen = 140

x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
#x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)
#x_test_kagle = pad_sequences(x_test_kagle, padding='post', maxlen=maxlen)

In [57]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf-8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

embedding_dim = 100
embedding_matrix = create_embedding_matrix('glove.twitter.27B.100d.txt',tokenizer.word_index, embedding_dim)

In [60]:
from keras.models import Sequential
from keras.layers import Dropout, Flatten
from keras import layers
#embedding_dim = 100

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=True))
model.add(layers.Conv1D(128, 7, activation='relu'))
#model.add(layers.Conv1D(32, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_156"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_156 (Embedding)    (None, 140, 100)          2281100   
_________________________________________________________________
conv1d_156 (Conv1D)          (None, 134, 128)          89728     
_________________________________________________________________
global_max_pooling1d_156 (Gl (None, 128)               0         
_________________________________________________________________
dense_311 (Dense)            (None, 10)                1290      
_________________________________________________________________
dense_312 (Dense)            (None, 1)                 11        
Total params: 2,372,129
Trainable params: 2,372,129
Non-trainable params: 0
_________________________________________________________________


In [61]:
from keras.callbacks import EarlyStopping
callback = EarlyStopping(monitor = 'val_loss', patience = 1, verbose=1)

history = model.fit(x_train, y_train,
                    epochs=3,
                    verbose=1,
                    #validation_data=(x_test, y_test),
                    #validation_split=0.1,
                    batch_size=88,
                    callbacks = [callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
loss, accuracy = model.evaluate(x_train, y_train, verbose=1)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(x_test, y_test, verbose=1)
print("Testing Accuracy:  {:.4f}, Loss  {:.4f}".format(accuracy,loss))

In [62]:
test = pd.read_csv('test.csv')
test = test[['id','text']]

x_test_kagle = test['text'].values
x_test_kagle = tokenizer.texts_to_sequences(x_test_kagle)
x_test_kagle = pad_sequences(x_test_kagle, padding='post', maxlen=maxlen)

In [63]:
submission1 = pd.DataFrame()
submission1['id'] = test['id']
submission1['prob'] = model.predict(x_test_kagle)
submission1['target'] = submission1['prob'].apply(lambda x: 0 if x < .5 else 1)
del submission1['prob']
submission1.to_csv("submit_prueba_13.csv", index=False)

# Pruebas con los features

## Prueba GridSearch

In [10]:
basic_features = tweets_metrics[['length','avg_word_length','amount_of_words','amount_of_unique_words','sentiment','stopwords_count','punctuation_count','mentions_count','hashtags_count','longest_word_length_without_stopwords','stopword_word_ratio','adjectives_count','nouns_count','verbs_count','adverbs_count']]
#basic_features = tweets_metrics[['length','avg_word_length','amount_of_words','sentiment', 'stopwords_count', 'punctuation_count', 'longest_word_length_without_stopwords', 'amount_of_unique_words', 'hashtags_count', 'mentions_count']]

In [11]:
basic_features_test = test[['length','avg_word_length','amount_of_words','amount_of_unique_words','sentiment','stopwords_count','punctuation_count','mentions_count','hashtags_count','longest_word_length_without_stopwords','stopword_word_ratio','adjectives_count','nouns_count','verbs_count','adverbs_count']]
#basic_features_test = test[['length','avg_word_length','amount_of_words','sentiment', 'stopwords_count', 'punctuation_count', 'longest_word_length_without_stopwords', 'amount_of_unique_words', 'hashtags_count', 'mentions_count']]

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Concatenate, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Lambda, Activation, GaussianNoise, GaussianDropout
from keras import layers, Input, Model
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score

In [237]:
def create_conv1d(num_filters, kernel_size,vocab_size,embedding_dim,maxlen,batch_size,dense1_size,dense2_size):
        
    embedding = Embedding(vocab_size, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=True)
    x_train_input = Input(shape = (maxlen,), name = 'x_train_input')
    x_train_features_input = Input(shape = (15, ), name = 'x_features_train')
    emb = embedding(x_train_input)
    
    conv_out = Conv1D(num_filters, kernel_size, activation='relu')(emb)

    max_pool = GlobalMaxPooling1D()(conv_out)
    conc = Concatenate()([max_pool, x_train_features_input])
    #x = Dropout(0.2)(conc)
    
    dense1 = Dense(dense1_size, activation='relu')(conc)
    dense2 = Dense(dense2_size, activation='relu')(dense1)

    #dense3 = Dense(1, activation='sigmoid')(dense2)
    
    model = Model(inputs = [x_train_input , x_train_features_input], outputs = dense2)
    #optimizer = Adam() #default
    #model.compile(optimizer=optimizer,
              #loss='binary_crossentropy',
              #metrics=['accuracy'])
    
    return model

def create_port_to_model(num_filters, kernel_size,vocab_size,embedding_dim,maxlen,batch_size,dense1_size,dense2_size):
    combi_input = Input(shape = (155,), name = 'port')
    input_train = Lambda(lambda x: x[:,:-15])(combi_input)
    input_features = Lambda(lambda x: x[:,140:])(combi_input)

    base_network = create_conv1d(num_filters, kernel_size,vocab_size,embedding_dim,maxlen,batch_size,dense1_size,dense2_size)
    processed = base_network([input_train,input_features])


    dense3 = Dense(1, activation='sigmoid')(processed)
    model = Model(combi_input,dense3)
    optimizer = Adam() #default
    model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

    return model


In [238]:
callback = EarlyStopping(monitor = 'val_loss', patience = 1, verbose=1)

x_train = tweets_metrics['text'].values
y_train = tweets_metrics['target'].values
features = StandardScaler()
x_train_features = features.fit_transform(basic_features)

# Main settings
epochs = 15
embedding_dim = 100
maxlen = 140
# Tokenize words
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)

vocab_size = len(tokenizer.word_index) + 1

x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)

embedding_matrix = create_embedding_matrix('glove.twitter.27B.100d.txt',tokenizer.word_index, embedding_dim)

In [239]:

# Parameter grid for grid search
param_grid = dict(num_filters=[128],
                      kernel_size=[7],
                      vocab_size=[vocab_size],
                      embedding_dim=[embedding_dim],
                      maxlen=[maxlen],
                      batch_size = [88],
                      dense1_size = [10],
                      dense2_size = [20,10])

model = KerasClassifier(build_fn=create_port_to_model,
                            epochs=epochs, validation_split=0.1,
                            verbose=1)

grid = GridSearchCV(estimator=model, param_grid=param_grid,
                              cv=4, verbose=2, n_jobs=1)

grid_result = grid.fit(np.concatenate((x_train,x_train_features), axis = 1), y_train, callbacks=[callback])

# Save and evaluate results
s = ('Best Accuracy : {:.4f}\n{}\n\n\n')
output_string = s.format(
            grid_result.best_score_,
            grid_result.best_params_)
            
print(output_string)

Fitting 4 folds for each of 2 candidates, totalling 8 fits
[CV] batch_size=88, dense1_size=10, dense2_size=20, embedding_dim=100, kernel_size=7, maxlen=140, num_filters=128, vocab_size=22811 
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Train on 5017 samples, validate on 558 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 00005: early stopping
[CV]  batch_size=88, dense1_size=10, dense2_size=20, embedding_dim=100, kernel_size=7, maxlen=140, num_filters=128, vocab_size=22811, total=  46.1s
[CV] batch_size=88, dense1_size=10, dense2_size=20, embedding_dim=100, kernel_size=7, maxlen=140, num_filters=128, vocab_size=22811 
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   46.0s remaining:    0.0s
Train on 5017 samples, validate on 558 samples
Epoch 1/15
Epoch 2/15
Epoch 00002: early stopping
[CV]  batch_size=88, dense1_size=10, dense2_size=20, embedding_dim=100, kernel_size=7, maxlen=140, num_filters=128, vocab_size=22811, total

In [242]:
pd.DataFrame(grid_result.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_batch_size,param_dense1_size,param_dense2_size,param_embedding_dim,param_kernel_size,param_maxlen,param_num_filters,param_vocab_size,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,28.782677,10.218281,0.639944,0.032818,88,10,20,100,7,140,128,22811,"{'batch_size': 88, 'dense1_size': 10, 'dense2_...",0.789672,0.766541,0.771798,0.78902,0.779258,0.010261,2
1,36.36059,6.298348,0.661582,0.020972,88,10,10,100,7,140,128,22811,"{'batch_size': 88, 'dense1_size': 10, 'dense2_...",0.791286,0.786444,0.778256,0.785791,0.785444,0.004661,1


In [199]:
np.concatenate((x_train,x_train_features), axis = 1).shape

(7434, 155)

In [205]:
primer_vector = []
x_train_concat = np.concatenate((x_train,x_train_features), axis = 1)
for array in x_train_concat:
    primer_vector.append(array[:-15])
segundo_vector = []
for i in range(len(primer_vector)):
    segundo_vector.append(np.array(primer_vector[i]))
x_train_recuperado = np.stack(segundo_vector)

tercer_vector = []
for array in x_train_concat:
    tercer_vector.append(array[140:])
cuarto_vector = []
for i in range(len(tercer_vector)):
    cuarto_vector.append(np.array(tercer_vector[i]))
x_train__features_recuperado = np.stack(cuarto_vector)

(7434, 15)

## Prueba individual

In [117]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Concatenate, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Lambda, Activation, GaussianNoise, GaussianDropout
from keras import layers, Input, Model
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from tensorflow.keras.constraints import max_norm, unit_norm, min_max_norm
from tensorflow import keras

In [139]:
features = StandardScaler()
x_train_features = features.fit_transform(basic_features)

x_train = tweets_metrics['text'].values
y_train = tweets_metrics['target'].values

tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)

vocab_size = len(tokenizer.word_index) + 1
maxlen = 50
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)

In [140]:
def create_embedding_matrix_840(route, word_index, embedding_dim):
    embeddings_index = {}
    f = open(route, encoding='utf8')
    for line in f:
        values = line.split()
        word = ''.join(values[:-embedding_dim])
        coefs = np.asarray(values[-embedding_dim:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: # Si la palabra no esta queda llena de 0s
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [141]:
embedding_dim = 100
embedding_matrix = create_embedding_matrix('glove.twitter.27B.100d.txt',tokenizer.word_index, embedding_dim)

In [171]:
embedding_dim_1 = 100
embedding_matrix_1 = create_embedding_matrix('glove.twitter.27B.100d.txt',tokenizer.word_index, embedding_dim_1)

embedding_dim_2 = 300
embedding_matrix_2 = create_embedding_matrix_840('glove.840B.300d.txt',tokenizer.word_index, embedding_dim_2)

embedding_dim_3 = 200
embedding_matrix_3 = create_embedding_matrix('glove.6B.200d.txt',tokenizer.word_index, embedding_dim_3)

In [205]:
def create_conv1d(): #original
        
    embedding = Embedding(vocab_size, embedding_dim_1, input_length=maxlen, weights=[embedding_matrix_1], trainable=True)
    embedding2 = Embedding(vocab_size, embedding_dim_2, input_length=maxlen, weights=[embedding_matrix_2], trainable=True)
    embedding3 = Embedding(vocab_size, embedding_dim_3, input_length=maxlen, weights=[embedding_matrix_3], trainable=True)

    x_train_input = Input(shape = (maxlen,), name = 'x_train_input')
    x_train_features_input = Input(shape = (15, ), name = 'x_features_train')

    emb = embedding(x_train_input)
    emb2 = embedding2(x_train_input)
    emb3 = embedding3(x_train_input)

    #Emb 100
    conv_out1_1 = Conv1D(128, 2, activation='relu')(emb)
    activation1_1 = Activation('relu')(conv_out1_1)
    max_pool1_1 = GlobalMaxPooling1D()(activation1_1)
    conv_out1_2 = Conv1D(128, 3, activation='relu')(emb)
    activation1_2 = Activation('relu')(conv_out1_2)
    max_pool1_2 = GlobalMaxPooling1D()(activation1_2)

    #Emb 200
    conv_out2_1 = Conv1D(128, 2, activation='relu')(emb2)
    max_pool2_1 = GlobalMaxPooling1D()(conv_out2_1)
    conv_out2_2 = Conv1D(128, 3, activation='relu')(emb2)
    max_pool2_2 = GlobalMaxPooling1D()(conv_out2_2)

    #Emb 300
    conv_out3_1 = Conv1D(128, 2, activation='relu')(emb3)
    max_pool3_1 = GlobalMaxPooling1D()(conv_out3_1)
    conv_out3_2 = Conv1D(128, 2, activation='relu')(emb3)
    max_pool3_2 = GlobalMaxPooling1D()(conv_out3_2)

    conc = Concatenate()([max_pool1_1, max_pool1_2, max_pool2_1, max_pool2_2, max_pool3_1, max_pool3_2, x_train_features_input])
    
    #noise1 = GaussianNoise(0.01)(conc)
    dense1 = Dense(100, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01))(conc)
    noise2 = GaussianNoise(0.1)(dense1)
    
    dense2 = Dense(10, activation='relu')(noise2)
    #noise1 = GaussianNoise(0.001)(dense2)
    dense3 = Dense(1, activation='sigmoid')(dense2)
    
    model = Model(inputs = [x_train_input , x_train_features_input], outputs = dense3)
    optimizer = Adam() #default
    model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
    
    return model

In [206]:
model = create_conv1d()
#model.summary()
callback = EarlyStopping(monitor = 'val_loss', patience = 1, verbose=1)

history = model.fit([x_train,x_train_features], y_train,
                    epochs=3,
                    verbose=1,
                    #validation_data=(x_test, y_test),
                    #validation_split=0.2,
                    batch_size=64,
                    callbacks = [callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [207]:
result = pd.read_csv('perfect_submission.csv')
test_kagle = test[['id','text']]

x_test_kagle = test['text'].values
x_test_kagle = tokenizer.texts_to_sequences(x_test_kagle)
x_test_kagle = pad_sequences(x_test_kagle, padding='post', maxlen=maxlen)
x_test_features = features.transform(basic_features_test)

submit_df = pd.DataFrame()
submit_df['id'] = test_kagle['id']
submit_df['prob'] = model.predict([x_test_kagle,x_test_features])
submit_df['target'] = submit_df['prob'].apply(lambda x: 0 if x < .5 else 1)

accuracy = accuracy_score(submit_df['target'], result['target'])
print(accuracy)

0.7667790376953724


In [None]:
0.820410665032179

In [231]:
del submission1['prob']
submission1.to_csv("submit_prueba_34.csv", index=False)

### Evaluación de pre-trained words.

In [13]:
glove_embeddings = np.load('glove.840B.300d.pkl', allow_pickle=True)
fasttext_embeddings = np.load('crawl-300d-2M.pkl', allow_pickle=True)

In [15]:
import operator
def build_vocab(X):
    
    tweets = X.apply(lambda s: s.split()).values      
    vocab = {}
    
    for tweet in tweets:
        for word in tweet:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1                
    return vocab


def check_embeddings_coverage(X, embeddings):
    
    vocab = build_vocab(X)    
    
    covered = {}
    oov = {}    
    n_covered = 0
    n_oov = 0
    
    for word in vocab:
        try:
            covered[word] = embeddings[word]
            n_covered += vocab[word]
        except:
            oov[word] = vocab[word]
            n_oov += vocab[word]
            
    vocab_coverage = len(covered) / len(vocab)
    text_coverage = (n_covered / (n_covered + n_oov))
    
    sorted_oov = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    return sorted_oov, vocab_coverage, text_coverage

train_glove_oov, train_glove_vocab_coverage, train_glove_text_coverage = check_embeddings_coverage(tweets_metrics['text'], glove_embeddings)
test_glove_oov, test_glove_vocab_coverage, test_glove_text_coverage = check_embeddings_coverage(test['text'], glove_embeddings)
print('GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Training Set'.format(train_glove_vocab_coverage, train_glove_text_coverage))
print('GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Test Set'.format(test_glove_vocab_coverage, test_glove_text_coverage))

train_fasttext_oov, train_fasttext_vocab_coverage, train_fasttext_text_coverage = check_embeddings_coverage(tweets_metrics['text'], fasttext_embeddings)
test_fasttext_oov, test_fasttext_vocab_coverage, test_fasttext_text_coverage = check_embeddings_coverage(test['text'], fasttext_embeddings)
print('FastText Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Training Set'.format(train_fasttext_vocab_coverage, train_fasttext_text_coverage))
print('FastText Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Test Set'.format(test_fasttext_vocab_coverage, test_fasttext_text_coverage))

GloVe Embeddings cover 52.17% of vocabulary and 82.73% of text in Training Set
GloVe Embeddings cover 57.21% of vocabulary and 81.85% of text in Test Set
FastText Embeddings cover 51.63% of vocabulary and 81.88% of text in Training Set
FastText Embeddings cover 56.55% of vocabulary and 81.12% of text in Test Set


In [33]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size

0.5652974442155101