In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from nltk.tokenize import TweetTokenizer
import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
pd.set_option('max_colwidth',400)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


#### Reading dataset

In [2]:
train = pd.read_csv('./data/train.tsv', sep="\t")
test = pd.read_csv('./data/test.tsv', sep="\t")
sub = pd.read_csv('./data/sampleSubmission.csv', sep=",")

In [3]:
train.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what is good for the goose,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is good for the goose,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for the goose,2


In [4]:
train.loc[train.SentenceId == 2]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
63,64,2,"This quiet , introspective and entertaining independent is worth seeking .",4
64,65,2,"This quiet , introspective and entertaining independent",3
65,66,2,This,2
66,67,2,"quiet , introspective and entertaining independent",4
67,68,2,"quiet , introspective and entertaining",3
68,69,2,quiet,2
69,70,2,", introspective and entertaining",3
70,71,2,introspective and entertaining,3
71,72,2,introspective and,3
72,73,2,introspective,2


In [5]:
print('Average count of phrases per sentence in train is {0:.0f}.'
      .format(train.groupby('SentenceId')['Phrase'].count().mean()))
print('Average count of phrases per sentence in test is {0:.0f}.'
      .format(test.groupby('SentenceId')['Phrase'].count().mean()))

Average count of phrases per sentence in train is 18.
Average count of phrases per sentence in test is 20.


In [6]:
print('Number of phrases in train: {}. Number of sentences in train: {}.'
      .format(train.shape[0], len(train.SentenceId.unique())))
print('Number of phrases in test: {}. Number of sentences in test: {}.'
      .format(test.shape[0], len(test.SentenceId.unique())))

Number of phrases in train: 156060. Number of sentences in train: 8529.
Number of phrases in test: 66292. Number of sentences in test: 3310.


In [7]:
print(train.shape, test.shape)

(156060, 4) (66292, 3)


In [8]:
print('Average word length of phrases in train is {0:.0f}.'
      .format(np.mean(train['Phrase'].apply(lambda x: len(x.split())))))
print('Average word length of phrases in test is {0:.0f}.'
      .format(np.mean(test['Phrase'].apply(lambda x: len(x.split())))))

Average word length of phrases in train is 7.
Average word length of phrases in test is 7.


#### Let's see for example most common trigrams for positive phrases

In [9]:
text = ' '.join(train.loc[train.Sentiment == 4, 'Phrase'].values)
text_trigrams = [i for i in ngrams(text.split(), 3)]

In [10]:
print(train.loc[train.Sentiment == 4, 'Phrase'].values)

['This quiet , introspective and entertaining independent is worth seeking .'
 'quiet , introspective and entertaining independent' 'entertaining' ...
 'with universal appeal'
 'really do a great job of anchoring the characters in the emotional realities of middle age .'
 'a great job of anchoring the characters in the emotional realities of middle age']


In [11]:
Counter(text_trigrams).most_common(30)

[(('one', 'of', 'the'), 199),
 (('of', 'the', 'year'), 103),
 (('.', 'is', 'a'), 87),
 (('of', 'the', 'best'), 80),
 (('of', 'the', 'most'), 70),
 (('is', 'one', 'of'), 50),
 (('One', 'of', 'the'), 43),
 ((',', 'and', 'the'), 40),
 (('the', 'year', "'s"), 38),
 (('It', "'s", 'a'), 38),
 (('it', "'s", 'a'), 37),
 (('.', "'s", 'a'), 37),
 (('a', 'movie', 'that'), 35),
 (('the', 'edge', 'of'), 34),
 (('the', 'kind', 'of'), 33),
 (('of', 'your', 'seat'), 33),
 (('the', 'film', 'is'), 31),
 ((',', 'this', 'is'), 31),
 (('the', 'film', "'s"), 31),
 ((',', 'the', 'film'), 30),
 (('film', 'that', 'is'), 30),
 (('as', 'one', 'of'), 30),
 (('edge', 'of', 'your'), 29),
 ((',', 'it', "'s"), 27),
 (('a', 'film', 'that'), 27),
 (('as', 'well', 'as'), 27),
 ((',', 'funny', ','), 25),
 ((',', 'but', 'it'), 23),
 (('films', 'of', 'the'), 23),
 (('some', 'of', 'the'), 23)]

In [12]:
import nltk
nltk.download('stopwords')

text = ' '.join(train.loc[train.Sentiment == 4, 'Phrase'].values)
text = [i for i in text.split() if i not in stopwords.words('english')]
text_trigrams = [i for i in ngrams(text, 3)]
Counter(text_trigrams).most_common(30)

[nltk_data] Downloading package stopwords to /Users/henry/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


[((',', 'funny', ','), 33),
 (('one', 'year', "'s"), 28),
 (('year', "'s", 'best'), 26),
 (('movies', 'ever', 'made'), 19),
 ((',', 'solid', 'cast'), 19),
 (('solid', 'cast', ','), 18),
 (("'ve", 'ever', 'seen'), 16),
 (('.', 'It', "'s"), 16),
 ((',', 'making', 'one'), 15),
 (('best', 'films', 'year'), 15),
 ((',', 'touching', ','), 15),
 (('exquisite', 'acting', ','), 15),
 (('acting', ',', 'inventive'), 14),
 ((',', 'inventive', 'screenplay'), 14),
 (('jaw-dropping', 'action', 'sequences'), 14),
 (('good', 'acting', ','), 14),
 (("'s", 'best', 'films'), 14),
 (('I', "'ve", 'seen'), 14),
 (('funny', ',', 'even'), 14),
 (('best', 'war', 'movies'), 13),
 (('purely', 'enjoyable', 'satisfying'), 13),
 (('funny', ',', 'touching'), 13),
 ((',', 'smart', ','), 13),
 (('inventive', 'screenplay', ','), 13),
 (('funniest', 'jokes', 'movie'), 13),
 (('action', 'sequences', ','), 13),
 (('sequences', ',', 'striking'), 13),
 ((',', 'striking', 'villains'), 13),
 (('exquisite', 'motion', 'picture')

In [13]:
tokenizer = TweetTokenizer()

In [14]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
full_text = list(train['Phrase'].values) + list(test['Phrase'].values)
vectorizer.fit(full_text)
train_vectorized = vectorizer.transform(train['Phrase'])
test_vectorized = vectorizer.transform(test['Phrase'])

In [15]:
y = train['Sentiment']
print(y.shape)

(156060,)


In [16]:
logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)

In [17]:
%%time
ovr.fit(train_vectorized, y)



CPU times: user 8.18 s, sys: 639 ms, total: 8.82 s
Wall time: 4.67 s


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [18]:
scores = cross_val_score(ovr, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 56.55%, std 0.07.


In [19]:
%%time
svc = LinearSVC(dual=False)
scores = cross_val_score(svc, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 56.51%, std 0.68.
CPU times: user 55.5 ms, sys: 28 ms, total: 83.5 ms
Wall time: 10.3 s


In [20]:
ovr.fit(train_vectorized, y);
svc.fit(train_vectorized, y);



### Deep Learning Version

In [21]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
#from keras.layers import CuDNNGRU, CuDNNLSTM
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam

from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping

Using TensorFlow backend.


In [22]:
tk = Tokenizer(lower = True, filters='')
tk.fit_on_texts(full_text)

In [23]:
train_tokenized = tk.texts_to_sequences(train['Phrase'])
test_tokenized = tk.texts_to_sequences(test['Phrase'])

In [24]:
max_len = 50
X_train = pad_sequences(train_tokenized, maxlen = max_len)
X_test = pad_sequences(test_tokenized, maxlen = max_len)

In [25]:
embedding_path = "./pre-trained/crawl-300d-2M.vec"

In [26]:
embed_size = 300
max_features = 30000

In [27]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embedding_index = dict(get_coefs(*o.strip().split(" ")) for o in open(embedding_path))

word_index = tk.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [28]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)
y_ohe = ohe.fit_transform(y.values.reshape(-1, 1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [37]:
def build_model1(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    file_path = "best_model.hdf5"
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                                  save_best_only = True, mode = "min")
    
    # early stop
    early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)
    
    inp = Input(shape = (max_len,))
    x = Embedding(19479, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    
    # adding drop out for regularization
    x1 = SpatialDropout1D(spatial_dr)(x)

    
    x_gru = GRU(units, return_sequences = True)(x1)
    x1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool1_gru = GlobalAveragePooling1D()(x1)
    max_pool1_gru = GlobalMaxPooling1D()(x1)
    
    
    x3 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x1)
    avg_pool3_gru = GlobalAveragePooling1D()(x3)
    max_pool3_gru = GlobalMaxPooling1D()(x3)
    
    x_lstm = LSTM(units, return_sequences = True)(x1)
    x1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool1_lstm = GlobalAveragePooling1D()(x1)
    max_pool1_lstm = GlobalMaxPooling1D()(x1)
    
    x3 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool3_lstm = GlobalAveragePooling1D()(x3)
    max_pool3_lstm = GlobalMaxPooling1D()(x3)
    
    
    x = concatenate([avg_pool3_gru, max_pool3_gru,
                    avg_pool1_lstm, max_pool1_lstm, avg_pool3_lstm, max_pool3_lstm])
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(dense_units, activation='relu') (x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu') (x))
    x = Dense(5, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    history = model.fit(X_train, y_ohe, batch_size = 128, epochs = 20, validation_split=0.1, 
                        verbose = 1, callbacks = [check_point, early_stop])
    model = load_model(file_path)
    return model

In [38]:
model1 = build_model1(
    lr = 1e-3, 
    lr_d = 1e-10, 
    units = 64, 
    spatial_dr = 0.3, 
    kernel_size1=3, 
    kernel_size2=2, 
    dense_units=32, 
    dr=0.1, 
    conv_size=32)


Train on 140454 samples, validate on 15606 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.31792, saving model to best_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.31792 to 0.31215, saving model to best_model.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 0.31215 to 0.30849, saving model to best_model.hdf5
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.30849
Epoch 5/20

Epoch 00005: val_loss improved from 0.30849 to 0.30813, saving model to best_model.hdf5
Epoch 6/20

Epoch 00006: val_loss improved from 0.30813 to 0.30058, saving model to best_model.hdf5
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.30058
Epoch 8/20

Epoch 00008: val_loss did not improve from 0.30058
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.30058


In [39]:
model2 = build_model1(
    lr = 1e-3,
    lr_d = 1e-10,
    units = 128,
    spatial_dr = 0.5,
    kernel_size1 = 3,
    kernel_size2 = 2,
    dense_units = 64,
    dr = 0.2,
    conv_size = 32
)

Train on 140454 samples, validate on 15606 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.32495, saving model to best_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.32495 to 0.31445, saving model to best_model.hdf5
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.31445
Epoch 4/20

Epoch 00004: val_loss improved from 0.31445 to 0.30995, saving model to best_model.hdf5
Epoch 5/20

Epoch 00005: val_loss improved from 0.30995 to 0.30486, saving model to best_model.hdf5
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.30486
Epoch 7/20

Epoch 00007: val_loss improved from 0.30486 to 0.30392, saving model to best_model.hdf5
Epoch 8/20

Epoch 00008: val_loss improved from 0.30392 to 0.30113, saving model to best_model.hdf5
Epoch 9/20

Epoch 00009: val_loss improved from 0.30113 to 0.29707, saving model to best_model.hdf5
Epoch 10/20

Epoch 00010: val_loss did not improve from 0.29707
Epoch 11/20

Epoch 00011: val_loss improved from 0.29707 to 0.2

In [34]:
def build_model2(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    file_path = "best_model.hdf5"
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                                  save_best_only = True, mode = "min")
    early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)

    inp = Input(shape = (max_len,))
    x = Embedding(19479, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    x1 = SpatialDropout1D(spatial_dr)(x)

    x_gru = Bidirectional(GRU(units, return_sequences = True))(x1)
    x_lstm = Bidirectional(LSTM(units, return_sequences = True))(x1)
    
    x_conv1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool1_gru = GlobalAveragePooling1D()(x_conv1)
    max_pool1_gru = GlobalMaxPooling1D()(x_conv1)
    
    x_conv2 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool2_gru = GlobalAveragePooling1D()(x_conv2)
    max_pool2_gru = GlobalMaxPooling1D()(x_conv2)
    
    
    x_conv3 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool1_lstm = GlobalAveragePooling1D()(x_conv3)
    max_pool1_lstm = GlobalMaxPooling1D()(x_conv3)
    
    x_conv4 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool2_lstm = GlobalAveragePooling1D()(x_conv4)
    max_pool2_lstm = GlobalMaxPooling1D()(x_conv4)
    
    
    x = concatenate([avg_pool1_gru, max_pool1_gru, avg_pool2_gru, max_pool2_gru,
                    avg_pool1_lstm, max_pool1_lstm, avg_pool2_lstm, max_pool2_lstm])
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(dense_units, activation='relu') (x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu') (x))
    x = Dense(5, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    history = model.fit(X_train, y_ohe, batch_size = 128, epochs = 20, validation_split=0.1, 
                        verbose = 1, callbacks = [check_point, early_stop])
    model = load_model(file_path)
    return model

In [35]:
model3 = build_model2(
    lr = 1e-4, 
    lr_d = 0, 
    units = 64, 
    spatial_dr = 0.5, 
    kernel_size1=4, 
    kernel_size2=3, 
    dense_units=32, 
    dr=0.1, 
    conv_size=32
)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 140454 samples, validate on 15606 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.39329, saving model to best_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.39329 to 0.34542, saving model to best_model.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 0.34542 to 0.32956, saving model to best_model.hdf5
Epoch 4/20

Epoch 00004: val_loss improved from 0.32956 to 0.32457, saving model to best_model.hdf5
Epoch 5/20

Epoch 00005: val_loss improved from 0.32457 to 0.31930, saving model to best_model.hdf5
Epoch 6/20

Epoch 00006: val_loss improved from 0.31930 to 0.31746, saving model to best_model.hdf5
Epoch 7/20

Epoch 00007: val_loss improved from 0.31746 to 0.31530, saving model to best_model.hdf5
Epoch 8/20

Epoch 00008: val_loss improved from 0.31530 to 0.31260, saving model to best_model.hdf5
Epoch 9/20

Epoch 00009: va

In [36]:
model4 = build_model2(
    lr = 1e-3, 
    lr_d = 0, 
    units = 64, 
    spatial_dr = 0.5, 
    kernel_size1=3, 
    kernel_size2=3, 
    dense_units=64, 
    dr=0.3, 
    conv_size=32
)

Train on 140454 samples, validate on 15606 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.32095, saving model to best_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.32095 to 0.31414, saving model to best_model.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 0.31414 to 0.30865, saving model to best_model.hdf5
Epoch 4/20

Epoch 00004: val_loss improved from 0.30865 to 0.30689, saving model to best_model.hdf5
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.30689
Epoch 6/20

Epoch 00006: val_loss improved from 0.30689 to 0.30106, saving model to best_model.hdf5
Epoch 7/20

Epoch 00007: val_loss improved from 0.30106 to 0.29995, saving model to best_model.hdf5
Epoch 8/20

Epoch 00008: val_loss did not improve from 0.29995
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.29995
Epoch 10/20

Epoch 00010: val_loss did not improve from 0.29995


In [40]:
model5 = build_model2(
    lr = 1e-3, 
    lr_d = 1e-7, units = 64, spatial_dr = 0.3, kernel_size1=3, kernel_size2=3, dense_units=64, dr=0.4, conv_size=64)

Train on 140454 samples, validate on 15606 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.31726, saving model to best_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.31726 to 0.30967, saving model to best_model.hdf5
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.30967
Epoch 4/20

Epoch 00004: val_loss improved from 0.30967 to 0.30764, saving model to best_model.hdf5
Epoch 5/20

Epoch 00005: val_loss improved from 0.30764 to 0.30304, saving model to best_model.hdf5
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.30304
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.30304
Epoch 8/20

Epoch 00008: val_loss improved from 0.30304 to 0.30267, saving model to best_model.hdf5
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.30267
Epoch 10/20

Epoch 00010: val_loss did not improve from 0.30267
Epoch 11/20

Epoch 00011: val_loss did not improve from 0.30267


In [41]:
pred1 = model1.predict(X_test, batch_size = 1024, verbose = 1)
pred = pred1
pred2 = model2.predict(X_test, batch_size = 1024, verbose = 1)
pred += pred2
pred3 = model3.predict(X_test, batch_size = 1024, verbose = 1)
pred += pred3
pred4 = model4.predict(X_test, batch_size = 1024, verbose = 1)
pred += pred4
pred5 = model5.predict(X_test, batch_size = 1024, verbose = 1)
pred += pred5



In [42]:
predictions = np.round(np.argmax(pred, axis=1)).astype(int)
sub['Sentiment'] = predictions
sub.to_csv("blend.csv", index=False)

In [73]:
pred2

array([[6.5658092e-03, 1.6220912e-01, 4.9259537e-01, 3.4582603e-01,
        1.4740527e-02],
       [5.1175356e-03, 1.6310227e-01, 5.4971617e-01, 2.9768705e-01,
        8.9170635e-03],
       [2.0127296e-03, 4.4901222e-02, 8.8044274e-01, 7.1183890e-02,
        1.0857284e-03],
       ...,
       [4.7925979e-02, 5.8758914e-01, 3.5295570e-01, 8.7141693e-03,
        3.3050776e-05],
       [7.6015413e-02, 6.2955600e-01, 2.7752274e-01, 6.1512887e-03,
        2.4378300e-05],
       [4.6920449e-02, 6.2212574e-01, 3.2350203e-01, 6.4088628e-03,
        1.7316592e-05]], dtype=float32)

In [46]:
sub1 = pd.read_csv('./data/sampleSubmission.csv', sep=",")

In [48]:
len(sub1)

66292

In [57]:
acc = 0
sub1['predict'] = predictions

In [66]:
a = sub1['Sentiment'] == sub1['predict']


In [70]:
acc = 0
for ele in a:
    if ele:
        acc += 1

In [71]:
acc

38414