In [1]:
# Import everything
import numpy as np
from keras import Sequential
from keras.layers import Embedding
from sklearn.linear_model import LogisticRegression
import time
from preprocessing_and_loading_data.DataLoader import DataLoader

Using TensorFlow backend.


In [2]:
def run_baseline(glove_dimension, max_words=40):
    """Train and score a logistic-regression baseline on averaged GloVe vectors.

    Each sample is represented by the mean of the embedding vectors of its
    ``max_words`` token positions, and a ``LogisticRegression`` classifier is
    fitted on those fixed-size vectors.

    Parameters
    ----------
    glove_dimension : int
        Dimensionality of the GloVe embeddings requested from ``DataLoader``.
    max_words : int, optional
        Sequence length passed to ``DataLoader`` and used as the Embedding
        layer's ``input_length``. Defaults to 40, the value that was
        previously hard-coded.

    Returns
    -------
    float
        Value of ``clf.score`` on the test split (mean accuracy for a
        scikit-learn classifier). The same value is printed.
    """
    print("Running model for %d glove dimension..." % (glove_dimension))
    # Read the data using DataLoader object
    dl = DataLoader(glove_dimension, max_words, full=True)

    # NOTE(review): presumably X_* are integer word-index sequences of length
    # max_words -- confirm against DataLoader.
    X_train, X_test, Y_train, Y_test = dl.get_train_test_split()
    embedding_matrix = dl.get_embedding_matrix()
    vocab_size, embedding_dim = embedding_matrix.shape

    # Create keras Embedding model, to transform the sequences of words to embedding vectors.
    # The model is never compiled or fitted: it is used purely as a lookup table.
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_words,
                        weights=[embedding_matrix], name='emb'))

    # Compute the vector of a tweet as a mean value of the vectors of the words in the tweet.
    # Each split is reduced right after it is embedded, so only one
    # (samples, max_words, dim) tensor is alive at any time.
    # NOTE(review): the mean runs over all max_words positions, so any padding
    # positions contribute to the average as well -- confirm this is intended.
    X_train_new = np.mean(model.predict(X_train), axis=1)
    X_test_new = np.mean(model.predict(X_test), axis=1)

    start = time.time()
    clf = LogisticRegression(random_state=0, solver="sag", n_jobs=-1).fit(X_train_new, Y_train)
    end = time.time()
    print("Model trained in %.2f seconds" % (end-start))

    score = clf.score(X_test_new, Y_test)
    print("Model score: %s\n" % (score))
    # Returned so callers can collect and compare results across runs.
    return score

In [3]:
# Run the baseline once for every GloVe embedding size under comparison.
GLOVE_DIMENSIONS = (25, 50, 100, 200)
for gd in GLOVE_DIMENSIONS:
    run_baseline(glove_dimension=gd)

Running model for 25 glove dimension...
Model trained in 42.09 seconds
Model score: 0.693544

Running model for 50 glove dimension...
Model trained in 89.13 seconds
Model score: 0.728328

Running model for 100 glove dimension...
Model trained in 80.54 seconds
Model score: 0.765684

Running model for 200 glove dimension...
Model trained in 189.52 seconds
Model score: 0.784612

