### Import data

In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("data/Train.csv")
tags = pd.read_csv("data/Tags.csv")
test_data = pd.read_csv("data/Test.csv")

In [2]:
TOPIC_COLS = ['Computer Science','Mathematics','Physics','Statistics']
TAGS = list(tags['Tags'])

### Preprocessing

In [3]:
train_data['ABSTRACT'] = train_data['ABSTRACT'].str.lower()
test_data['ABSTRACT'] = test_data['ABSTRACT'].str.lower()

In [4]:
import string
train_data['ABSTRACT'] = train_data['ABSTRACT'].str.translate(str.maketrans('', '', string.punctuation))
test_data['ABSTRACT'] = test_data['ABSTRACT'].str.translate(str.maketrans('', '', string.punctuation))

### Test Train Split

In [5]:
from sklearn.model_selection import train_test_split
train, val = train_test_split(train_data,test_size=0.2,random_state=2)

### Common Functions

In [6]:
from sklearn.metrics import f1_score

# Get best threshold for each label
def get_cut_offthreshold(y_pred_prob,validation_set,TAGS):
    thresholds = np.array(list(range(0,100)))/100.0
    best_thresholds = []
    for idx in range(0,25):
        scores = [f1_score(validation_set[TAGS[idx]], y_pred_prob[:,idx] > thresh, average='micro') for thresh in thresholds]
        best_thresh = thresholds[np.argmax(scores)]
        best_thresholds.append(best_thresh)
    return best_thresholds

# Get predictions based on probabilities and class specific thresholds
def get_predictions(pred_prob,best_thresholds,TAGS):
    predictions = np.zeros((pred_prob.shape[0],len(TAGS)))
    for idx in range(0,25):
        predictions[:,idx] = pred_prob[:,idx] > best_thresholds[idx]    
    return predictions

### Universal Sentence Embedding with Keras

In [7]:
import time
import tensorflow as tf
import tensorflow_hub as hub

TF_MODULE_URL= "https://tfhub.dev/google/universal-sentence-encoder/4"

def universal_sent_embedding():
    '''
    Get the universal sentence encoder from TFHUB. Takes time to load, so invoke once
    and use everywhere else.
    '''
    t_start = time.time()
    # tf.disable_eager_execution()
    # embed_ = hub.Module(cfg["TF_MODULE_URL"])
    embed_ = hub.load(TF_MODULE_URL)
    t_end = time.time()
    print("USE module loaded in {} secs".format(str(t_end-t_start)))
    return embed_


def get_uni_sent_embedding(sents,embed):
    '''
    Get embeddings using the universal sentence encoder
    '''  
    t_start = time.time()
    sent_embeddings = embed(sents)
    # with tf.Session() as session:
    #     session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    #     sent_embeddings = session.run(embed(sents))
    print("Shape of sentence embeddings {0}".format(str(sent_embeddings.shape)))
    t_end = time.time()
    return sent_embeddings

def keras_use_embedding_model():
    '''
    Use universal sentence encoder in the embedding layer of a Keras model
    '''    
    embedding = TF_MODULE_URL
    hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                               dtype=tf.string, trainable=False)
    model = tf.keras.Sequential()
    model.add(hub_layer)
    model.add(tf.keras.layers.Dense(100, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(25,activation='sigmoid'))
    print(model.summary())
    model.compile(loss='binary_crossentropy',
        optimizer='adam', metrics=['accuracy'])
    return model

def keras_use_embedding_model_train(keras_model,train_sents,train_labels,test_sents,test_labels):
    '''
    Train the keras models. Change parameters here if needed
    '''
    n_epochs = 50
    n_batch_size = 32
    # session = tf.Session()
    # tf.keras.backend.set_session(session)
    # session.run(tf.global_variables_initializer())
    # session.run(tf.tables_initializer())
    history = keras_model.fit(train_sents,
            train_labels,
            validation_data=(test_sents, test_labels),
            epochs=n_epochs,
            batch_size=n_batch_size)
    return keras_model

In [10]:
model = keras_use_embedding_model()

model = keras_use_embedding_model_train(model,np.array(list(train['ABSTRACT'])),
                                        np.array(train[TAGS]),np.array(list(val['ABSTRACT'])),np.array(val[TAGS]))



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_1 (KerasLayer)   (None, 512)               256797824 
_________________________________________________________________
dense_2 (Dense)              (None, 100)               51300     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 25)                2525      
Total params: 256,851,649
Trainable params: 53,825
Non-trainable params: 256,797,824
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/

In [11]:
y_pred_prob =  model.predict(np.array(list(val['ABSTRACT'])))
best_thresholds = get_cut_offthreshold(y_pred_prob,val,TAGS)
y_pred = get_predictions(y_pred_prob,best_thresholds,TAGS)

print("F1 Score on Validation Set", f1_score(val[TAGS], y_pred, average='micro'))

F1 Score on Validation Set 0.6413059757376067


In [12]:
# Predicting on Test

y_pred_test_prob = model.predict(np.array(list(test_data['ABSTRACT'])))

predictions = get_predictions(y_pred_test_prob,best_thresholds,TAGS)

result = pd.DataFrame(predictions)
result.columns = TAGS
result['id'] = test_data['id']
result.to_csv("universal_sent_encoder_keras.csv",index=False)

### Universal Sentence Embedding with Keras with topic columns

In [8]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate, Dense, Input, Dropout,Concatenate

def keras_use_embedding_model():
    '''
    Use universal sentence encoder in the embedding layer of a Keras model along wiht topic columns
    '''
    embedding = TF_MODULE_URL
    input_words = Input(shape=[],dtype=tf.string)
    hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                               dtype=tf.string, trainable=False)
    out_emb = hub_layer(input_words)
    tc = Input(shape=(4,))
    tc_dense = Dense(4,)(tc)
    cc = Concatenate(axis=1)([out_emb,tc_dense])
    model = Dense(100, activation='relu')(cc)
    model = Dropout(0.5)(model)
    output = Dense(25,activation='sigmoid')(model)
    model = Model(inputs=[input_words,tc],outputs=output)
    print(model.summary())
    model.compile(loss='binary_crossentropy',
        optimizer='adam', metrics=['accuracy'])
    return model

def keras_use_embedding_model_train(keras_model,train_sents,train_topic_cols,train_labels,
                                    test_sents,test_topic_cols,test_labels):
    '''
    Train the keras models. Change parameters here if needed
    '''
    n_epochs = 50
    n_batch_size = 32
    # session = tf.Session()
    # tf.keras.backend.set_session(session)
    # session.run(tf.global_variables_initializer())
    # session.run(tf.tables_initializer())
    history = keras_model.fit([train_sents,train_topic_cols],
            train_labels,
            validation_data=([test_sents,test_topic_cols], test_labels),
            epochs=n_epochs,
            batch_size=n_batch_size)
    return keras_model

In [9]:
model = keras_use_embedding_model()

train_topic_cols = np.array(train[TOPIC_COLS])
val_topic_cols = np.array(val[TOPIC_COLS])
train_labels = np.array(train[TAGS])
val_labels = np.array(val[TAGS])

model = keras_use_embedding_model_train(model,np.array(list(train['ABSTRACT'])),train_topic_cols,train_labels,np.array(list(val['ABSTRACT'])),val_topic_cols,val_labels)


Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        (None, 512)          256797824   input_1[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 4)            20          input_2[0][0]                    
_______________________________________________________________________________________

Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [10]:
y_pred_prob =  model.predict([np.array(list(val['ABSTRACT'])),val_topic_cols])
best_thresholds = get_cut_offthreshold(y_pred_prob,val,TAGS)
y_pred = get_predictions(y_pred_prob,best_thresholds,TAGS)

print("F1 Score on Validation Set", f1_score(val[TAGS], y_pred, average='micro'))

F1 Score on Validation Set 0.7207574654042243


In [11]:
# Predicting on Test

y_pred_test_prob = model.predict([np.array(list(test_data['ABSTRACT'])),np.array(test_data[TOPIC_COLS])])

predictions = get_predictions(y_pred_test_prob,best_thresholds,TAGS)

result = pd.DataFrame(predictions)
result.columns = TAGS
result['id'] = test_data['id']
result.to_csv("universal_sent_encoder_keras_topic_cols.csv",index=False)