## All installations done here

In [1]:
pip install transformers



## All imports done here

In [2]:
import pickle
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow
# Set the seed for current session so that little change occurs on each DL run
# NOTE(review): only TensorFlow's seed is set in this cell; NumPy and Python's
# `random` module are not seeded here.
tensorflow.random.set_seed(2)
# Configure both the intra-op and the inter-op thread settings to 1
# (presumably to reduce run-to-run variation from parallel execution -- confirm).
session_conf = tensorflow.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
# Create a TF1-compat session on the default graph with that configuration and
# register it with the compat Keras backend.
sess = tensorflow.compat.v1.Session(graph=tensorflow.compat.v1.get_default_graph(), config=session_conf)
tensorflow.compat.v1.keras.backend.set_session(sess)

from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, GlobalAveragePooling1D, GlobalMaxPooling1D, Concatenate, Dense
from tensorflow.keras.layers import Activation, RepeatVector, Permute, Lambda, Dropout, Multiply, Conv1D, BatchNormalization
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, RobertaTokenizer, TFRobertaModel, BertConfig, TFAlbertModel, AlbertTokenizer

## Define all functions here

In [3]:
# Replace latex math symbols with the special word "math_equation"
def convert_latex_eqn_to_word(text):
  """Replace every `$...$` span in `text` with the word "math_equation".

  Dollar signs are paired left to right; each pair, together with whatever
  lies between its two delimiters, becomes "math_equation". When the number
  of dollar signs is odd, the last one has no partner and is left in place
  together with the text that follows it.

  Args:
    text: The string to clean.

  Returns:
    The cleaned string. If `text` holds fewer than two dollar signs it is
    returned unchanged.
  """
  pieces = text.split("$")
  dollar_total = len(pieces) - 1
  if dollar_total < 2:
    return text

  # After splitting on "$", even-indexed pieces sit outside the equations and
  # odd-indexed pieces sit inside them. Only complete pairs are collapsed.
  complete_pairs = dollar_total // 2
  kept_outside = pieces[0:2 * complete_pairs + 1:2]
  cleaned = "math_equation".join(kept_outside)

  # An unmatched trailing "$" (odd count) is restored with its tail.
  if dollar_total % 2 == 1:
    cleaned += "$" + pieces[-1]
  return cleaned

# The BERT model and its matching tokenizer must be downloaded before use:
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") -> downloads the tokenizer
# bert_model = TFBertModel.from_pretrained("bert-base-uncased") -> downloads the model

# Once the download is complete, both the model and the tokenizer can be saved locally:
# tokenizer.save_pretrained(path_to_output_directory)
# bert_model.save_pretrained(path_to_output_directory)

# Generate input for BERT, ALBERT and ROBERTA
def create_BERT_input(df, value, max_len=200, tokenizer=None):
    """Tokenize one text column of `df` into fixed-length model inputs.

    Every entry of `df[value]` is encoded with special tokens added, then
    truncated or padded to exactly `max_len` tokens.

    Args:
        df: DataFrame holding the text to encode.
        value: Name of the column in `df` that contains the text.
        max_len: Length every encoded sequence is padded/truncated to.
        tokenizer: Optional tokenizer exposing `encode_plus`. When omitted,
            the "bert-base-uncased" tokenizer is loaded.

    Returns:
        A tuple `(input_ids, input_masks, input_segments)` of int32 NumPy
        arrays built from the tokenizer's "input_ids", "attention_mask" and
        "token_type_ids" outputs, one row per entry of `df[value]`.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        # tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        # tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    input_ids, input_masks, input_segments = [], [], []

    # Create BERT input
    for val in tqdm(df[value].values):
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        tokenized_context = tokenizer.encode_plus(
            val,
            add_special_tokens=True,
            max_length=max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        input_ids.append(tokenized_context["input_ids"])
        input_masks.append(tokenized_context["attention_mask"])
        input_segments.append(tokenized_context["token_type_ids"])

    input_ids = np.asarray(input_ids, dtype=np.int32)
    input_masks = np.asarray(input_masks, dtype=np.int32)
    input_segments = np.asarray(input_segments, dtype=np.int32)
    print(input_ids.shape, input_masks.shape, input_segments.shape)

    return input_ids, input_masks, input_segments


# Saves the predictions of provided model
def save_preds(model, epoch):
  """Predict on the test set and write two submission CSV files.

  Reads the module-level `df_test`, `X_tfidf_test`, `input_ids_test`,
  `input_masks_test` and `input_segments_test`. Two files are written, both
  suffixed with `epoch`:
    * submit_proba_<epoch>.csv -- the raw values returned by `model.predict`
    * submit_<epoch>.csv       -- the same values thresholded at 0.5 to 0/1

  Args:
    model: Model whose `predict` accepts the four test inputs listed above.
    epoch: Value used as the suffix of the output file names.
  """
  # Create input for the model
  preds = model.predict([input_ids_test, input_masks_test, input_segments_test, X_tfidf_test], verbose=1)
  ids = df_test["ID"].values
  header = "ID,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance\n"

  # `with` guarantees the file is closed even if a write fails part-way.
  with open("/content/drive/My Drive/JanataHack_IndependenceDay/submit_proba_"+str(epoch)+".csv", "w") as fp:
    fp.write(header)
    for id_, pred in zip(ids, preds):
      fp.write(str(id_)+","+",".join([str(i) for i in pred])+"\n")

  with open("/content/drive/My Drive/JanataHack_IndependenceDay/submit_"+str(epoch)+".csv", "w") as fp:
    fp.write(header)
    for id_, pred in zip(ids, preds):
      fp.write(str(id_)+","+",".join(["1" if i>=0.5 else "0" for i in pred])+"\n")

# Callback to save model's predictions after each epoch
class CustomCallback(keras.callbacks.Callback):
    """Keras callback that writes test-set predictions at the start of an epoch.

    Nothing is written when `epoch` is 0. For every other epoch start,
    `save_preds` is called with the model being trained and the `epoch`
    value, which becomes the suffix of the written files.
    NOTE(review): presumably `epoch` is Keras' zero-based index, so the files
    written at the start of epoch k hold the predictions after k completed
    epochs, and the final epoch is never saved by this callback -- confirm.
    """

    def on_epoch_begin(self, epoch, logs=None):
        # if epoch%2 == 0 and epoch!=0:
        if epoch == 0:
            return
        save_preds(self.model, epoch)

## Load and clean the train and test files

In [4]:
# Augmented text
df_train = pd.read_csv("/content/drive/My Drive/JanataHack_IndependenceDay/train.csv")
df_test = pd.read_csv("/content/drive/My Drive/JanataHack_IndependenceDay/test.csv")

df_tfidf_train = pd.read_csv("/content/drive/My Drive/JanataHack_IndependenceDay/best_tfidf_train.csv")
df_tfidf_test = pd.read_csv("/content/drive/My Drive/JanataHack_IndependenceDay/best_tfidf_test.csv")

# Apply the same cleaning pipeline to both frames so train and test cannot drift apart.
# NOTE(review): a row whose TITLE or ABSTRACT is missing would make the
# concatenated value NaN and break the LaTeX step below -- verify the data has none.
for frame in (df_train, df_test):
    # Concatenate title and abstract
    concat = frame["TITLE"].str.cat(frame["ABSTRACT"].str.strip(), sep=" [SEP] ")

    # Remove surrounding whitespace, then lowercase
    concat = concat.str.strip().str.lower()

    # Seems like the data was scraped from a pdf or something cause there are random
    # newline characters in a sentence. Replace them with a space. The pattern is a
    # real newline matched literally (`regex=False`): the default of `regex` differs
    # between pandas releases, and a literal backslash-n match would also hit LaTeX
    # commands that start with "\n".
    concat = concat.str.replace("\n", " ", regex=False)

    # Looks like latex was used to write these articles as there are latex equations.
    # Replace them with some special token
    frame["Concat"] = [convert_latex_eqn_to_word(text) for text in concat]

df_train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,Concat
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0,reconstructing subject-specific effect maps [s...
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0,rotation invariance neural network [sep] rotat...
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0,spherical polyharmonics and poisson kernels fo...
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0,a finite element approximation for the stochas...
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0,comparative study of discrete wavelet transfor...


## Generate input for pre-trained model and prep it up

In [5]:
# Label columns, in the order used for the target matrix.
classes = [
    "Computer Science",
    "Physics",
    "Mathematics",
    "Statistics",
    "Quantitative Biology",
    "Quantitative Finance",
]
Y = df_train[classes].to_numpy()

# Every column of the TF-IDF frames except the first one is used as a feature
# (presumably the first column is an ID/index column -- confirm).
X_tfidf_train = df_tfidf_train.to_numpy()[:, 1:]
X_tfidf_test = df_tfidf_test.to_numpy()[:, 1:]

# Number of training epochs; also used as the suffix of the final prediction files.
epoch = 7

# Create input for the model
input_ids, input_masks, input_segments = create_BERT_input(df_train, "Concat")
input_ids_test, input_masks_test, input_segments_test = create_BERT_input(df_test, "Concat")

# Load the pretrained model
bert_model = TFBertModel.from_pretrained("bert-base-uncased")
# bert_model = TFRobertaModel.from_pretrained("roberta-base")
# bert_model = TFAlbertModel.from_pretrained("albert-base-v2")

HBox(children=(FloatProgress(value=0.0, max=20972.0), HTML(value='')))


(20972, 200) (20972, 200) (20972, 200)


HBox(children=(FloatProgress(value=0.0, max=8989.0), HTML(value='')))


(8989, 200) (8989, 200) (8989, 200)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertModel for predictions without further training.


## Generate the architecture, train and save

In [6]:
# Define the model
# Each Input takes its length from the array it is actually fed with in `fit`
# (ids -> input_ids1, masks -> attention_mask, segments -> token_type_ids).
input_ids1 = Input(shape=(input_ids.shape[1],), dtype=tensorflow.int32)
token_type_ids = Input(shape=(input_segments.shape[1],), dtype=tensorflow.int32)
attention_mask = Input(shape=(input_masks.shape[1],), dtype=tensorflow.int32)
input3 = Input(shape=(X_tfidf_train.shape[1],), dtype=tensorflow.float32)

# Width of the encoder's token vectors, read from the loaded model's config
# instead of being hard-coded, so the head keeps working if the encoder is swapped.
hidden_size = bert_model.config.hidden_size

# embedding[0] gives token vector. embedding[1] gives pooled vector (single vector representing entire input sequence)
# There are many ways of pooling embedding[0] and combining with embedding[1]. So we are going to try them all out :P
embedding = bert_model(input_ids1, attention_mask=attention_mask, token_type_ids=token_type_ids)

# Additive attention to generate sentence level representation from embedding[0]:
# one tanh score per token -> softmax over the sequence -> weights broadcast
# across the hidden dimension -> weighted sum of the token vectors.
attention = Dense(1, activation='tanh')(embedding[0])
attention = Flatten()(attention)
attention = Activation('softmax')(attention)
attention = RepeatVector(hidden_size)(attention)
attention = Permute([2, 1])(attention)
sent_representation = Multiply()([embedding[0], attention])
sent_representation = Lambda(lambda xin: K.sum(xin, axis=-2), output_shape=(hidden_size,))(sent_representation)

# Simple Pooling of embedding[0]
x1 = GlobalMaxPooling1D()(embedding[0])
x2 = GlobalAveragePooling1D()(embedding[0])

# Concatenate them all
x1 = Concatenate()([x1, x2, sent_representation, embedding[1]])
x1 = BatchNormalization()(x1)
# x1 = Dropout(0.4)(x1)
x1 = Dense(64, activation="relu")(x1)

# We also concatenate the tfidf scores generated from the Generate_TFIDF_CountVec_Stacked_Scores
x1 = Concatenate()([x1, input3])
x1 = BatchNormalization()(x1)
# x1 = Dropout(0.1)(x1)

# Another way of using embedding[0] is to train a CNN on top of the output token vectors
# 1D CNNs have been reported to work really well
x2 = Conv1D(256, 1)(embedding[0])
x2 = GlobalMaxPooling1D()(x2)

# Finally concatenate everything together
x1 = Concatenate()([x1, x2])
x1 = BatchNormalization()(x1)
# x1 = Dropout(0.2)(x1)
# NOTE(review): with the Dropout above commented out, two BatchNormalization
# layers run back to back. Kept as-is to leave the architecture unchanged --
# confirm whether the second one is intentional.
x1 = BatchNormalization()(x1)

# Final output
out = Dense(Y.shape[1], activation="sigmoid")(x1)

model = Model(inputs=[input_ids1, attention_mask, token_type_ids, input3], outputs=out)

# DONT TRAIN THE PRETRAINED MODEL
# Freeze the encoder through its own handle rather than a positional
# `model.layers[3]` lookup, and do it before `summary()` so the printed
# trainable / non-trainable parameter counts reflect what is actually trained.
bert_model.trainable = False
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit([input_ids, input_masks, input_segments, X_tfidf_train], Y, batch_size=16, epochs=epoch, validation_split=0.1, callbacks=[CustomCallback()])

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 200, 768), ( 109482240   input_1[0][0]                    
                                                                 input_3[0][0]         

KeyboardInterrupt: ignored

In [None]:
# Just make a final save call
# CustomCallback only saves at the *start* of an epoch (and skips epoch 0), so
# the model state after the last trained epoch is not written by the callback;
# this call writes it, using `epoch` as the file-name suffix.
# NOTE(review): if training was interrupted early, the files are still
# labelled with the configured `epoch` value, not the number of epochs run.
save_preds(model, epoch)