In [1]:
# Mount Google Drive; the test CSV and the model weights are read from under this path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
from IPython.display import clear_output

!pip install pytorch_transformers
!pip install transformers

clear_output()

In [1]:
# import required libraries
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer
from pytorch_transformers import XLNetTokenizer
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import XLNetModel, XLNetForSequenceClassification
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from keras.preprocessing.sequence import pad_sequences

In [4]:
# Load the test data, the tokenizers and the fine-tuned models.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
df = pd.read_csv("/content/drive/MyDrive/techsoc-analytics-21-22/data/test.csv")

# Size of the classification head; every model below is built with the same value.
NUM_LABELS = 500

tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# NOTE(review): do_lower_case=True on a cased checkpoint - presumably mirrors the
# training setup; confirm before changing.
tokenizer_xlnet = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')

# map_location=device on every torch.load below: without it, a checkpoint that was
# saved from a GPU fails to deserialize on a CPU-only runtime, even though `device`
# already falls back to 'cpu'.

model_bert = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = NUM_LABELS,
    output_attentions = False,
    output_hidden_states = False,
)
model_bert.to(device)
model_bert.load_state_dict(
    torch.load('/content/drive/MyDrive/techsoc-analytics-21-22/weights/bert/weights_bert_epochs7.pth',
               map_location=device)
)

model_roberta = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels = NUM_LABELS,
    output_attentions = False,
    output_hidden_states = False,
)
model_roberta.to(device)
model_roberta.load_state_dict(
    torch.load('/content/drive/MyDrive/techsoc-analytics-21-22/weights/roberta/weights_roberta_epochs8.pth',
               map_location=device)
)

model_xlnet = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=NUM_LABELS)
model_xlnet.to(device)
model_xlnet.load_state_dict(
    torch.load('/content/drive/MyDrive/techsoc-analytics-21-22/weights/xlnet/weights_xlnet_8.pth',
               map_location=device)
)

# Hide the download / initialization messages printed while loading.
clear_output()

In [5]:
def predict(model, dataloader):
  """Return softmax class probabilities for every example served by `dataloader`.

  Args:
    model: classification model that is called as
      ``model(input_ids, token_type_ids=None, attention_mask=..., return_dict=True)``
      and whose result exposes ``.logits``.
    dataloader: iterable of batches. The first two items of each batch are the
      input-id tensor and the attention-mask tensor; any further items (for
      example placeholder labels) are ignored.

  Returns:
    The per-batch ``softmax(logits, dim=1)`` tensors concatenated along dim 0.
    The result stays on the device the model runs on; it is not moved to CPU.
  """
  model.eval()

  # Use the device the model's parameters live on instead of a notebook-level
  # global, so the function also works when no such global is defined.
  # A model without parameters falls back to CPU.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device('cpu')

  predictions = []

  # Inference only: no autograd bookkeeping is needed for any batch.
  with torch.no_grad():
    for batch in dataloader:
      # Only the tensors the forward pass consumes are moved to the device.
      b_input_ids = batch[0].to(target_device)
      b_input_mask = batch[1].to(target_device)

      # Forward pass, calculate logit predictions.
      result = model(b_input_ids,
                     token_type_ids=None,
                     attention_mask=b_input_mask,
                     return_dict=True)

      # Normalize the logits of each row into class probabilities.
      predictions.append(F.softmax(result.logits, dim=1))

  return torch.cat(predictions)


In [7]:
%%time

print("Code for Inference")

###########################################   PREPROCESSING   ################################################################################
df['info'] = df['title'] + df['content']

sentences_bert = df['info'].values
#sentences_xlnet = [sentence + " [SEP] [CLS]" for sentence in sentences_bert]
labels = [0]*len(df)

####################################   INPUT ID'S AND ATTENTION MASKS   #######################################################################

#tokenized_texts_xlnet = [tokenizer_xlnet.tokenize(sent) for sent in sentences_xlnet]

MAX_LEN = 256
'''
input_ids_xlnet = [tokenizer_xlnet.convert_tokens_to_ids(x) for x in tokenized_texts_xlnet]
input_ids_xlnet = pad_sequences(input_ids_xlnet, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks_xlnet = []

for seq in input_ids_xlnet:
  seq_mask = [float(i>0) for i in seq]
  attention_masks_xlnet.append(seq_mask) 

prediction_inputs = torch.tensor(input_ids_xlnet)
prediction_masks = torch.tensor(attention_masks_xlnet)
prediction_labels = torch.tensor(labels)

'''
input_ids_bert = []
attention_masks_bert = []
input_ids_roberta = []
attention_masks_roberta = []


for sent in sentences_bert:
    
    encoded_dict_bert = tokenizer_bert.encode_plus(sent, add_special_tokens = True,max_length = 256, padding = 'max_length', truncation = True, 
                                                    return_attention_mask = True, return_tensors = 'pt',     )
    input_ids_bert.append(encoded_dict_bert['input_ids'])
    attention_masks_bert.append(encoded_dict_bert['attention_mask'])
    
    encoded_dict_roberta = tokenizer_roberta.encode_plus(sent, add_special_tokens = True,max_length = 256, padding = 'max_length', 
                                                         truncation = True, return_attention_mask = True, return_tensors = 'pt',     )
    input_ids_roberta.append(encoded_dict_roberta['input_ids'])
    attention_masks_roberta.append(encoded_dict_roberta['attention_mask'])

    
input_ids_bert = torch.cat(input_ids_bert, dim=0)
attention_masks_bert = torch.cat(attention_masks_bert, dim=0)

input_ids_roberta = torch.cat(input_ids_roberta, dim=0)
attention_masks_roberta = torch.cat(attention_masks_roberta, dim=0)

labels = torch.tensor(labels)

##################################################   CREATING THE DATALOADER   #######################################################################
batch_size = 16
'''
prediction_data_xlnet = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler_xlnet = SequentialSampler(prediction_data_xlnet)
prediction_dataloader_xlnet = DataLoader(prediction_data_xlnet, sampler=prediction_sampler_xlnet, batch_size=batch_size)
'''
prediction_data_bert = TensorDataset(input_ids_bert, attention_masks_bert, labels)
prediction_sampler_bert = SequentialSampler(prediction_data_bert)
prediction_dataloader_bert = DataLoader(prediction_data_bert, sampler=prediction_sampler_bert, batch_size=batch_size)

prediction_data_roberta = TensorDataset(input_ids_roberta, attention_masks_roberta, labels)
prediction_sampler_roberta = SequentialSampler(prediction_data_roberta)
prediction_dataloader_roberta = DataLoader(prediction_data_roberta, sampler=prediction_sampler_roberta, batch_size=batch_size)

##################################################### PREDICTING ################################################################

probs_bert = predict(model_bert, prediction_dataloader_bert)
#probs_xlnet = predict(model_xlnet, prediction_dataloader_xlnet)
probs_roberta = predict(model_roberta, prediction_dataloader_roberta)

probs_final= probs_bert + probs_roberta
preds_final = torch.argmax(probs_final, dim = 1)

# Full Code for running your inference (including any preprocessing you need to do on the test set)
# In this Cell as a comment also mention the CPU and GPU of the system you are using to run this inference
# CPU: 
# GPU: 

Code for Inference
CPU times: user 9min 23s, sys: 943 ms, total: 9min 24s
Wall time: 9min 22s


In [None]:
# Save Submission File
# Convert only while preds_final is still a tensor, so that re-running this cell
# does not fail on `.cpu()` after the first run has already produced an array.
if torch.is_tensor(preds_final):
    preds_final = preds_final.cpu().numpy()
preds_final = np.asarray(preds_final)

submission = pd.DataFrame({'uid': df['uid'].tolist(), 'target_ind': preds_final})

# index=False keeps the file to the two declared columns instead of also writing
# the DataFrame's row index as an unnamed first column.
submission.to_csv('final_submission.csv', index=False)