In [None]:
# Mount Google Drive at /content/drive; the CSV files and model checkpoints
# used by the later cells are read from and written to paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 39.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 20.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.9 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
 
from transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
from transformers import AdamW

from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

In [None]:
# Labelled training data, read from the mounted Drive folder.
train_csv_path = "/content/drive/MyDrive/techsoc-analytics-21-22/data/train.csv"
df = pd.read_csv(train_csv_path)

In [None]:
# Sanity check: (number of rows, number of columns) of the training frame.
df.shape

(35112, 4)

In [None]:
# Preview ten random rows. The fixed seed makes the preview identical on every
# run, so the displayed output stays reproducible after Restart & Run All.
df.sample(10, random_state=2018)

Unnamed: 0,content,title,uid,target_ind
8783,Audio CD.,George Duke - Greatest Hits,B000002ANB,80
5455,A dress classic handcrafted of silky two-ply 8...,Amazon.com: Pinpoint Oxford Tab Collar Button ...,B000288H46,132
18279,Wood Guiro,NINO wood Güiro Natral,B0002F5C4M,221
23634,Russell Women's Cheer Short composed of breath...,Amazon.com: Russell Athletic Women's Cheer Sho...,B0002VMQWM,368
18009,The Eliminator Lighting E144 Is A Multicolored...,Eliminator E-144 Pirahna Mini Double Derby Spe...,B000H2GVDG,191
913,"Machine Screws, also referred to as Machine Bo...","Brass Machine Screw, Round Head, Slotted Drive...",B000FN3KW2,352
31971,The Brute men's Lycra High Cut Wrestling Singl...,Brute Men's Lycra® High Cut Wrestling Singlet,B0000C46Q1,145
33809,Mix some sporty style into your super trendy w...,Converse Mens Chuck Taylor Sneaker,B0001X2SFQ,264
27080,Wire-free cups with three piece cup with botto...,Amazon.com: Cortland Full Figure Soft Cup: Clo...,B000086226,256
26287,"This Viton O-ring is black in color, round in ...","-385 Viton O-Ring, 75A Durometer, Black, 16&#0...",B000FOI2BK,351


In [None]:
# Build one text field per row: title followed by the description.
# fillna('') keeps a missing title/content from turning the whole sentence into
# NaN, which would make the string concatenation below raise a TypeError.
df['sentence'] = df['title'].fillna('') + ' ' + df['content'].fillna('')

# NOTE(review): "[SEP]"/"[CLS]" are BERT-style markers; XLNet's own special
# tokens are "<sep>"/"<cls>". The suffix is kept as-is because the test-time
# preprocessing (and presumably any saved checkpoint) uses the same format —
# confirm before changing it, and change both places together.
sentences = [sentence + " [SEP] [CLS]" for sentence in df['sentence'].values]

# Integer class id per row, used as the classification target.
labels = df['target_ind'].values

In [None]:
# NOTE(review): do_lower_case=True is combined with the *cased* checkpoint —
# confirm that lower-casing the input is intended.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

# Break every sentence into sub-word tokens, keeping one token list per sentence.
tokenized_texts = list(map(tokenizer.tokenize, sentences))

# Show the first example as a quick visual check of the tokenisation.
print("Tokenize the first sentence:")
print(tokenized_texts[0])

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

Tokenize the first sentence:
['▁', 'amazon', '.', 'com', ':', '▁', 'w', 'rang', 'ler', '▁men', "'", 's', '▁rugged', '▁wear', '▁relaxed', '▁fit', '▁', 'je', 'an', ':', '▁clothing', '▁premium', '▁quality', '▁five', '▁pocket', '▁', 'je', 'an', '▁from', '▁', 'w', 'rang', 'ler', '▁rugged', '▁wear', '.', '▁this', '▁relaxed', '▁fit', '▁', 'je', 'an', '▁is', '▁made', '▁from', '▁100%', '▁cotton', '▁denim', '▁for', '▁durability', '▁with', '▁extra', '▁room', '▁in', '▁the', '▁seat', '▁and', '▁thigh', '▁for', '▁comfort', '.', '▁men', "'", 's', '▁', 'w', 'rang', 'ler', '▁trail', '▁trek', 'ker', '▁relaxed', '▁fit', '▁jeans', '▁set', '▁out', '▁on', '▁a', '▁long', '▁hike', ',', '▁or', '▁kick', '▁back', '▁for', '▁an', '▁afternoon', '▁full', '▁of', '▁watching', '▁college', '▁football', '▁from', '▁the', '▁comfort', '▁of', '▁your', '▁own', '▁home', '.', '▁these', '▁', 'w', 'rang', 'ler', '▁trail', '▁trek', 'ker', '▁relaxed', '▁fit', '▁jeans', '▁are', '▁up', '▁for', '▁anything', '▁you', '▁are', '!', '▁check

In [None]:
# Every sequence is forced to exactly this many token ids.
MAX_LEN = 256

# Translate token strings into vocabulary ids, one id list per sentence.
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts))

# Shorter sequences are padded at the end, longer ones are cut at the end,
# giving a (num_sentences, MAX_LEN) integer array.
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    padding="post",
    truncating="post",
)

In [None]:
# One mask row per sequence: 1.0 where the token id is greater than 0,
# 0.0 everywhere else (i.e. at the zero-valued padding positions).
attention_masks = [
    [float(token_id > 0) for token_id in seq]
    for seq in input_ids
]

In [None]:
# Stratified 90/10 train/validation split. Ids, masks and labels are split in a
# single call so they are row-aligned by construction, instead of relying on a
# second call repeating exactly the same random_state/stratify arguments.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=2018, test_size=0.1, stratify=labels)

In [None]:
# Turn every split (ids, labels, masks) into a torch tensor, rebinding the
# same names so the following cells can keep using them.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = (
    torch.tensor(split)
    for split in (
        train_inputs, validation_inputs,
        train_labels, validation_labels,
        train_masks, validation_masks,
    )
)

In [None]:
# Number of examples per optimisation / evaluation step.
batch_size = 16

# Each dataset item is an (input ids, attention mask, label) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)
validation_dataloader = DataLoader(
    validation_data,
    batch_size=batch_size,
    sampler=validation_sampler,
)


## Train Model

OK, let's load XLNet! There are a few different pre-trained XLNet models available. "xlnet-base-cased" means the version that has both upper and lowercase letters ("cased") and is the smaller version of the two ("base" vs "large").

In [None]:
# Pretrained XLNet encoder with a freshly initialised classification head
# that outputs 500 classes.
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=500)

# Move the model to the device selected in the setup cell rather than calling
# .cuda(), which raises on a CPU-only runtime even though `device` already has a
# CPU fallback. Assigning the result also avoids echoing the full module repr.
model = model.to(device)

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e

In [None]:
# Build two optimizer parameter groups: biases and layer-norm weights are
# excluded from weight decay, every other parameter decays at 0.01.
param_optimizer = list(model.named_parameters())

# Substrings identifying parameters that must not be decayed.
# 'layer_norm.weight' matches the LayerNorm modules shown in the model printout;
# the original 'gamma'/'beta' entries are kept for older parameter naming.
no_decay = ['bias', 'layer_norm.weight', 'LayerNorm.weight', 'gamma', 'beta']

# The per-group key has to be 'weight_decay'. The previous 'weight_decay_rate'
# key is not read by AdamW, so both groups silently fell back to the optimizer's
# default decay and the 0.01 setting never took effect.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]



In [None]:
# AdamW over the two parameter groups defined above, with a learning rate of 2e-5.
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    preds:  2-D array of per-class scores, one row per example.
    labels: array of integer class ids; it is flattened before comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_hits = (predicted == expected).sum()
    return n_hits / len(expected)

In [None]:
import torch.nn.functional as F
from sklearn.metrics import accuracy_score


# Loss of every optimisation step, accumulated across all epochs.
train_loss_set = []

# Continue fine-tuning from an earlier checkpoint. map_location lets the load
# succeed when the file was written on a different device than `device`.
model.load_state_dict(torch.load('/content/drive/MyDrive/techsoc-analytics-21-22/weights/xlnet/weights_xlnet_5.pth',
                                 map_location=device))

epochs = 4

for _ in trange(epochs, desc="Epoch"):

  predictions_labels = []    # predicted labels seen so far in this epoch
  true_labels = []           # matching ground-truth labels
  probs = []                 # per-batch class probabilities (detached, on CPU)

  # ---- Training ----
  model.train()
  train_loss = 0

  for step, batch in enumerate(train_dataloader):

    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

    optimizer.zero_grad()

    # Passing labels makes the model return the loss first, then the logits.
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs[0]
    logits = outputs[1]

    loss.backward()
    optimizer.step()

    step_loss = loss.item()
    train_loss_set.append(step_loss)
    train_loss += step_loss

    # Detach before storing: appending the live softmax output would keep
    # autograd state and GPU memory alive for every batch of the epoch.
    logits = logits.detach()
    probs.append(F.softmax(logits, dim = 1).cpu())

    true_labels += b_labels.flatten().tolist()
    predictions_labels += logits.argmax(dim=-1).flatten().tolist()

    # Running accuracy over all batches seen so far in this epoch.
    if step%50 == 0:
      acc = accuracy_score(true_labels, predictions_labels)
      print(f'{step}/{len(train_dataloader)}     train_accuracy = {acc}')

  print("Train loss: {}".format(train_loss/(step+1)))

  # ---- Validation ----
  model.eval()

  # Count correct predictions and examples directly. Averaging per-batch
  # accuracies over-weights the final batch whenever it is smaller than
  # batch_size, which skews the reported number.
  eval_correct = 0
  eval_total = 0

  for batch in validation_dataloader:

    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

    with torch.no_grad():
      # No labels are passed, so the first output element is the logits.
      output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      logits = output[0]

    b_labels = b_labels.flatten()
    eval_correct += (logits.argmax(dim=1) == b_labels).sum().item()
    eval_total += b_labels.numel()

  eval_accuracy = eval_correct / eval_total
  print("Validation Accuracy: {}".format(eval_accuracy))

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

0/1975     train_accuracy = 0.625
50/1975     train_accuracy = 0.5686274509803921
100/1975     train_accuracy = 0.556930693069307
150/1975     train_accuracy = 0.5596026490066225
200/1975     train_accuracy = 0.5603233830845771
250/1975     train_accuracy = 0.5587649402390438
300/1975     train_accuracy = 0.5533637873754153
350/1975     train_accuracy = 0.5457621082621082
400/1975     train_accuracy = 0.5447319201995012
450/1975     train_accuracy = 0.5432372505543237
500/1975     train_accuracy = 0.5442864271457086
550/1975     train_accuracy = 0.5460526315789473
600/1975     train_accuracy = 0.5472129783693843
650/1975     train_accuracy = 0.5482910906298003
700/1975     train_accuracy = 0.5493937232524965
750/1975     train_accuracy = 0.5492676431424767
800/1975     train_accuracy = 0.548689138576779
850/1975     train_accuracy = 0.5476645123384254
900/1975     train_accuracy = 0.547031076581576
950/1975     train_accuracy = 0.5456098843322819
1000/1975     train_accuracy = 0.546328

Epoch:  25%|██▌       | 1/4 [1:09:34<3:28:44, 4174.94s/it]

Validation Accuracy: 0.46789772727272727
0/1975     train_accuracy = 0.75
50/1975     train_accuracy = 0.6176470588235294
100/1975     train_accuracy = 0.6157178217821783
150/1975     train_accuracy = 0.6076158940397351
200/1975     train_accuracy = 0.6001243781094527
250/1975     train_accuracy = 0.5908864541832669
300/1975     train_accuracy = 0.589078073089701
350/1975     train_accuracy = 0.5849358974358975
400/1975     train_accuracy = 0.5861907730673317
450/1975     train_accuracy = 0.5834257206208425
500/1975     train_accuracy = 0.5799650698602794
550/1975     train_accuracy = 0.5791742286751361
600/1975     train_accuracy = 0.5780990016638935
650/1975     train_accuracy = 0.5774769585253456
700/1975     train_accuracy = 0.579172610556348
750/1975     train_accuracy = 0.5790612516644474
800/1975     train_accuracy = 0.5768570536828964
850/1975     train_accuracy = 0.5767479435957696
900/1975     train_accuracy = 0.5767896781354052
950/1975     train_accuracy = 0.573606729758149

Epoch:  50%|█████     | 2/4 [2:19:07<2:19:07, 4173.56s/it]

Validation Accuracy: 0.46732954545454547
0/1975     train_accuracy = 0.75
50/1975     train_accuracy = 0.6200980392156863
100/1975     train_accuracy = 0.6107673267326733
150/1975     train_accuracy = 0.6080298013245033
200/1975     train_accuracy = 0.6131840796019901
250/1975     train_accuracy = 0.603585657370518
300/1975     train_accuracy = 0.604235880398671
350/1975     train_accuracy = 0.6087962962962963
400/1975     train_accuracy = 0.6052057356608479
450/1975     train_accuracy = 0.602549889135255
500/1975     train_accuracy = 0.6020459081836327
550/1975     train_accuracy = 0.5984573502722323
600/1975     train_accuracy = 0.5970257903494176
650/1975     train_accuracy = 0.597926267281106
700/1975     train_accuracy = 0.5960235378031383
750/1975     train_accuracy = 0.5943741677762983
800/1975     train_accuracy = 0.5962858926342073
850/1975     train_accuracy = 0.5977526439482961
900/1975     train_accuracy = 0.5962125416204217
950/1975     train_accuracy = 0.5944400630914827


Epoch:  50%|█████     | 2/4 [3:19:33<3:19:33, 5986.73s/it]


KeyboardInterrupt: ignored

In [None]:
# Persist only the model's learned parameters (its state_dict) to Drive.
checkpoint_path = '/content/drive/MyDrive/techsoc-analytics-21-22/weights/xlnet/weights_xlnet_7.pth'
torch.save(model.state_dict(), checkpoint_path)

In [None]:
# Load the test split and apply the same preprocessing steps as for training.
df = pd.read_csv("/content/drive/MyDrive/techsoc-analytics-21-22/data/test.csv")

# fillna('') keeps a missing title/content from turning the whole sentence into
# NaN, which would make the string concatenation below raise a TypeError.
df['sentence'] = df['title'].fillna('') + ' ' + df['content'].fillna('')

# NOTE(review): "[SEP]"/"[CLS]" are BERT-style markers, not XLNet's
# "<sep>"/"<cls>". Kept unchanged so test inputs match the training-time
# format — confirm, and change both places together if this is revised.
sentences = [sentence + " [SEP] [CLS]" for sentence in df['sentence'].values]

# Placeholder targets so the (ids, mask, label) dataset layout can be reused.
labels = [0]*len(df)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]


MAX_LEN = 256

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# 1.0 where the token id is greater than 0, 0.0 at the zero-valued padding.
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

batch_size = 16


prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
# Sequential order keeps predictions aligned with the rows of the test file.
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
import torch.nn.functional as F

# NOTE(review): this loads weights_xlnet_8.pth, while the save cell of this
# notebook writes weights_xlnet_7.pth — confirm which checkpoint is intended.
# map_location lets the load succeed when the file was written on a different
# device than `device`.
model.load_state_dict(torch.load('/content/drive/MyDrive/techsoc-analytics-21-22/weights/xlnet/weights_xlnet_8.pth',
                                 map_location=device))

model.eval()


# predictions: one probability tensor per batch; true_labels: the placeholder labels.
predictions , true_labels = [], []


for batch in prediction_dataloader:

  b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

  with torch.no_grad():
    # No labels are passed, so the first output element is the logits.
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]

  # Keep the probabilities on the CPU so the whole test set does not pile up
  # in GPU memory while iterating.
  predictions.append(F.softmax(logits, dim = 1).cpu())

  true_labels.append(b_labels.to('cpu').numpy())

In [None]:
# Stack the per-batch probabilities into one (num_examples, num_classes) tensor.
probs_xlnet = torch.cat(predictions)
# Most probable class per example, converted to a NumPy array on the CPU.
preds_xlnet = probs_xlnet.argmax(dim=1).cpu().numpy()

In [None]:
# Re-read the test file to get the uid column in its original row order.
df = pd.read_csv("/content/drive/MyDrive/techsoc-analytics-21-22/data/test.csv")

# Pair every uid with its predicted class id and write the result to CSV.
submission_weights = pd.DataFrame({
    'uid': df['uid'].tolist(),
    'target_ind': preds_xlnet,
})
submission_weights.to_csv('submission_xlnet_8.csv')