In [None]:
# Shell escape: ask the NVIDIA driver which GPU (if any) is attached to this runtime.
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer

In [None]:
class Settings:
    """Notebook-wide configuration values, read as class attributes."""

    # Seed value; no cell visible in this notebook reads it yet.
    seed = 768

    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Token length handed to the dataset (used there as the tokenizer's max_length).
    max_len = 350

    # Number of rows per DataLoader batch.
    batch_size = 100

In [None]:
class TrainValidDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        df: DataFrame providing a ``"tweet"`` column (text) and a ``"label"`` column.
        tokenizer: Object exposing ``encode_plus`` whose result has
            ``"input_ids"`` and ``"attention_mask"`` entries.
        max_len: Length passed to the tokenizer as ``max_length``; sequences
            are truncated / padded to it.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        # Pull both columns out as arrays once; items are then looked up by position.
        self.text = df["tweet"].values
        self.target = df["label"].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``ids`` / ``mask`` / ``targets`` tensors for row ``idx``."""
        encoding = self.tokenizer.encode_plus(
            self.text[idx],
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
        )
        label = self.target[idx]
        return {
            "ids": torch.LongTensor(encoding["input_ids"]),
            "mask": torch.LongTensor(encoding["attention_mask"]),
            "targets": torch.tensor(label, dtype=torch.float32),
        }

In [None]:
class CommonLitRoBERTa(nn.Module):
    """Thin ``nn.Module`` wrapper around a pretrained RoBERTa encoder.

    Args:
        pretrained_path: Identifier passed unchanged to
            ``RobertaModel.from_pretrained``.
    """

    def __init__(self, pretrained_path):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(pretrained_path)

    def forward(self, ids, mask):
        """Run the encoder and hand back its output object untouched.

        Args:
            ids: Token ids, forwarded as the first positional argument.
            mask: Forwarded to the encoder as ``attention_mask``.
        """
        return self.roberta(ids, attention_mask=mask)

In [None]:
# Build the wrapper around the "roberta-base" checkpoint and move it to the
# configured device; nn.Module.to() returns the module itself, so it can be chained.
model = CommonLitRoBERTa("roberta-base").to(Settings.device)
model

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CommonLitRoBERTa(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(

In [None]:
# Load the tokenizer from the same "roberta-base" checkpoint name the model was built from.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# Bare last expression: display the tokenizer's repr.
tokenizer

PreTrainedTokenizer(name_or_path='roberta-base', vocab_size=50265, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})

In [None]:
# Load the raw tweet table.
datatweet = pd.read_csv("DataSet.csv")

# Replace the raw "label" values with integer class ids.
# (LabelEncoder is imported in the notebook's import cell.)
# Bracket assignment is used because attribute-style assignment only works for
# columns that already exist and can silently create a plain attribute otherwise.
le = LabelEncoder()
datatweet["label"] = le.fit_transform(datatweet["label"].values)

In [None]:
# Hold out 20% of the rows as the test split; the fixed random_state keeps the
# split identical across runs.
train, test = train_test_split(datatweet, test_size=0.2, random_state=42)

# index=False: otherwise the row index is written as an extra unnamed column,
# which comes back as "Unnamed: 0" when these files are read again.
train.to_csv("train_df.csv", index=False)
test.to_csv("test_df.csv", index=False)

In [None]:
# (rows, columns) of the full label-encoded table.
datatweet.shape

(49, 4)

In [None]:
# Preview the first rows of the full table.
datatweet.head()

Unnamed: 0,id_akun,label,username,tweet
0,1,0,12290F,its besar salah for me butuh kj to mtk anjing ...
1,2,0,abiyoga,if you like pineapple and you like pizza you s...
2,3,0,adityarestya,mbak analogi nya ting logika pikir nya lebih t...
3,4,0,akhmad arie,iri keren daftar kuis nama ariyanto alamat kot...
4,5,0,aldryandimas,standar ganda salah salah ayo ajar me iya sepe...


In [None]:
# Read both splits back from disk and drop every row that has a missing value.
df_train = pd.read_csv("train_df.csv").dropna()
df_test = pd.read_csv("test_df.csv").dropna()

In [None]:
# load the datasets

# df_test = pd.read_csv("test_df.csv")

In [None]:
# Wrap the training frame so that every item is one tokenized tweet plus its label.
train_dataset = TrainValidDataset(df_train, tokenizer, Settings.max_len)

# This loader feeds embedding extraction rather than a training loop:
#  - shuffle=False keeps batch rows in the same order as df_train, so every
#    extracted embedding can be matched back to its tweet / label;
#  - the worker count is capped at the CPUs the host reports (a fixed 8 can
#    exceed what the runtime offers);
#  - pinned host memory is only requested when the target device is a GPU.
train_loader = DataLoader(
    train_dataset,
    batch_size=Settings.batch_size,
    shuffle=False,
    num_workers=min(8, os.cpu_count() or 1),
    pin_memory=(Settings.device == "cuda"),
)

  cpuset_checked))


In [None]:
# Pull the first mini-batch out of the loader.

batch = next(iter(train_loader))

  cpuset_checked))


In [None]:
# Display the batch dict: the "ids", "mask" and "targets" tensors built by the dataset.
batch

{'ids': tensor([[    0, 19807,  6713,  ...,  1115,    50,     2],
         [    0, 48883, 36091,  ..., 47510, 44919,     2],
         [    0,  1694, 10905,  ...,  1908,   417,     2],
         ...,
         [    0,   428,  1115,  ...,     7,  2231,     2],
         [    0, 40577, 40577,  ..., 15116,  5186,     2],
         [    0,    90,   808,  ..., 11877,   295,     2]]),
 'mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'targets': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}

In [None]:
# Move each tensor of the batch onto the configured device.
ids, mask, targets = (
    batch[key].to(Settings.device) for key in ("ids", "mask", "targets")
)

# Print the three shapes, one per line.
print(ids.shape, mask.shape, targets.shape, sep="\n")

torch.Size([39, 350])
torch.Size([39, 350])
torch.Size([39])


In [None]:
# Inference only: disable autograd so no computation graph is built or kept
# alive for this forward pass (saves memory and time).
with torch.no_grad():
    output = model(ids, mask)
output

BaseModelOutputWithPoolingAndCrossAttentions([('last_hidden_state',
                                               tensor([[[-0.0497,  0.0501,  0.0395,  ..., -0.1671, -0.0434,  0.0124],
                                                        [ 0.0870,  0.0581, -0.0286,  ..., -0.2379,  0.1798, -0.0914],
                                                        [ 0.0260,  0.0976,  0.0097,  ...,  0.1993, -0.0131,  0.1843],
                                                        ...,
                                                        [ 0.0827,  0.0518,  0.1175,  ..., -0.2603,  0.0658,  0.0021],
                                                        [ 0.1608, -0.2326,  0.1314,  ...,  0.2852, -0.0720,  0.0396],
                                                        [-0.0449,  0.0426,  0.0308,  ..., -0.1954, -0.0366,  0.0015]],
                                               
                                                       [[-0.0485,  0.0495,  0.0350,  ..., -0.1780,  0.0022,  0.039

In [None]:
# output[0] is the "last_hidden_state" entry of the model output.
last_hidden_state = output[0]
print("shape:", last_hidden_state.shape)

shape: torch.Size([39, 350, 768])


In [None]:
# output[1] is the second entry of the model output (the pooler output).
pooler_output = output[1]
print("shape:", pooler_output.shape)

shape: torch.Size([39, 768])


In [None]:
# Keep only sequence position 0 of every row (the leading <s> token, which the
# tokenizer uses as its cls_token) and detach the result from autograd.
cls_embeddings = last_hidden_state[:, 0, :].detach()

# Shape first, then an empty line, then the tensor itself.
print("shape:", cls_embeddings.shape, end="\n\n")
print(cls_embeddings)

shape: torch.Size([39, 768])

tensor([[-0.0497,  0.0501,  0.0395,  ..., -0.1671, -0.0434,  0.0124],
        [-0.0485,  0.0495,  0.0350,  ..., -0.1780,  0.0022,  0.0390],
        [-0.0329,  0.0188,  0.0410,  ..., -0.1831, -0.0335,  0.0314],
        ...,
        [-0.0550,  0.0254,  0.0378,  ..., -0.1903, -0.0359,  0.0226],
        [-0.0437,  0.0516,  0.0565,  ..., -0.1419, -0.0319,  0.0361],
        [-0.0575,  0.0030,  0.0341,  ..., -0.2018, -0.0151, -0.0086]])


In [None]:
# Preview the position-0 embeddings as a table (one row per batch item).
pd.DataFrame(cls_embeddings.cpu().numpy()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.049687,0.050111,0.039524,-0.117616,0.009222,-0.191291,0.015327,0.016254,0.013182,-0.019113,...,0.100256,0.028469,-0.131216,-0.079339,-0.059993,-0.031143,0.159584,-0.167118,-0.043415,0.012353
1,-0.048485,0.049538,0.034984,-0.092724,-0.04046,-0.223916,0.015657,0.03955,0.01093,-0.044281,...,0.104432,0.041962,-0.114383,-0.088692,-0.047583,-0.014563,0.167224,-0.178036,0.002216,0.039033
2,-0.032903,0.018809,0.041032,-0.0978,0.013628,-0.187409,0.00683,0.082632,0.023073,-0.072869,...,0.094917,0.050176,-0.161191,-0.073268,-0.104327,0.006934,0.137702,-0.1831,-0.033492,0.031351
3,-0.031474,0.018772,0.032856,-0.106333,0.015122,-0.18231,-0.005653,0.050389,0.034509,-0.052329,...,0.1055,0.055012,-0.15469,-0.107367,-0.056426,0.015322,0.178858,-0.215475,-0.033499,0.010702
4,-0.052631,0.030343,0.03942,-0.082525,-0.032423,-0.194593,0.006505,0.02578,0.027278,-0.042045,...,0.101105,0.042587,-0.091588,-0.060145,-0.059455,-0.007481,0.155557,-0.211936,-0.053768,-0.016025


In [None]:
# (rows, columns) of the position-0 embedding table.
pd.DataFrame(cls_embeddings.cpu().numpy()).shape

(39, 768)

In [None]:
# Shape of the full hidden-state tensor that is pooled in the next cell.
last_hidden_state.shape

torch.Size([39, 350, 768])

In [None]:
# Apply average pooling to the token embeddings: one vector per tweet.
# The attention mask is used as a weight so that padding positions (mask == 0)
# do not contribute; a plain .mean(dim=1) would also average the pad tokens.
token_mask = mask.unsqueeze(-1).to(last_hidden_state.dtype)
# clamp guards against division by zero for a row made only of padding.
token_counts = token_mask.sum(dim=1).clamp(min=1)
pooled_embeddings = (last_hidden_state.detach() * token_mask).sum(dim=1) / token_counts

print("shape:", pooled_embeddings.shape)
print("")
print(pooled_embeddings)

shape: torch.Size([39, 768])

tensor([[-0.0865,  0.0199,  0.0868,  ..., -0.2682, -0.0441, -0.0028],
        [-0.0539,  0.0589,  0.0742,  ..., -0.2769,  0.0161,  0.0616],
        [ 0.0096, -0.0109,  0.1060,  ..., -0.3105, -0.0092,  0.0276],
        ...,
        [-0.0581,  0.0028,  0.1000,  ..., -0.3127, -0.0024,  0.0096],
        [-0.0078,  0.0389,  0.1051,  ..., -0.1949, -0.0029,  0.0579],
        [-0.0689,  0.0043,  0.0958,  ..., -0.3116,  0.0035,  0.0054]])


In [None]:
# Preview the average-pooled embeddings as a table (one row per batch item).
pd.DataFrame(pooled_embeddings.cpu().numpy()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.086476,0.019859,0.086814,0.118813,-0.086344,-0.291477,-0.056506,0.047834,-0.001696,-0.017935,...,0.182298,-0.03234,-0.194552,0.054748,-0.062074,-0.205619,0.318559,-0.268161,-0.044116,-0.002827
1,-0.05392,0.058911,0.074153,0.092516,-0.224913,-0.280537,-0.008037,0.036741,-0.009352,-0.043444,...,0.114547,0.013849,-0.131253,0.027463,-0.069676,-0.157081,0.359347,-0.276895,0.016094,0.061609
2,0.009632,-0.010867,0.106044,0.101713,-0.078762,-0.20516,-0.001309,0.106982,0.034212,-0.044682,...,0.094439,0.016506,-0.151334,0.037902,-0.120236,-0.121229,0.344951,-0.310451,-0.0092,0.02764
3,-0.075851,-0.028208,0.08366,0.123008,-0.100873,-0.229472,-0.039923,0.03018,-0.010829,-0.030644,...,0.14372,0.034368,-0.238438,0.0202,-0.074293,-0.142198,0.386423,-0.360985,-0.013981,-0.013176
4,-0.068081,0.044345,0.106599,0.117576,-0.236935,-0.240149,-0.052856,0.050336,-0.007994,-0.036271,...,0.153078,-0.01007,-0.166419,0.03915,-0.072276,-0.174403,0.278976,-0.307429,-0.040699,-0.012276


In [None]:
# (rows, columns) of the average-pooled embedding table.
pd.DataFrame(pooled_embeddings.cpu().numpy()).shape

(39, 768)

In [None]:
# Persist the average-pooled embeddings of this batch to CSV.
# to_csv is called with its defaults, so the integer row index is written as well.
pd.DataFrame(pooled_embeddings.cpu().numpy()).to_csv("roberta_embeddings.csv")