In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

In [2]:
#!pip install torch torchvision -f https://download.pytorch.org/whl/cu102/torch_stable.html

In [3]:
if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

GPU is not available.


In [4]:
df_m=pd.read_csv('masked_nike.csv')

In [5]:
df_m

Unnamed: 0,masked_text,target_word
0,[Mask] comfort [Mask] quality doubt weather fa...,good
1,thought buying new shoe supposedly new shoe gu...,near
2,really [Mask] product nike comfortable wear lo...,good
3,comfortable shoe wearwhich made good quality l...,attracts
4,pleae dont buy neither looks like nike shoes ...,cheap
...,...,...
806,earlier [Mask] serveral orders nikestoredk web...,placed
807,couldnt believe somebody designed trainers ord...,accepted
808,[Mask]ed items unfortunately fit nike gave ful...,order
809,recieved fantastic pair [Mask] brand,trainers


In [6]:
def get_text_length(text):
    return len(text.split())

df_m['text_length'] = df_m['masked_text'].apply(get_text_length)
df_m = df_m[df_m['text_length'] <= 50]

In [7]:
df_m[df_m['text_length'] > 50]

Unnamed: 0,masked_text,target_word,text_length


In [8]:
df_m.shape

(747, 3)

In [9]:
df_m=df_m[['masked_text','target_word']]

In [10]:
df_m.head()

Unnamed: 0,masked_text,target_word
0,[Mask] comfort [Mask] quality doubt weather fa...,good
1,thought buying new shoe supposedly new shoe gu...,near
2,really [Mask] product nike comfortable wear lo...,good
3,comfortable shoe wearwhich made good quality l...,attracts
4,pleae dont buy neither looks like nike shoes ...,cheap


# BUILD MODEL

step for building model
1. Preprocess our text data: Convert the text into a format suitable for fine-tuning, such as converting all text to lowercase, removing stop words/depends on use case, punctuation and special characters, etc.

2. Mask the text: Replace a word in the text with the [Mask] placeholder to create a masked sequence. we can choose to mask a random word in each sequence or mask a specific word based on your task.

3. Create the target column: Create a target column that contains the masked word from the text column.

4. Convert the data into a format suitable for fine-tuning: Convert the text and target columns into numerical data suitable for fine-tuning the model.

5. Split the data into train, validation, and test sets: Split the preprocessed and converted data into three sets: training, validation, and testing.

6. Load the distilbert-based-uncased model: Load the pre-trained distilbert-based-uncased model from the transformers library.

7. Fine-tune the model: Train the model on the training data and use the validation data to adjust the model parameters for optimal performance.

8. Evaluate the model: Use the test data to evaluate the model's performance and make any necessary adjustments.

9. Save the fine-tuned model: Save the fine-tuned model for future use.

In [11]:
df_m.reset_index(drop=True)

Unnamed: 0,masked_text,target_word
0,[Mask] comfort [Mask] quality doubt weather fa...,good
1,thought buying new shoe supposedly new shoe gu...,near
2,really [Mask] product nike comfortable wear lo...,good
3,comfortable shoe wearwhich made good quality l...,attracts
4,pleae dont buy neither looks like nike shoes ...,cheap
...,...,...
742,ordered [Mask] pair shoes heartrate monitor\re...,gpsclock
743,headaches return products online although comp...,marketing
744,couldnt believe somebody designed trainers ord...,accepted
745,recieved fantastic pair [Mask] brand,trainers


# LOAD PRETRAINED MODEL

In [12]:
#step1 ,step2 and step3 is already 
#step4
from transformers import DistilBertTokenizer

# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")




# CREATE INPUT DATA(TENSOR)

In [13]:
# Convert the text and target columns into numerical data
encoded_texts = [tokenizer.encode(text, add_special_tokens=True) for text in df_m['masked_text']]
encoded_targets = [tokenizer.encode(target, add_special_tokens=True) for target in df_m['target_word']]


In [14]:
encoded_texts = [text[:70] for text in encoded_texts]#ONLY COLLECTING THEIR MAXIMUM LENGTH
encoded_targets=[text[:70] for text in encoded_targets]

In [15]:
# Pad the encoded data to the same length
max_len = max(len(text) for text in encoded_texts)
padded_texts = [text + [tokenizer.pad_token_id] * (max_len - len(text)) for text in encoded_texts]
padded_targets = [target + [tokenizer.pad_token_id] * (max_len - len(target)) for target in encoded_targets]

In [16]:
max_len

69

# SPLITTING DATA

In [17]:
from sklearn.model_selection import train_test_split

# Split the data into training, validation, and testing sets
train_texts, test_texts, train_targets, test_targets = train_test_split(padded_texts, padded_targets, test_size=0.2, random_state=42)
train_texts, val_texts, train_targets, val_targets = train_test_split(train_texts, train_targets, test_size=0.15, random_state=42)


In [18]:
import transformers
from transformers import DistilBertForMaskedLM, DistilBertTokenizer
# Load the DistilBERT model
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')


In [19]:
import torch
#model = model.to('cuda') # move the model to GPU if you have one

In [20]:
# Convert the input data into tensors
train_texts = torch.tensor(train_texts)
train_targets = torch.tensor(train_targets)
test_texts = torch.tensor(test_texts)
test_targets = torch.tensor(test_targets)

In [21]:
# Train the model
model.train()

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inp

In [22]:
torch.cuda.empty_cache()
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:<512>"

In [23]:
# Forward pass
logits = model(train_texts, attention_mask=train_targets)


In [24]:
len(logits[0]),

(507,)

In [25]:
# Compute the loss
criterion = torch.nn.CrossEntropyLoss()


In [27]:
loss = criterion(logits[0][0:10], train_targets[0:10])

RuntimeError: Expected target size [10, 30522], got [10, 69]

***
 a cross-entropy loss function is defined using the torch.nn.CrossEntropyLoss class.
 The forward pass is performed using the model object, and the result is stored in the logits variable. 
 The loss is then computed by passing the logits and the ground-truth labels (train_targets) to the criterion object.
 ***

In [None]:
# Compute the loss and optimize the model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss.backward()
optimizer.step()