In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
!pip install nltk
!pip install transformers
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
from nltk.corpus import stopwords
import re
import nltk
nltk.download('stopwords')

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

# 1. 데이터 불러오기

In [10]:
df = pd.read_csv('/content/drive/MyDrive/TM/Data/Bert1/Labeling1.csv', index_col = 0)
df.head()

Unnamed: 0,sentence,cat
0,Would be super sexy if you are only about 4 ft...,1.0
1,"I'm 5'5"" and this barely stretched to the unde...",1.0
2,Don't waste your $,3.0
3,Tiny fit and terrible print!,3.0
4,Got it for my brother who normally wears a large.,1.0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2366 entries, 0 to 499
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sentence  2366 non-null   object 
 1   cat       2366 non-null   float64
dtypes: float64(1), object(1)
memory usage: 55.5+ KB


# 2. 전처리


In [12]:
sw = stopwords.words('english')

In [13]:
def clean_text(text):

    text = text.lower()
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
    text = re.sub(r",", " ",text)
    html = re.compile(r'<.*?>')
    text = html.sub(r'',text)

    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'')

    text = [word.lower() for word in text.split() if word.lower() not in sw]
    text = " ".join(text)

    return text

In [14]:
df['sentence'] = df['sentence'].apply(lambda x: clean_text(x))

In [15]:
df['cat'] = df['cat'].apply(lambda x : 4 if x == -1 else x)

In [16]:
df.head()

Unnamed: 0,sentence,cat
0,would super sexy ft tall,1.0
1,barely stretched underside boobs,1.0
2,waste,3.0
3,tiny fit terrible print,3.0
4,got brother normally wears large,1.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2366 entries, 0 to 499
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sentence  2366 non-null   object 
 1   cat       2366 non-null   float64
dtypes: float64(1), object(1)
memory usage: 55.5+ KB


In [18]:
df.cat.unique()

array([1., 3., 0., 4., 2.])

In [19]:
train_sentence = df.sentence.values
train_labels = df.cat.values.astype('int')

# 3. 토큰화

In [20]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [21]:
max_len = 50

In [22]:
def MyTokenize(sentence):
  input_ids = []
  attention_masks = []

  for sent in sentence:
      encoded_dict = tokenizer.encode_plus(
                          sent,
                          add_special_tokens = True,
                          max_length = max_len,
                          pad_to_max_length = True,
                          return_attention_mask = True,
                          return_tensors = 'pt',
                    )

      input_ids.append(encoded_dict['input_ids'])

      attention_masks.append(encoded_dict['attention_mask'])

  input_ids = torch.cat(input_ids, dim=0)

  attention_masks = torch.cat(attention_masks, dim=0)

  return input_ids, attention_masks

In [23]:
train_labels = torch.tensor(train_labels)

In [24]:
train_input_ids, train_attention_masks = MyTokenize(train_sentence)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# 4. 모델 학습

In [25]:
dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)

train_size = int(0.8 * len(dataset))

val_size = len(dataset)  - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

1,892 training samples
  474 validation samples


In [26]:
batch_size = 32

train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

In [27]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 5,
    output_attentions = False,
    output_hidden_states = False,
)

model = model.to(device)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [28]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )



In [29]:
epochs = 10

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [30]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [31]:
def format_time(elapsed):

    elapsed_rounded = int(round((elapsed)))

    return str(datetime.timedelta(seconds=elapsed_rounded))

In [32]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

total_t0 = time.time()

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        optimizer.zero_grad()
        output = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        loss = output.loss
        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")
    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            output= model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()

        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)


    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model.state_dict(), '/content/drive/MyDrive/TM/Data/Bert1/Best_Bert1.pth')
        best_eval_accuracy = avg_val_accuracy

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 1.24
  Training epcoh took: 0:00:18

Running Validation...
  Accuracy: 0.58

Training...

  Average training loss: 0.89
  Training epcoh took: 0:00:14

Running Validation...
  Accuracy: 0.64

Training...

  Average training loss: 0.70
  Training epcoh took: 0:00:14

Running Validation...
  Accuracy: 0.64

Training...

  Average training loss: 0.56
  Training epcoh took: 0:00:15

Running Validation...
  Accuracy: 0.63

Training...

  Average training loss: 0.44
  Training epcoh took: 0:00:15

Running Validation...
  Accuracy: 0.66

Training...

  Average training loss: 0.35
  Training epcoh took: 0:00:15

Running Validation...
  Accuracy: 0.65

Training...

  Average training loss: 0.28
  Training epcoh took: 0:00:15

Running Validation...
  Accuracy: 0.63

Training...

  Average training loss: 0.22
  Training epcoh took: 0:00:15

Running Validation...
  Accuracy: 0.63

Training...

  Average training loss: 0.19
  Training epcoh took: 0:00:15

Runn

# 6. 모델 테스트

In [33]:
test_dataset  = pd.read_csv('/content/drive/MyDrive/TM/Data/Sentence.csv', index_col = 0)
test_dataset.head()

Unnamed: 0_level_0,sentence
column_num,Unnamed: 1_level_1
0,Would be super sexy if you are only about 4 ft...
0,"I'm 5'5"" and this barely stretched to the unde..."
0,Don't waste your $
1,Tiny fit and terrible print!
1,Got it for my brother who normally wears a large.


In [34]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 276775 entries, 0 to 109999
Data columns (total 1 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   sentence  276775 non-null  object
dtypes: object(1)
memory usage: 4.2+ MB


In [35]:
test_dataset['sentence2'] = test_dataset['sentence'].apply(lambda x: clean_text(x))

In [36]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

'''remove_df = pd.DataFrame({"col":[]})

def find(sentence):
  for i, sent in enumerate(sentence):
      input_ids = tokenizer.encode(sent, add_special_tokens=True)
      if len(input_ids) > 25:
        remove_df.loc[remove_df.index.max() + 1 if len(remove_df) > 0 else 0, "col"] = i

find(test_dataset.sentence)
test_dataset = test_dataset[~test_dataset.index.isin(remove_df['col'])]'''

'remove_df = pd.DataFrame({"col":[]})\n\ndef find(sentence):\n  for i, sent in enumerate(sentence):\n      input_ids = tokenizer.encode(sent, add_special_tokens=True)\n      if len(input_ids) > 25:\n        remove_df.loc[remove_df.index.max() + 1 if len(remove_df) > 0 else 0, "col"] = i\n\nfind(test_dataset.sentence)\ntest_dataset = test_dataset[~test_dataset.index.isin(remove_df[\'col\'])]'

In [37]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 276775 entries, 0 to 109999
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   sentence   276775 non-null  object
 1   sentence2  276775 non-null  object
dtypes: object(2)
memory usage: 6.3+ MB


In [38]:
test_sentence = test_dataset.sentence2

In [39]:
# 모델 인스턴스 생성
model2 = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 5, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# 저장된 가중치 불러오기
model2.load_state_dict(torch.load('/content/drive/MyDrive/TM/Data/Bert1/Best_Bert1.pth'))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

<All keys matched successfully>

In [40]:
test_input_ids, test_attention_masks = MyTokenize(test_sentence)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [41]:
test_tensor = TensorDataset(test_input_ids, test_attention_masks)

In [42]:
model2 = model2.to(device)

model2.eval()

test_dataloader = DataLoader(
    test_tensor,
    sampler = SequentialSampler(test_dataset),
    batch_size = 32
)

predictions = []

for batch in test_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)

    with torch.no_grad():
        outputs = model2(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()

    predictions.extend(np.argmax(logits, axis=1).flatten())

test_dataset['label']  = list(predictions)

In [43]:
test_dataset.head()

Unnamed: 0_level_0,sentence,sentence2,label
column_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Would be super sexy if you are only about 4 ft...,would super sexy ft tall,1
0,"I'm 5'5"" and this barely stretched to the unde...",barely stretched underside boobs,1
0,Don't waste your $,waste,3
1,Tiny fit and terrible print!,tiny fit terrible print,1
1,Got it for my brother who normally wears a large.,got brother normally wears large,1


In [44]:
test_dataset = test_dataset[['sentence', 'label']]

In [45]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 276775 entries, 0 to 109999
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   sentence  276775 non-null  object
 1   label     276775 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 6.3+ MB


In [46]:
test_dataset.to_csv('/content/drive/MyDrive/TM/Data/Bert1/Bert1.csv', index=False)