#データクレンジング

##ライブラリの準備とAIモデルの読み込み
まずは、データを扱うライブラリと、文章を理解するためのAIモデル（BERT）を準備する。

In [2]:
import pandas as pd             # Tabular data handling
import re                        # Regular-expression search and replace
from tqdm import tqdm           # Progress-bar display for long-running loops
import torch                    # Deep-learning tensor library
from transformers import BertTokenizer, BertModel # Pretrained BERT tokenizer and model

# Load the tokenizer that splits text into tokens (maximum length set to 512)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', model_max_length=512)
# Load the pretrained BERT model itself
model = BertModel.from_pretrained('bert-base-uncased')

# Use the GPU (cuda) when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model onto the selected device; as the last expression of the
# notebook cell, this call is also what prints the model summary
model.to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

##データの読み込みと結合
バラバラになっている「問題文」と「正解ラベル」のファイルを一つにまとめる。（テストデータのみ）

In [3]:
# Load the two halves of the test set: the labels and the comment texts.
df_label = pd.read_csv("test_labels.csv")
df_problem = pd.read_csv("test.csv")

# Inner-join them on "id" so that only rows present in both files survive.
df_test = df_label.merge(df_problem, on="id", how="inner")

# Persist the combined table; the positional index carries no information,
# so it is left out of the file.
df_test.to_csv("test_new.csv", index=False)

##テキストクリーニング用の辞書と関数の定義
AIが文章を理解しやすいように、短縮形（"don't"など）を元の形（"do not"）に直したり、不要な記号を消したりする「掃除」のルールを作る。

In [4]:
# Lookup table mapping lower-case English contractions to their expanded form.
# NOTE: "'d" is ambiguous ("had" / "would"); the entries below keep the choice
# made for each individual word.
cList = {
    "ain't": "am not", "aren't": "are not", "can't": "cannot",
    "can't've": "cannot have", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not",
    "haven't": "have not", "he'd": "he would", "he'd've": "he would have",
    "he'll": "he will", "he'll've": "he will have", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
    "how's": "how is", "i'd": "i would", "i'd've": "i would have",
    "i'll": "i will", "i'll've": "i will have", "i'm": "i am",
    "i've": "i have", "isn't": "is not", "it'd": "it had",
    "it'd've": "it would have", "it'll": "it will",
    "it'll've": "it will have", "it's": "it is", "let's": "let us",
    "ma'am": "madam", "mayn't": "may not", "might've": "might have",
    "mightn't": "might not", "mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have", "needn't": "need not",
    "needn't've": "need not have", "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not",
    "shan't've": "shall not have", "she'd": "she would",
    "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have", "so've": "so have", "so's": "so is",
    "that'd": "that would", "that'd've": "that would have",
    "that's": "that is", "there'd": "there had",
    "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have",
    "they'll": "they will", "they'll've": "they will have",
    "they're": "they are", "they've": "they have", "to've": "to have",
    "wasn't": "was not", "we'd": "we had", "we'd've": "we would have",
    "we'll": "we will", "we'll've": "we will have", "we're": "we are",
    "we've": "we have", "weren't": "were not", "what'll": "what will",
    "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will",
    "who'll've": "who will have", "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have", "will've": "will have",
    "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all",
    "y'alls": "you alls", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are",
    "y'all've": "you all have", "you'd": "you had",
    "you'd've": "you would have",
    # These two previously expanded to "you you will ..." (duplicated word).
    "you'll": "you will", "you'll've": "you will have",
    "you're": "you are", "you've": "you have",
}

def normalize_text(text):
    """Clean a raw comment and expand its English contractions.

    Steps, in order:
      1. Drop every character except ASCII letters, digits, whitespace and
         the punctuation marks ``' . , ? !``.
      2. Collapse a repeated ``.``, ``,``, ``?`` or ``!`` into a single one.
      3. Collapse every run of whitespace (newlines included) into one space.
      4. Split into word and punctuation tokens, replace each token found in
         ``cList`` (case-insensitively) by its expansion, and join the
         tokens with single spaces.  Punctuation therefore ends up
         space-separated ("Hi, you" -> "Hi , you").

    Args:
        text: The raw text.  Non-string values are converted with ``str``.

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None`` or a float NaN
        (the value pandas uses for a missing cell).
    """
    # Treat missing values as empty text instead of the literal "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # Keep only letters, digits, whitespace and a small set of punctuation.
    text = re.sub(r'[^a-zA-Z0-9\s\'.,?!]', '', text)
    # Squash repeated punctuation such as "!!!" into a single mark.
    text = re.sub(r'([.,?!])\1+', r'\1', text)
    # Normalise all whitespace (spaces, tabs, newlines) to single spaces.
    text = re.sub(r'\s+', ' ', text).strip()

    # A word token may carry one leading apostrophe ("'cause") and any number
    # of inner apostrophes ("couldn't've", "y'all'd've"), so every key of
    # cList can actually be matched.  Anything else that is not whitespace is
    # grouped into a punctuation token.
    words = re.findall(r"'?\w+(?:'\w+)*|[^\s\w]+", text)
    expanded_words = []

    for word in words:
        lower_word = word.lower()
        if (lower_word not in cList and len(word) > 1
                and word[0] == "'" and word[1].isalnum()):
            # An opening quote glued to an ordinary word: emit the quote as
            # its own token and carry on with the bare word.
            expanded_words.append("'")
            word, lower_word = word[1:], lower_word[1:]

        if lower_word in cList:
            expanded = cList[lower_word]
            # Preserve sentence-style capitalisation ("Don't" -> "Do not"),
            # looking past a leading apostrophe ("'Cause" -> "Because").
            if word.lstrip("'")[:1].isupper():
                expanded = expanded.capitalize()
            expanded_words.append(expanded)
        else:
            expanded_words.append(word)

    # Re-assemble the tokens into one space-separated string.
    return " ".join(expanded_words)

##AIが処理できる長さに調整
BERT（bert-base-uncased）は、一度に処理できる入力の長さが最大512トークンと決まっている。そのため、長すぎる文章を除外する。

In [7]:
def divide(file, max_tokens=512):
    """Clean a CSV of comments, drop over-long rows and split it by label.

    Reads ``<file>.csv``, adds a ``cleaned_text`` column (``normalize_text``
    applied to ``comment_text``) and a ``token_len`` column (number of token
    ids produced by ``tokenizer``, special tokens included), keeps the rows
    whose ``token_len`` does not exceed ``max_tokens`` and writes two files:

      * ``<prefix>_safe.csv`` with the rows where ``toxic == 0``
      * ``<prefix>_out.csv``  with the rows where ``toxic == 1``

    Rows whose ``toxic`` value is neither 0 nor 1 are written to neither
    file.  No embeddings are computed here.

    Args:
        file: Path of the input CSV without the ``.csv`` extension.  The
            name ``"test_new"`` is written under the prefix ``"test"``; any
            other name is used as its own prefix.
        max_tokens: Longest token sequence (special tokens included) that is
            kept.  Defaults to 512, the maximum length the tokenizer was
            configured with.

    Returns:
        None.  The results are the two CSV files.
    """
    # 1. Load the data and clean the comment text.
    df_workbook = pd.read_csv(f"{file}.csv")
    df_workbook['cleaned_text'] = df_workbook['comment_text'].apply(normalize_text)

    # 2. Measure each comment in tokens (not characters).  The lengths are
    #    taken before any filtering, so the tokenizer may warn about
    #    sequences longer than its maximum; that warning is expected here.
    df_workbook['token_len'] = df_workbook['cleaned_text'].apply(
        lambda cleaned: len(tokenizer.encode(cleaned, add_special_tokens=True))
    )

    # 3. Drop only the rows that exceed the limit; a sequence of exactly
    #    ``max_tokens`` tokens still fits and is kept.
    df_workbook = df_workbook[df_workbook['token_len'] <= max_tokens]

    # 4. Split into safe (toxic == 0) and toxic (toxic == 1) rows.
    df_safe = df_workbook[df_workbook['toxic'] == 0].reset_index(drop=True)
    df_out = df_workbook[df_workbook['toxic'] == 1].reset_index(drop=True)

    # 5. Save both parts.  "test_new" keeps its historical "test" prefix;
    #    every other input is saved under its own name, so no call ends
    #    without writing its result.
    prefix = "test" if file == "test_new" else file
    df_safe.to_csv(f"{prefix}_safe.csv", index=False)
    df_out.to_csv(f"{prefix}_out.csv", index=False)

##処理の実行
最後に、準備した関数を使って「テストデータ」と「学習データ」の両方を処理する。

In [6]:
# Process the merged test set ("test_new.csv") first, then the original
# training set ("train.csv").
for dataset_name in ("test_new", "train"):
    divide(dataset_name)

Token indices sequence length is longer than the specified maximum sequence length for this model (867 > 512). Running this sequence through the model will result in indexing errors
