<a href="https://colab.research.google.com/github/Yoshikawawawa/GoogleColab/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# pip

In [None]:
# Install the Python packages this notebook relies on.
! pip install transformers datasets
! pip install fugashi ipadic
! pip install sentencepiece
# Clone the WRIME repository (the TSV loaded later is read from this checkout).
!git clone https://github.com/ids-cv/wrime.git

# データ確認・整理

In [None]:
import pandas as pd
# Read the tab-separated WRIME file from the cloned repository and print a
# column/dtype summary.
df_wrime = pd.read_table("/content/wrime/wrime-ver1.tsv")  # ver1 of the corpus is used here
df_wrime.info()

In [None]:
# Emotion names for the reader-side (objective) annotations; this order
# defines the order of every intensity/label vector used in the notebook.
emotion_names = ['Anger','Anticipation','Disgust','Fear','Joy','Sadness','Surprise','Trust']


def _reader_intensities(row):
    """Return one row's 'Avg. Readers_<emotion>' values as a list ordered like emotion_names."""
    return [row['Avg. Readers_' + name] for name in emotion_names]


# Collect the eight per-emotion columns into a single list-valued column.
df_wrime['readers_emotion_intensities'] = df_wrime.apply(_reader_intensities, axis=1)
# Keep only the rows whose strongest emotion has an intensity of 2 or more.
is_target = df_wrime['readers_emotion_intensities'].map(max).ge(2)
df_wrime_target = df_wrime[is_target]

In [None]:
# Display the list-valued intensity column of df_wrime (all rows, not only the filtered ones).
df_wrime['readers_emotion_intensities']

# 訓練・テストに分割

In [None]:
# Group the filtered rows by the value of the 'Train/Dev/Test' column.
df_groups = df_wrime_target.groupby('Train/Dev/Test')
df_train = df_groups.get_group('train')
# The 'dev' and 'test' groups are concatenated into one held-out set.
df_test = pd.concat([df_groups.get_group(split) for split in ('dev', 'test')])
# Report the number of rows in the training and test sets.
print(f'train : {len(df_train)}')
print(f'test : {len(df_test)}')

# 事前学習済みモデル読み込み・データ整形

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Uses the pretrained model published by the Inui laboratory at Tohoku University.
# Load the tokenizer and the model from the same checkpoint name.
MODEL = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# num_labels=8 matches the eight entries of emotion_names.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=8)

In [None]:
from datasets import Dataset
import numpy as np
# 1. Convert the data to the dataset format used with Transformers
# pandas.DataFrame -> datasets.Dataset
# Only the sentence text and the intensity list column are carried over.
target_columns = ['Sentence', 'readers_emotion_intensities']
train_dataset = Dataset.from_pandas(df_train[target_columns])
test_dataset = Dataset.from_pandas(df_test[target_columns])
# 2. Apply the tokenizer (preprocessing for model input)
def tokenize_function(batch):
    """Tokenize a batch of sentences and attach normalized emotion labels.

    The sentences in batch['Sentence'] are tokenized with truncation enabled
    and padding='max_length'. Each vector in
    batch['readers_emotion_intensities'] is divided by its own sum, so every
    label vector sums to 1. The tokenizer output, extended with a 'labels'
    entry, is returned.
    """
    encoded = tokenizer(batch['Sentence'], truncation=True, padding='max_length')
    labels = []
    for intensities in batch['readers_emotion_intensities']:
        # Normalize so the components of the vector sum to 1.
        labels.append(intensities / np.sum(intensities))
    encoded['labels'] = labels
    return encoded

# Run tokenize_function over both datasets in batched mode.
train_tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
test_tokenized_dataset = test_dataset.map(tokenize_function, batched=True)

# 訓練開始

In [None]:
from transformers import TrainingArguments, Trainer
from datasets import load_metric
from transformers import AutoTokenizer

# Define the evaluation metric
# https://huggingface.co/docs/transformers/training
# Accuracy is computed directly with NumPy rather than through
# `datasets.load_metric("accuracy")`: that helper is deprecated in favour of
# the separate `evaluate` library and is not available in recent `datasets`
# releases, and it fetches the metric definition at run time.
# NOTE(review): the `from datasets import load_metric` import in this cell is
# not needed by this code and can be dropped once nothing else uses it.
def compute_metrics(eval_pred):
    """Return the top-1 accuracy of the predictions.

    Args:
        eval_pred: A pair ``(logits, labels)``. Both are reduced with
            ``argmax`` over the last axis, so the predicted class is the
            highest logit and the reference class is the largest label
            component.

    Returns:
        A dict ``{"accuracy": value}`` where ``value`` is the fraction of
        rows whose predicted class equals the reference class.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    label_ids = np.argmax(labels, axis=-1)
    return {"accuracy": float(np.mean(predictions == label_ids))}

# Training configuration
# https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/trainer#transformers.TrainingArguments
# NOTE(review): newer transformers releases appear to name the
# `evaluation_strategy` argument `eval_strategy` — confirm against the
# installed version, since the install cell does not pin one.
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=8,
    num_train_epochs=1.0,
    evaluation_strategy="steps",eval_steps=200)  # evaluate on the test data every 200 steps

# Create the Trainer
# The combined dev+test set is passed as the evaluation dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    compute_metrics=compute_metrics,
)

# Run training
trainer.train()

In [None]:
# Save the trained model together with its tokenizer
# NOTE(review): the target looks like a Google Drive mount point, but no
# drive mount call is visible in this notebook — confirm Drive is mounted
# before running this cell.
tokenizer.save_pretrained("/content/drive/MyDrive")  # set this to the directory you want to save to
model.save_pretrained("/content/drive/MyDrive")  # set this to the directory you want to save to

# 文章から推論

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Softmax function
# https://www.delftstack.com/ja/howto/numpy/numpy-softmax/
def np_softmax(x):
    """Return the softmax of ``x`` taken over all of its elements.

    Args:
        x: Array-like of scores (for example a 1-D vector of logits).

    Returns:
        A NumPy array with the same shape as ``x`` whose elements are
        ``exp(x_i) / sum_j exp(x_j)``.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalize; return an empty floating-point array.
        return np.exp(x)
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps every exponent <= 0, so large scores no longer overflow to
    # inf and produce NaN.
    exp_shifted = np.exp(x - np.max(x))
    return exp_shifted / np.sum(exp_shifted)

def analyze_emotion(text, show_fig=True):
    """Predict the emotion distribution of ``text`` and plot or print it.

    The text is tokenized, passed through the model, and the logits of the
    first (only) sequence are turned into probabilities with ``np_softmax``.
    The probabilities are paired with ``emotion_names`` in order.

    Args:
        text: Input sentence to analyze.
        show_fig: When True, draw a bar chart of the probabilities; when
            False, print the ``{emotion name: probability}`` dict instead.

    Returns:
        None. The result is only plotted or printed.
    """
    # Local import: torch is only needed here to switch off autograd.
    import torch

    # Enable inference (evaluation) mode
    model.eval()

    # Convert the input and run inference
    tokens = tokenizer(text, truncation=True, return_tensors="pt")
    tokens = tokens.to(model.device)
    # No gradients are needed for prediction; skipping autograd avoids
    # building a computation graph for every call.
    with torch.no_grad():
        preds = model(**tokens)
    prob = np_softmax(preds.logits.cpu().numpy()[0])
    out_dict = dict(zip(emotion_names, prob))

    # Draw the bar chart
    if show_fig:
        sns.set()
        plt.figure(figsize=(8, 3))
        plt.ylim(0, 1.0)
        df = pd.DataFrame(out_dict.items(), columns=['name', 'prob'])
        sns.barplot(x='name', y='prob', data=df)
        plt.title('入力文 : ' + text)
    else:
        print(out_dict)
# Put the text to analyze in the string below
analyze_emotion('')