# 나만의 대화형 인공지능 활용법
### 학습용 데이터 만들기
#### - 한동대학교 AI융합교육원 김경외 교수

### 1. .csv 데이터 활용하기 (Case 1)

In [4]:
# Load the BBC news dataset into a DataFrame.
# Data source: https://github.com/mdsohaib/BBC-News-Classification/blob/master/bbc-text.csv
import pandas as pd

# Read the CSV from the local training_data directory.
bbc_text_df = pd.read_csv('training_data/bbc-text.csv')

# Display the first rows to inspect what was loaded.
bbc_text_df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [5]:
# Build a list of differently worded instructions for the news
# classification task. One of them is picked at random for every record
# when the data is converted to Alpaca format.
class_instructions = [
    "Determine the category of this article.",
    "Identify the genre of the following news.",
    "Classify the type of this piece.",
    "What's the main theme of this article?",
    "Assign a category to the given text.",
    "From the content below, deduce the news category.",
    "Categorize the below news piece.",
    "Determine the main subject of the following text.",
    "Designate a genre for this news.",
    "Based on the content, decide the article type."
]

In [8]:
# Convert the data to Alpaca format.
import random

# Build one Alpaca-style record per row: a randomly chosen instruction, the
# article text as the input and its category as the expected output.
# Iterating over to_dict(orient="records") yields plain dicts, which avoids
# the per-row Series that iterrows() builds and the index value that was
# never used.
bbc_class_alpaca = [
    {
        "instruction": random.choice(class_instructions),
        "input": record["text"],
        "output": record["category"],
    }
    for record in bbc_text_df.to_dict(orient="records")
]

# Preview the first few entries
bbc_class_alpaca[:2]

[{'instruction': 'Identify the genre of the following news.',
  'input': 'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology a

In [10]:
# Save the converted records as a .json file.
import json

# The encoding is given explicitly so the written file does not depend on
# the platform's default text encoding.
with open("training_data/bbc_class_alpaca.json", "w", encoding="utf-8") as f:
    json.dump(bbc_class_alpaca, f)

### 2. .csv 데이터 활용하기 (Case 2)

In [11]:
# Load the Bitcoin tweet sentiment dataset into a DataFrame.
# Data source: https://www.kaggle.com/datasets/aisolutions353/btc-tweets-sentiment

import pandas as pd

# Read the CSV from the local training_data directory and show its first row.
tweet_btc_df = pd.read_csv("training_data/BTC_Tweets_Updated.csv")
tweet_btc_df.head(1)

Unnamed: 0,id,Date,Tweet,Screen_name,Source,Link,Sentiment,sent_score,New_Sentiment_Score,New_Sentiment_State,BERT Labels
0,0,Fri Mar 23 00:40:32 +0000 2018,"RT @ALXTOKEN: Paul Krugman, Nobel Luddite. I h...",myresumerocket,[],"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",['neutral'],0.0,0.0,0.0,1


In [12]:
# Convert the numeric target value into a label.
def sentiment_score_to_name(score: float):
    """Map a numeric sentiment score to a sentiment label.

    Returns "Positive" for a score above zero, "Negative" for a score
    below zero, and "Neutral" for anything else.
    """
    if score > 0:
        label = "Positive"
    else:
        # Neither comparison holds for a zero score, so it falls to "Neutral".
        label = "Negative" if score < 0 else "Neutral"
    return label

In [13]:
# Build a list of differently worded instructions for the tweet sentiment
# task. One of them is picked at random for every record when the data is
# converted to Alpaca format.
tweet_instructions = [
    "Could you assess the emotion of this tweet?",
    "Can you determine the mood conveyed in this tweet?",
    "Please evaluate the sentiment expressed in this tweet.",
    "I'd appreciate it if you could gauge the sentiment of this tweet.",
    "Let me know the feeling this tweet is projecting, please.",
    "Help me understand the emotional tone of this tweet.",
    "Examine the tweet and inform me of its sentiment.",
    "I'm curious about the sentiment behind this tweet; can you analyze it?",
    "What's the emotional undertone of this tweet? Please check.",
    "Could you delve into this tweet and tell me its sentiment?"
]

In [14]:
# Convert the data to Alpaca format.
import random

# Each record is one row of tweet_btc_df as a plain dict. The loop variable
# has its own name so it no longer shadows the DataFrame it iterates over.
tweet_btc_alpaca = [
    {
        "instruction": random.choice(tweet_instructions),
        "input": record["Tweet"],
        # Turn the numeric score into a Positive/Negative/Neutral label.
        "output": sentiment_score_to_name(record["sent_score"]),
    }
    for record in tweet_btc_df.to_dict(orient="records")
]

# Show the first converted record.
tweet_btc_alpaca[0]

{'instruction': "What's the emotional undertone of this tweet? Please check.",
 'input': "RT @ALXTOKEN: Paul Krugman, Nobel Luddite. I had to tweak the nose of this Bitcoin enemy. He says such foolish things. Here's the link: htt…",
 'output': 'Neutral'}

In [15]:
# Save the converted records as a .json file.
import json

# The encoding is given explicitly so the written file does not depend on
# the platform's default text encoding.
with open("training_data/tweet_btc_alpaca.json", "w", encoding="utf-8") as f:
    json.dump(tweet_btc_alpaca, f)

### 3. Hugging Face 데이터 활용하기

In [1]:
# Load the dataset with the load_dataset function from the datasets library.
# Data source: https://huggingface.co/datasets/tatsu-lab/alpaca/viewer/default/train?row=0
from datasets import load_dataset
alpaca_dataset = load_dataset("tatsu-lab/alpaca")

Found cached dataset parquet (C:/Users/user/.cache/huggingface/datasets/tatsu-lab___parquet/tatsu-lab--alpaca-2b32f0433506ef5f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
# Check that the dataset was loaded correctly by displaying its summary.
alpaca_dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})

In [3]:
# Convert the train split to a DataFrame and show its first row to inspect
# the structure.
import pandas as pd
alpaca_dataset_df = pd.DataFrame(alpaca_dataset['train'])
alpaca_dataset_df.head(1)

Unnamed: 0,instruction,input,output,text
0,Give three tips for staying healthy.,,1.Eat a balanced diet and make sure to include...,Below is an instruction that describes a task....


In [8]:
# Convert the data to Alpaca format.
# Only the instruction, input and output fields of each row are kept; the
# dataset's remaining 'text' column is left out. The loop variable has its
# own name so it no longer shadows the DataFrame it iterates over.
alpaca_instruct = [
    {
        "instruction": record["instruction"],
        "input": record["input"],
        "output": record["output"],
    }
    for record in alpaca_dataset_df.to_dict(orient="records")
]
alpaca_instruct[0]

{'instruction': 'Give three tips for staying healthy.',
 'input': '',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}

In [6]:
# Save the converted records as a .json file.
import json

# The encoding is given explicitly so the written file does not depend on
# the platform's default text encoding.
with open("training_data/alpaca_instruct.json", "w", encoding="utf-8") as f:
    json.dump(alpaca_instruct, f)