# Generation for NLP Baseline Code

```
python3.10 -m venv --system-site-packages /data/ephemeral/home/py310
source /data/ephemeral/home/py310/bin/activate
pip install --upgrade pip
```
위의 커맨드를 사용하여 가상환경을 만들고, IDE의 커널을 방금 생성한 가상환경으로 변경해주세요.

## Install Packages

In [None]:
# !pip install --no-cache-dir torch==2.9.1+cu128 --index-url https://download.pytorch.org/whl/cu128
# !pip install --no-cache-dir -r requirements.txt

## Import Necessary Libraries

In [None]:
import torch
import transformers
from ast import literal_eval
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import Dataset
import json
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import evaluate
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from peft import AutoPeftModelForCausalLM, LoraConfig

pd.set_option('display.max_columns', 1000)

In [None]:
# Fix the random seeds
def set_seed(random_seed):
    """Seed the Python, NumPy and PyTorch random generators with ``random_seed``.

    Also sets ``cudnn.deterministic = True`` and ``cudnn.benchmark = False``
    on ``torch.backends.cudnn``.
    """
    # Python and NumPy generators
    random.seed(random_seed)
    np.random.seed(random_seed)

    # PyTorch generators; manual_seed_all matters when using multiple GPUs
    for seed_fn in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(random_seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(42)  # magic number :)

## EDA

### Load Data

In [None]:
# Load the train dataset
# TODO: set the path to the train data
dataset = pd.read_csv('/data/ephemeral/pro-nlp-generationfornlp-nlp-13/data/train.csv')

# Flatten the nested 'problems' column: each cell is parsed with literal_eval
# into a dict whose fields become top-level columns of the new frame.
records = []
for _, row in dataset.iterrows():
    problems = literal_eval(row['problems'])
    record = {
        'id': row['id'],
        'paragraph': row['paragraph'],
        # Required keys: a missing one raises KeyError
        'question': problems['question'],
        'choices': problems['choices'],
        # Optional keys: default to None when absent, so no separate
        # membership check is needed afterwards
        'answer': problems.get('answer'),
        'question_plus': problems.get('question_plus'),
    }
    records.append(record)

# Convert to DataFrame
df = pd.DataFrame(records)

### Print missing values

In [None]:
# Print a header followed by the per-column count of null entries
print("\nMissing values in each column:", df.isnull().sum(), sep="\n")

### Basic information about the dataset

In [None]:
# Print a header, then the summary that DataFrame.info() writes out
print("\nDataset Information:")
df.info()

### EDA on 'question' and 'choices'

In [None]:
# Replace missing 'question_plus' values with an empty string
df['question_plus'] = df['question_plus'].fillna('')


def _merge_question(row):
    """Return the question, followed by a space and 'question_plus' when it is non-empty."""
    extra = row['question_plus']
    if not extra:
        return row['question']
    return row['question'] + ' ' + extra


df['full_question'] = df.apply(_merge_question, axis=1)

# Store len() of each combined question and of each paragraph
df['question_length'] = df['full_question'].map(len)
df['paragraph_length'] = df['paragraph'].map(len)

### Question Length Distribution

In [None]:
# Histogram of the 'question_length' column (30 bins)
fig, ax = plt.subplots(figsize=(5, 3))
ax.hist(df['question_length'], bins=30, edgecolor='black', alpha=0.7)
ax.set_title('Distribution of Question Lengths')
ax.set_xlabel('Question Length')
ax.set_ylabel('Frequency')
plt.show()

### Paragraph Length Distribution

In [None]:
# Histogram of the 'paragraph_length' column (30 bins)
fig, ax = plt.subplots(figsize=(5, 3))
ax.hist(df['paragraph_length'], bins=30, edgecolor='black', alpha=0.7)
ax.set_title('Distribution of Paragraph Lengths')
ax.set_xlabel('Paragraph Length')
ax.set_ylabel('Frequency')
plt.show()

### 질문 길이와 본문 길이 상관관계 분석

In [None]:
def plot_scatter_comparison(df):
    """Plot question length against paragraph length as a scatter and a hexbin.

    Reads the 'question_length' and 'paragraph_length' columns of ``df``,
    draws the two panels side by side, saves the figure to
    'scatter_comparison.png' and then shows it.
    """
    q_len = df['question_length']
    p_len = df['paragraph_length']

    fig, (scatter_ax, density_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: plain scatter plot
    scatter_ax.scatter(q_len, p_len, alpha=0.5, s=10)
    scatter_ax.set_xlabel('Question Length (characters)')
    scatter_ax.set_ylabel('Paragraph Length (characters)')
    scatter_ax.set_title('Question Length vs Paragraph Length')
    scatter_ax.grid(alpha=0.3)

    # Annotate the scatter with the correlation computed by Series.corr
    correlation = q_len.corr(p_len)
    scatter_ax.text(
        0.05,
        0.95,
        f'Correlation: {correlation:.3f}',
        transform=scatter_ax.transAxes,
        verticalalignment='top',
        bbox={'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.5},
    )

    # Right panel: density view (hexbin) with a colorbar for the bin counts
    hexes = density_ax.hexbin(q_len, p_len, gridsize=30, cmap='YlOrRd')
    density_ax.set_xlabel('Question Length (characters)')
    density_ax.set_ylabel('Paragraph Length (characters)')
    density_ax.set_title('Density Plot (Hexbin)')
    plt.colorbar(hexes, ax=density_ax, label='Count')

    plt.tight_layout()
    plt.savefig('scatter_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()


plot_scatter_comparison(df)

## 심층 EDA Frequency of Words in Question

In [None]:
# !pip install konlpy
# !pip install wordcloud 

In [None]:
from konlpy.tag import Okt
from collections import Counter

def Q_korean_word_freq(df):
    """Count noun frequencies across the 'question' column of ``df``.

    All questions are joined into one string, nouns are extracted with the
    Okt tagger, single-character nouns and stopwords are dropped, and the
    30 most common remaining nouns are printed.

    Returns:
        The Counter mapping each remaining noun to its count.
    """
    excluded = {
        '것', '수', '등', '및', '제', '때', '대한',
        '옳은', '않은', '틀린', '설명', '인가', '무엇',
        '다음', '보기', '문제', '답', '번', '있는', '가장',
    }
    tagger = Okt()

    # Join every question into a single text
    joined = ' '.join(df['question'].astype(str))

    # Morphological analysis (nouns only); keep nouns longer than one
    # character that are not stopwords, and count them
    word_freq = Counter(
        noun
        for noun in tagger.nouns(joined)
        if not (len(noun) <= 1 or noun in excluded)
    )

    # Top 30
    print("=== 상위 30개 명사 ===")
    for word, count in word_freq.most_common(30):
        print(f"{word}: {count}번")

    return word_freq


Q_freq = Q_korean_word_freq(df)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def create_wordcloud_from_freq(
    word_freq,
    title='단어 빈도 워드클라우드',
    font_path='/usr/share/fonts/truetype/nanum/NanumGothic.ttf',
):
    """Render and show a word cloud built from a word -> frequency mapping.

    Args:
        word_freq: Mapping of word to count (e.g. a ``collections.Counter``),
            passed directly to ``WordCloud.generate_from_frequencies``.
        title: Text drawn above the word cloud. An empty value skips the title.
        font_path: Path to a font file able to render Korean text. The default
            is the NanumGothic location on Linux; alternatives are
            'C:/Windows/Fonts/malgun.ttf' on Windows and
            '/System/Library/Fonts/AppleGothic.ttf' on macOS.
    """
    # Local import keeps the new dependency inside this function
    from matplotlib import font_manager

    # Build the word cloud straight from the frequency mapping
    wordcloud = WordCloud(
        font_path=font_path,
        width=1200,
        height=600,
        background_color='white',
        max_words=100,
        relative_scaling=0.3,
        colormap='viridis'
    ).generate_from_frequencies(word_freq)

    # Visualize
    plt.figure(figsize=(15, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    if title:
        # Use the same font file as the cloud so a Korean title does not
        # depend on matplotlib's default font having Hangul glyphs
        plt.title(
            title,
            fontproperties=font_manager.FontProperties(fname=font_path),
        )
    plt.axis('off')
    plt.tight_layout()
    plt.show()


# Usage
create_wordcloud_from_freq(Q_freq)

## 심층 EDA Frequency of Words in Paragraph

In [None]:
from konlpy.tag import Okt
from collections import Counter

def P_korean_word_freq(df):
    """Count noun frequencies across the 'paragraph' column of ``df``.

    Every paragraph is concatenated into one text, the Okt tagger extracts
    its nouns, and nouns that are a single character long or listed as
    stopwords are discarded. The 30 most frequent nouns are printed.

    Returns:
        The Counter of the nouns that were kept.
    """
    stopwords = {
        '것', '수', '등', '및', '제', '때', '대한',
        '옳은', '않은', '틀린', '설명', '인가', '무엇',
        '다음', '보기', '문제', '답', '번', '있는', '가장',
    }

    # Join all paragraphs into one string
    corpus = ' '.join(df['paragraph'].astype(str))

    # Morphological analysis: extract nouns only
    extracted = Okt().nouns(corpus)

    # Count nouns, skipping one-character words and stopwords
    word_freq = Counter()
    for noun in extracted:
        if len(noun) <= 1 or noun in stopwords:
            continue
        word_freq[noun] += 1

    # Top 30
    print("=== 상위 30개 명사 ===")
    for word, count in word_freq.most_common(30):
        print(f"{word}: {count}번")

    return word_freq


P_freq = P_korean_word_freq(df)

In [None]:
# Draw the word cloud for the paragraph noun frequencies
create_wordcloud_from_freq(P_freq)