# Exploratory Data Analysis (EDA)

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer

# Lift the column limit so printed/rendered DataFrames show every column.
pd.options.display.max_columns = None

## Load Data

In [None]:
# TODO: set the path to the training data CSV
dataset = pd.read_csv('../data/train.csv')

# The 'problems' column holds a dict written out as text. Parse it row by row
# and lift its fields next to 'id' and 'paragraph' so each field gets its own
# column. 'answer' and 'question_plus' are optional keys and become None when
# the parsed dict does not contain them.
parsed_rows = (
    (row, literal_eval(row['problems'])) for _, row in dataset.iterrows()
)
records = [
    {
        'id': row['id'],
        'paragraph': row['paragraph'],
        'question': problems['question'],
        'choices': problems['choices'],
        'answer': problems.get('answer'),
        'question_plus': problems.get('question_plus'),
    }
    for row, problems in parsed_rows
]

# One flat row per problem
df = pd.DataFrame(records)

In [None]:
# Preview the first five flattened rows (five is also the pandas default)
df.head(n=5)

## Missing Values

In [None]:
# Report how many null entries each column contains
print("\nMissing values in each column:")
null_counts = df.isna().sum()
print(null_counts)

## Basic Information

In [None]:
# Print a heading, then the summary that pandas' DataFrame.info() writes for df
print("\nDataset Information:")
df.info()

## EDA on Question and Choices

In [None]:
# Replace missing 'question_plus' values with empty text so the emptiness
# check and the string concatenation below both work on every row.
df['question_plus'] = df['question_plus'].fillna('')


def _compose_full_question(row):
    """Return the question, followed by a space and 'question_plus' when that is non-empty."""
    extra = row['question_plus']
    if not extra:
        return row['question']
    return row['question'] + ' ' + extra


# Build the combined text row by row
df['full_question'] = df.apply(_compose_full_question, axis=1)

# Length of the combined text, as reported by len() on each value
df['question_length'] = df['full_question'].apply(len)

## Question Length Distribution

In [None]:
# Histogram of combined-question lengths, drawn on an explicit Axes object
fig, ax = plt.subplots(figsize=(5, 3))
ax.hist(df['question_length'], bins=30, edgecolor='black', alpha=0.7)
ax.set_title('Distribution of Question Lengths')
ax.set_xlabel('Question Length')
ax.set_ylabel('Frequency')
plt.show()

## Feature Engineering using TF-IDF

- TF-IDF 참고 링크: https://ko.wikipedia.org/wiki/Tf-idf

In [None]:
# Initialize TF-IDF Vectorizer
# max_features=1000 limits the size of the learned vocabulary (per the
# scikit-learn documentation), which bounds the number of TF-IDF columns.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

In [None]:
# Fit the vectorizer on the combined question text and vectorize it in one step
tfidf_matrix = tfidf_vectorizer.fit_transform(df['full_question'])

# Convert the result to a dense array and label each column with its term
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

In [None]:
# Show the first 20 rows of the TF-IDF feature table
# NOTE(review): `display` is not imported in the visible code; presumably the
# IPython/Jupyter built-in. Confirm before running this outside a notebook.
print("\nTF-IDF Features:")
tfidf_preview = tfidf_df.head(n=20)
display(tfidf_preview)