In [1]:
import yaml
import os

# Location of the baseline config file. The path is absolute (it is NOT derived
# from this notebook's location); the project root defaults to the original
# server checkout and can be overridden with the BOOK_RATING_PROJECT_ROOT
# environment variable so the notebook also runs from another checkout.
project_root = os.environ.get(
    "BOOK_RATING_PROJECT_ROOT",
    "/data/ephemeral/home/min/pro-recsys-bookratingprediction-recsys-04",
)
config_path = os.path.join(project_root, "config", "config_baseline.yaml")

# Read the YAML file and parse it with yaml.safe_load.
with open(config_path, "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)


In [2]:
import numpy as np
import pandas as pd
import regex
import re
from PIL import Image
import torch
from torchvision.transforms import v2
from tqdm import tqdm

In [3]:
# Folder holding the raw CSV files. It is not read from `config`; it defaults to
# the original server path and can be overridden with the BOOK_RATING_DATA_DIR
# environment variable.
data_dir = os.environ.get("BOOK_RATING_DATA_DIR", "/data/ephemeral/home/data")

# Users table.
user_data_path = os.path.join(data_dir, "users.csv")
user_df = pd.read_csv(user_data_path)

# Books table.
book_data_path = os.path.join(data_dir, "books.csv")
book_df = pd.read_csv(book_data_path)

# Training ratings table.
train_rating_data_path = os.path.join(data_dir, "train_ratings.csv")
train_rating_df = pd.read_csv(train_rating_data_path)

In [4]:
# Preview the first rows; a bare last expression is rendered with the notebook's rich DataFrame display.
user_df.head()

   user_id                            location   age
0        8            timmins, ontario, canada   NaN
1    11400             ottawa, ontario, canada  49.0
2    11676                       n/a, n/a, n/a   NaN
3    67544            toronto, ontario, canada  30.0
4    85526  victoria, british columbia, canada  36.0


In [5]:
# Show the column names of the users table.
user_df.columns

Index(['user_id', 'location', 'age'], dtype='object')

In [6]:
from ydata_profiling import ProfileReport
import os

# Label -> DataFrame to profile; the label is also used in the report title
# and as the prefix of the HTML file name.
dfs = dict(
    user_df=user_df,
    book_df=book_df,
    train_rating_df=train_rating_df,
)

for name in dfs:
    df = dfs[name]

    # Build the profiling report for this frame.
    profile = ProfileReport(df, title=f"{name} Profiling Report", explorative=True)

    # Write it next to the notebook as an HTML file.
    report_path = f"{name}_profile_report.html"
    profile.to_file(report_path)

    # Report where the file ended up.
    abs_path = os.path.abspath(report_path)
    print(f"{name} report saved at: {abs_path}")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00, 10.13it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

user_df report saved at: /data/ephemeral/home/min/pro-recsys-bookratingprediction-recsys-04/eda/min/user_df_profile_report.html


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:14<00:00,  1.47s/it]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

book_df report saved at: /data/ephemeral/home/min/pro-recsys-bookratingprediction-recsys-04/eda/min/book_df_profile_report.html


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:01<00:00,  1.96it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

train_rating_df report saved at: /data/ephemeral/home/min/pro-recsys-bookratingprediction-recsys-04/eda/min/train_rating_df_profile_report.html


cd /home/min/pro-recsys-bookratingprediction-recsys-04/eda/min
python3 -m http.server 8000

위 명령을 shell에서 실행하면 8000번 포트의 HTTP 서버를 통해 리포트 HTML 파일 3개를 모두 볼 수 있음.

## TMP

In [7]:
import numpy as np
import pandas as pd
import regex
import re
from PIL import Image
import torch
from torchvision.transforms import v2
from tqdm import tqdm

# -------------------------------
# 1. location, age, category, publication_range 처리
# -------------------------------
def process_user_features(user_df):
    """Derive location and age features for the users table.

    The input frame is copied first, so it is never modified.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Must contain a ``location`` column (comma-separated text such as
        ``"timmins, ontario, canada"``, or NaN) and a numeric ``age`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``user_df`` with ``location_country``, ``location_state``,
        ``location_city`` and ``age_range`` added. Missing ``age`` values are
        replaced by the most frequent age; if every age is missing they are
        left as NaN instead of raising.
    """
    df = user_df.copy()

    def split_location(x):
        """Return ``[country, state, city]`` for one raw location value."""
        if pd.isna(x):
            return [np.nan, np.nan, np.nan]

        parts = []
        for token in x.split(','):
            # Strip AFTER dropping the disallowed characters: a token such as
            # "123 456" is reduced to " " and must count as missing, not as a
            # one-space place name.
            cleaned = re.sub(r'[^a-zA-Z/ ]', '', token.lower()).strip()
            parts.append(np.nan if cleaned in ('n/a', '') else cleaned)

        # The raw text is ordered city -> country; flip it to country -> city.
        parts.reverse()

        # Keep only the first occurrence of every non-missing name
        # (e.g. "toronto, toronto, canada"); missing entries are all kept so
        # the positions of the remaining names do not shift.
        deduped = []
        for part in parts:
            if pd.isna(part) or part not in deduped:
                deduped.append(part)

        # Pad / truncate to exactly three levels.
        deduped.extend([np.nan] * (3 - len(deduped)))
        return deduped[:3]

    location_parts = [split_location(value) for value in df['location']]
    df['location_country'] = [parts[0] for parts in location_parts]
    df['location_state'] = [parts[1] for parts in location_parts]
    df['location_city'] = [parts[2] for parts in location_parts]

    # age: fill gaps with the mode, guarding against an all-missing column
    # (mode() is empty in that case and indexing it would raise).
    age_mode = df['age'].mode()
    if not age_mode.empty:
        df['age'] = df['age'].fillna(age_mode.iloc[0])

    # age_range: decade bucket (36 -> 30), computed vectorised.
    df['age_range'] = df['age'] // 10 * 10

    # A stray helper column of this name is never part of the output.
    return df.drop(columns=['location_list'], errors='ignore')

def process_book_features(book_df):
    """Derive category, publication-decade and language features for books.

    The input frame is copied first, so it is never modified.

    Parameters
    ----------
    book_df : pandas.DataFrame
        Must contain ``category`` (text holding a list literal such as
        ``"['Fiction']"``, or NaN), a numeric ``year_of_publication`` and
        ``language``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``book_df`` where ``category`` holds the first category name
        (without the surrounding quotes), ``publication_range`` holds the
        publication decade, and missing ``language`` values are replaced by
        the most frequent language (left as NaN if every value is missing).
    """
    import ast  # stdlib; imported locally so the function is self-contained

    df = book_df.copy()

    def str2list(x):
        """Return the first category name stored in one raw ``category`` cell."""
        if pd.isna(x):
            return np.nan
        text = str(x).strip()

        # Preferred path: the cell is a valid Python list literal. Parsing it
        # keeps names that contain ", " intact and removes the quote characters.
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return str(parsed[0]) if parsed else np.nan

        # Fallback for cells that are not valid literals (e.g. an unescaped
        # apostrophe inside the name): peel the brackets and quotes manually.
        if text.startswith('[') and text.endswith(']'):
            text = text[1:-1]
        first = text.split(', ')[0].strip().strip('\'"')
        return first if first else np.nan

    df['category'] = df['category'].apply(str2list)

    # publication range: decade bucket (1994 -> 1990), computed vectorised.
    df['publication_range'] = df['year_of_publication'] // 10 * 10

    # language: fill gaps with the mode, guarding against an all-missing column
    # (mode() is empty in that case and indexing it would raise).
    language_mode = df['language'].mode()
    if not language_mode.empty:
        df['language'] = df['language'].fillna(language_mode.iloc[0])

    return df

# -------------------------------
# 2. 이미지 벡터화
# -------------------------------
def image_vector(path, img_size=64):
    """Load one image file and return it as a normalized NumPy array.

    Parameters
    ----------
    path : str or path-like
        Image file to read.
    img_size : int, default 64
        Height and width the image is resized to.

    Returns
    -------
    numpy.ndarray
        float32 array produced by the transform pipeline below; for an RGB
        image resized to ``img_size`` this is expected to have shape
        ``(3, img_size, img_size)``.

    Raises
    ------
    OSError
        Propagated from ``Image.open`` when the file is missing or unreadable.
    """
    transform = v2.Compose([
        # Force three channels so grayscale / palette / RGBA files share one layout.
        v2.Lambda(lambda x: x.convert('RGB') if x.mode != 'RGB' else x),
        v2.Resize((img_size, img_size)),
        v2.ToImage(),
        # scale=True rescales integer pixel values into [0, 1] while casting.
        v2.ToDtype(torch.float32, scale=True),
        # Per-channel mean/std; presumably the usual ImageNet statistics.
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    # Use a context manager so the file handle is released deterministically.
    # PIL reads pixel data lazily, so the transform has to run while the file
    # is still open.
    with Image.open(path) as img:
        return transform(img).numpy()

def add_img_vector(book_df, img_size=64, data_dir='data'):
    """Attach an image vector to every book row.

    Parameters
    ----------
    book_df : pandas.DataFrame
        Must contain an ``img_path`` column with paths relative to ``data_dir``.
    img_size : int, default 64
        Height and width passed on to ``image_vector``.
    data_dir : str, default 'data'
        Folder prepended to every ``img_path``. The default reproduces the
        previous hard-coded ``data/`` prefix, which is resolved against the
        current working directory.

    Returns
    -------
    pandas.DataFrame
        Copy of ``book_df`` with an ``img_vector`` column. Rows whose image
        cannot be loaded get an all-zero float32 array of shape
        ``(3, img_size, img_size)``; a warning reports how many rows that was.
    """
    import warnings  # stdlib; imported locally so the function is self-contained

    df = book_df.copy()
    img_vecs = []
    failed = 0
    for rel_path in tqdm(df['img_path']):
        path = f'{data_dir}/{rel_path}'
        try:
            img_vec = image_vector(path, img_size)
        except Exception:
            # Best effort: a missing or corrupt image must not abort the run.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            # float32 keeps the placeholder consistent with real image vectors.
            img_vec = np.zeros((3, img_size, img_size), dtype=np.float32)
            failed += 1
        img_vecs.append(img_vec)

    if failed:
        # A wrong data_dir would otherwise silently turn every image into zeros.
        warnings.warn(
            f"{failed} of {len(img_vecs)} images could not be loaded from "
            f"'{data_dir}'; zero vectors were used instead."
        )

    df['img_vector'] = img_vecs
    return df

# -------------------------------
# 3. 텍스트 벡터화 (summary, user summary merge)
# -------------------------------
def text_preprocessing(summary):
    """Normalize a summary string for text vectorization.

    Every character other than ASCII letters, digits and ``. , ! ?`` is
    replaced by a space, then each run of whitespace is collapsed into a single
    space. Leading and trailing spaces are not stripped.

    Parameters
    ----------
    summary : str
        Raw text; missing values must be replaced by a string beforehand.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw strings: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    summary = re.sub(r"[^0-9a-zA-Z.,!?]", " ", summary)
    summary = re.sub(r"\s+", " ", summary)
    return summary

def process_text_features(train_df, user_df, book_df, book_summary_vectors=None, user_summary_vectors=None):
    """Add text-related helper columns to copies of the user and book tables.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Ratings with ``user_id`` and ``isbn`` columns.
    user_df : pandas.DataFrame
        Users with a ``user_id`` column.
    book_df : pandas.DataFrame
        Books with ``isbn`` and ``summary`` columns.
    book_summary_vectors : optional
        If given, stored as the ``book_summary_vector`` column of the books
        copy (assumed to hold one entry per book row - TODO confirm with caller).
    user_summary_vectors : optional
        If given, stored as the ``user_summary_merge_vector`` column of the
        users copy (assumed to hold one entry per user row - TODO confirm).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(users_, books_)``. ``books_`` gains the cleaned ``summary``,
        ``summary_length`` and ``review_count``; ``users_`` gains
        ``books_read`` (list of rated ISBNs, NaN for users without ratings).
    """
    books_ = book_df.copy()
    users_ = user_df.copy()

    # Missing summaries become the literal text 'None' before cleaning.
    nan_value = 'None'
    books_['summary'] = books_['summary'].fillna(nan_value).apply(text_preprocessing)
    books_['summary_length'] = books_['summary'].apply(len)

    # Number of training ratings per book. Books that never appear in
    # train_df have no entry in the lookup; they must count as 0 rather than
    # NaN so the column stays an integer count.
    review_counts = train_df['isbn'].value_counts()
    books_['review_count'] = books_['isbn'].map(review_counts).fillna(0).astype(int)

    # ISBNs each user rated in the training data.
    books_per_user = train_df.groupby('user_id')['isbn'].apply(list)
    users_['books_read'] = users_['user_id'].map(books_per_user)

    if book_summary_vectors is not None:
        books_['book_summary_vector'] = book_summary_vectors
    if user_summary_vectors is not None:
        users_['user_summary_merge_vector'] = user_summary_vectors

    return users_, books_

# -------------------------------
# 4. 통합 데이터프레임 생성
# -------------------------------
def create_full_df(user_df, book_df, train_df):
    """Join every rating row with its processed user and book features.

    Parameters
    ----------
    user_df, book_df : pandas.DataFrame
        Raw user and book tables; they are run through
        ``process_user_features`` and ``process_book_features``.
    train_df : pandas.DataFrame
        Ratings with ``user_id`` and ``isbn`` columns.

    Returns
    -------
    tuple
        ``(merged, user_df_proc, book_df_proc)`` where ``merged`` is
        ``train_df`` left-joined with the processed users on ``user_id`` and
        then with the processed books on ``isbn``.
    """
    user_df_proc = process_user_features(user_df)
    book_df_proc = process_book_features(book_df)

    # Left joins keep every rating row even when a user or book has no match.
    ratings_with_users = train_df.merge(user_df_proc, how='left', on='user_id')
    merged = ratings_with_users.merge(book_df_proc, how='left', on='isbn')

    return merged, user_df_proc, book_df_proc

# -------------------------------
# 5. 사용 예시
# -------------------------------
# Build the merged ratings/users/books frame; the processed user and book
# tables are kept as well.
full_df, user_df_proc, book_df_proc = create_full_df(
    user_df=user_df,
    book_df=book_df,
    train_df=train_rating_df,
)

# Profile the merged frame and write the report next to the notebook.
import ydata_profiling
profile = ydata_profiling.ProfileReport(full_df)
profile.to_file("eda_full_df.html")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:16<00:00,  1.13it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# Inspect the raw `category` values of the first 100 books (before any processing).
book_df["category"].head(100)

0                ['Actresses']
1                ['1940-1949']
2                  ['Medical']
3                  ['Fiction']
4                  ['History']
                ...           
95                 ['Fiction']
96    ['Business & Economics']
97    ['Business & Economics']
98                 ['Fiction']
99                 ['Fiction']
Name: category, Length: 100, dtype: object