## Library Import

In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm

## Data Load

In [2]:
# Directory that holds the training data
data_path = "~/movie/data/train"

# Load the full training interaction data
train_ratings = pd.read_csv(os.path.join(data_path, "train_ratings.csv"))

# Load the item side information (one tab-separated file per attribute)
years = pd.read_csv(os.path.join(data_path, "years.tsv"), sep="\t")
writers = pd.read_csv(os.path.join(data_path, "writers.tsv"), sep="\t")
titles = pd.read_csv(os.path.join(data_path, "titles.tsv"), sep="\t")
genres = pd.read_csv(os.path.join(data_path, "genres.tsv"), sep="\t")
directors = pd.read_csv(os.path.join(data_path, "directors.tsv"), sep="\t")

In [3]:
# Combine all side information into a single item table.
# Every join is a left join on "item", starting from `titles`, so each title row is kept
# even when an attribute is missing for that item.
item_df = (
    titles
    .merge(years, on="item", how="left")
    .merge(genres, on="item", how="left")
    .merge(directors, on="item", how="left")
    .merge(writers, on="item", how="left")
)

In [4]:
# Preview the merged item table; an item appears once per (genre, director, writer) combination
item_df.head(20)

Unnamed: 0,item,title,year,genre,director,writer
0,318,"Shawshank Redemption, The (1994)",1994.0,Crime,nm0001104,nm0000175
1,318,"Shawshank Redemption, The (1994)",1994.0,Crime,nm0001104,nm0001104
2,318,"Shawshank Redemption, The (1994)",1994.0,Drama,nm0001104,nm0000175
3,318,"Shawshank Redemption, The (1994)",1994.0,Drama,nm0001104,nm0001104
4,2571,"Matrix, The (1999)",1999.0,Action,nm0905152,nm0905152
5,2571,"Matrix, The (1999)",1999.0,Action,nm0905152,nm0905154
6,2571,"Matrix, The (1999)",1999.0,Action,nm0905154,nm0905152
7,2571,"Matrix, The (1999)",1999.0,Action,nm0905154,nm0905154
8,2571,"Matrix, The (1999)",1999.0,Sci-Fi,nm0905152,nm0905152
9,2571,"Matrix, The (1999)",1999.0,Sci-Fi,nm0905152,nm0905154


## Data Preprocessing

### 1. `year`

결측치를 `title`에 포함된 연도 정보를 활용해 대체한다.

In [5]:
# Inspect the rows whose year is missing
item_df[item_df["year"].isna()]

Unnamed: 0,item,title,year,genre,director,writer
9583,6987,"Cabinet of Dr. Caligari, The (Cabinet des Dr. ...",,Crime,,nm0562346
9584,6987,"Cabinet of Dr. Caligari, The (Cabinet des Dr. ...",,Fantasy,,nm0562346
9585,6987,"Cabinet of Dr. Caligari, The (Cabinet des Dr. ...",,Horror,,nm0562346
10944,3310,"Kid, The (1921)",,Comedy,nm0000122,nm0000122
10945,3310,"Kid, The (1921)",,Drama,nm0000122,nm0000122
15109,7065,"Birth of a Nation, The (1915)",,Drama,nm0000428,nm0000428
15110,7065,"Birth of a Nation, The (1915)",,Drama,nm0000428,nm0940488
15111,7065,"Birth of a Nation, The (1915)",,War,nm0000428,nm0000428
15112,7065,"Birth of a Nation, The (1915)",,War,nm0000428,nm0940488
16055,7243,Intolerance: Love's Struggle Throughout the Ag...,,Drama,nm0000428,nm0000428


In [6]:
# Extract the four-digit year written in parentheses inside the title and convert it to a
# number, so that the float `year` column is filled with numbers rather than strings.
title_year = pd.to_numeric(
    item_df["title"].str.extract(r"\((\d{4})\)", expand=False),  # four digits inside parentheses
    errors="coerce",
)
item_df["year"] = item_df["year"].fillna(title_year)

# Check for rows that are still missing BEFORE casting: an int64 column cannot hold NaN,
# so running this check after the cast would always report an empty result.
print(item_df[item_df["year"].isna()])

# The cast raises if any year is still missing, which keeps bad rows from slipping through.
item_df["year"] = item_df["year"].astype("int64")

Empty DataFrame
Columns: [item, title, year, genre, director, writer]
Index: []


### 2. `title`

#### 2.1 같은 `title`, 다른 `item` 처리
같은 영화인데 서로 다른 `item` 값을 갖는 경우, 결측치가 없는 `item`을 기준으로 통일한다.

In [7]:
# Pick the most frequent title among those that occur more than once in `titles`.
# The comparison `> 1` only builds a boolean Series, so it must be applied as a mask;
# without the mask, `.index[0]` is simply the most frequent title, duplicated or not.
delete_title = titles["title"].value_counts().loc[lambda counts: counts > 1].index[0]
print("Before droping the indices")
display(item_df[item_df["title"] == delete_title])

# Remove the item that has the duplicated title.
# NOTE(review): 13507 and 13508 are hard-coded row labels of `item_df`; confirm they are
# exactly the rows of the item to be removed. Re-running this cell raises KeyError because
# the labels no longer exist after the first drop.
item_df = item_df.drop(index=[13507, 13508])
print("\nAfter drop the indices")
display(item_df[item_df["title"] == delete_title])

Before droping the indices


Unnamed: 0,item,title,year,genre,director,writer
3584,34048,War of the Worlds (2005),2005,Action,nm0000229,nm0295264
3585,34048,War of the Worlds (2005),2005,Action,nm0000229,nm0462895
3586,34048,War of the Worlds (2005),2005,Action,nm0000229,nm0920229
3587,34048,War of the Worlds (2005),2005,Adventure,nm0000229,nm0295264
3588,34048,War of the Worlds (2005),2005,Adventure,nm0000229,nm0462895
3589,34048,War of the Worlds (2005),2005,Adventure,nm0000229,nm0920229
3590,34048,War of the Worlds (2005),2005,Sci-Fi,nm0000229,nm0295264
3591,34048,War of the Worlds (2005),2005,Sci-Fi,nm0000229,nm0462895
3592,34048,War of the Worlds (2005),2005,Sci-Fi,nm0000229,nm0920229
3593,34048,War of the Worlds (2005),2005,Thriller,nm0000229,nm0295264



After drop the indices


Unnamed: 0,item,title,year,genre,director,writer
3584,34048,War of the Worlds (2005),2005,Action,nm0000229,nm0295264
3585,34048,War of the Worlds (2005),2005,Action,nm0000229,nm0462895
3586,34048,War of the Worlds (2005),2005,Action,nm0000229,nm0920229
3587,34048,War of the Worlds (2005),2005,Adventure,nm0000229,nm0295264
3588,34048,War of the Worlds (2005),2005,Adventure,nm0000229,nm0462895
3589,34048,War of the Worlds (2005),2005,Adventure,nm0000229,nm0920229
3590,34048,War of the Worlds (2005),2005,Sci-Fi,nm0000229,nm0295264
3591,34048,War of the Worlds (2005),2005,Sci-Fi,nm0000229,nm0462895
3592,34048,War of the Worlds (2005),2005,Sci-Fi,nm0000229,nm0920229
3593,34048,War of the Worlds (2005),2005,Thriller,nm0000229,nm0295264


In [8]:
# Row labels of the ratings that reference item id 64997
idx = train_ratings.index[train_ratings["item"] == 64997]

# Point those ratings at item id 34048 instead
train_ratings.loc[idx, "item"] = 34048

In [9]:
# Verify the item id replacement: no rating should reference item 64997 any more
train_ratings[(train_ratings["item"] == 64997)]

Unnamed: 0,user,item,time


#### 2.2 `title` 재구성

현재 `title`은 _**"영문 제목 (a.k.a. 별칭) (원어 제목) (연도)"**_ 순으로 구성되어 있다. 정규표현식을 활용하여 다음 순서대로 `title`을 재구성한다.

1. 제목 양 끝의 따옴표(", ') 제거
2. 영문 제목만 선택
3. 관사 위치 재조정: "~, The"를 "The ~"로 변경
4. 특수문자 삭제
5. 소문자로 통일

In [10]:
def preprocess_title(title):
    """
    Normalize a raw movie title into a lowercase English title without punctuation.

    The steps are applied in this order:
        1. Strip one quote character (' or ") from each end when the title both
           starts and ends with a quote.
        2. Keep only the English title, i.e. the text before the first "(".
           A title that itself starts with "(" (e.g. "(500) Days of Summer (2009)")
           has only its trailing parenthesised groups removed, so the year does not
           end up in the normalized title.
        3. Move a trailing article to the front: "~, The" -> "The ~" (also "A", "An").
        4. Remove every character that is not an ASCII letter, a digit or whitespace.
        5. Lowercase the result.

    Args:
        title (str): raw title, e.g. "Shawshank Redemption, The (1994)".

    Returns:
        str: normalized title, e.g. "the shawshank redemption".
    """
    # 1. Remove surrounding quotes
    title = re.sub(r'^[\'"](.*)[\'"]$', r'\1', title)

    # 2. Keep only the English title (match once and reuse the result)
    leading_text = re.match(r'^[^(]+', title)
    if leading_text:
        title = leading_text.group().strip()
    else:
        # The title begins with "(": drop only the parenthesised groups at the end.
        trimmed = re.sub(r'(?:\s*\([^()]*\))+\s*$', '', title).strip()
        if trimmed:  # keep the original text when nothing else would remain
            title = trimmed

    # 3. Turn "~, The" / "~, A" / "~, An" into "The ~" / "A ~" / "An ~"
    title = re.sub(r'^(.*),\s(The|A|An)$', r'\2 \1', title)

    # 4. Remove special characters
    title = re.sub(r'[^a-zA-Z0-9\s]', '', title)

    # 5. Lowercase
    return title.lower()

In [11]:
# Normalize every title element-wise with preprocess_title
item_df["title"] = item_df["title"].map(preprocess_title)

### 3. `director`

In [12]:
# Number of distinct directors per item
item_director_counts = (
    directors.groupby('item')['director']
    .nunique()
    .reset_index()
    .rename(columns={'director': 'director_count'})
)

# Number of items for each distinct-director count
director_counts = (
    item_director_counts.groupby('director_count')
    .count()
    .reset_index()
    .rename(columns={'item': 'count'})
)

# Total number of items that appear in `directors`
director_counts["count"].sum()

5503

### 4. `writer`

### 5. `genre`

In [13]:
# Function: keep only the top-k levels within each pivot group
def filter_top_k_by_count(df, sel_col, pivot_col, top_k, ascending=False):
    """
    Keep at most `top_k` rows per `pivot_col` group, ranked by how often each
    `sel_col` level occurs in the whole of `df`.

    The ranking is done with stable sorts followed by `groupby(...).head(top_k)`
    instead of `groupby(...).apply(...)`, because applying a function that also
    operates on the grouping columns is deprecated in recent pandas versions.
    The input frame is not modified.

    Args:
        df (pd.DataFrame): frame containing `sel_col` and `pivot_col`.
        sel_col (str): column whose levels are ranked by overall frequency.
        pivot_col (str | list): column(s) that define the groups.
        top_k (int): maximum number of rows kept per group.
        ascending (bool): if truthy keep the least frequent levels,
            otherwise (default) the most frequent ones.

    Returns:
        pd.DataFrame: filtered rows with the columns of `df`, ordered by
        `pivot_col` and then by level frequency, with a fresh 0..n-1 index.
        Levels with equal frequency keep their original relative order.
        Rows whose `sel_col` or `pivot_col` value is missing are dropped.
    """
    pivot_keys = list(pivot_col) if isinstance(pivot_col, (list, tuple)) else [pivot_col]

    # 1. Overall frequency of every level, attached to each row (row order is preserved)
    level_frequency = df[sel_col].value_counts()
    ranked = df.assign(count=df[sel_col].map(level_frequency))

    # Missing levels get no count and missing pivot keys belong to no group
    ranked = ranked.dropna(subset=["count"] + pivot_keys)

    # 2. Two stable sorts: first by frequency, then by group key, so rows end up ordered
    #    by group and, inside each group, by frequency with ties in their original order
    ranked = ranked.sort_values("count", ascending=bool(ascending), kind="mergesort")
    ranked = ranked.sort_values(pivot_keys, kind="mergesort")

    # 3. The first `top_k` rows of every group are the levels to keep
    filtered_df = ranked.groupby(pivot_keys, sort=False).head(top_k)

    # 4. Drop the helper column and return
    return filtered_df.drop(columns=["count"]).reset_index(drop=True)

In [14]:
# Run the filter: keep at most 4 genres per item, preferring the genres that are most frequent overall
# NOTE(review): `genres` is overwritten, so re-running this cell ranks genres by their frequency
# in the already filtered data rather than in the raw file.
genres = filter_top_k_by_count(genres, sel_col="genre", pivot_col="item", top_k=4)
genres

  filtered_df = df.groupby(pivot_col).apply(


Unnamed: 0,item,genre
0,1,Comedy
1,1,Adventure
2,1,Fantasy
3,1,Children
4,2,Adventure
...,...,...
15609,119141,Action
15610,119145,Comedy
15611,119145,Action
15612,119145,Crime


In [15]:
# Function: multi-hot encode a categorical column
def multi_hot_encoding(df: pd.DataFrame,
                       label_col: str,
                       pivot_col: str
                       ) -> pd.DataFrame:
    """
    Turn a multi-valued categorical column into binary indicator columns.

    Args:
        df (pd.DataFrame): frame that has `pivot_col` and `label_col` as columns
        label_col (str): categorical column to multi-hot encode
        pivot_col (str): column used to group the rows

    Returns:
        pd.DataFrame: one row per `pivot_col` value, holding `pivot_col` followed by
        one indicator column for every class found by MultiLabelBinarizer.
    """

    # Collect the labels of every pivot value into one list per group
    labels_per_pivot = df.groupby(pivot_col)[label_col].apply(list).reset_index()

    # Fit the binarizer on those lists and build the indicator matrix
    binarizer = MultiLabelBinarizer()
    indicator_matrix = binarizer.fit_transform(labels_per_pivot[label_col])

    # Name the indicator columns after the fitted classes
    indicator_df = pd.DataFrame(indicator_matrix, columns=binarizer.classes_)

    # Put the pivot column in front of the indicator columns
    return pd.concat([labels_per_pivot[pivot_col], indicator_df], axis=1)

In [16]:
# Multi-hot encode the filtered genres: one row per item and one indicator column per genre
genres_mhe = multi_hot_encoding(genres, label_col="genre", pivot_col="item")
genres_mhe

Unnamed: 0,item,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6802,118700,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6803,118900,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6804,118997,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0
6805,119141,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
# Group by (item, title, year) and gather the distinct genres, directors and writers
# of every group into lists
grouped_df = (
    item_df
    .groupby(['item', 'title', 'year'])
    .agg({
        column: (lambda values: list(values.unique()))
        for column in ('genre', 'director', 'writer')
    })
    .reset_index()
)

## Feature Engineering

In [18]:
# Final merge of train_ratings with item_df (left join on "item")
# NOTE(review): this joins `item_df`, which holds several rows per item, not `grouped_df`;
# each rating is therefore repeated once per matching item row. Confirm that this is intended.
train_df = train_ratings.merge(item_df, how="left", on="item")

### 파생변수 생성

In [19]:
# Function: draw `num_negative` negative samples per user
def negative_sampling(df: pd.DataFrame,
                      user_col: str,
                      item_col: str,
                      num_negative: int,
                      seed: "int | None" = None
                      ) -> pd.DataFrame:
    """
    Label every row of `df` as a positive interaction (review = 1) and append, for
    each user, `num_negative` items the user has not interacted with (review = 0).

    The input frame is left untouched; the labelled data is returned as a new frame.

    Args:
        df (pd.DataFrame): frame that has `user_col` and `item_col` as columns
        user_col (str): name of the user id column
        item_col (str): name of the item id column
        num_negative (int): number of negative items drawn per user; it is used as a
            sample size and a repeat count, so it has to be an integer
        seed (int | None): when given, sampling uses its own RandomState seeded with
            this value and is reproducible; when None the global NumPy random state
            is used.

    Returns:
        pd.DataFrame: the rows of `df` with review = 1, followed by the negative rows,
        which carry only `user_col`, `item_col` and review = 0 (other columns are NaN).
        Row labels are not reset, so they are not unique.

    Raises:
        ValueError: raised by the sampler when a user has fewer than `num_negative`
            unseen items, because items are drawn without replacement.
    """

    # `np.random` and `RandomState` both expose `choice`, so either can act as the sampler
    sampler = np.random if seed is None else np.random.RandomState(seed)

    # Work on a labelled copy instead of adding the column to the caller's frame
    positives = df.assign(review=1)
    all_items = set(df[item_col])

    # Gather the per-user frames in a list and concatenate once at the end;
    # concatenating inside the loop copies the accumulated rows on every iteration.
    negative_frames = []
    for u, u_items in tqdm(list(df.groupby(user_col)[item_col])):
        unseen_items = list(all_items - set(u_items))
        sampled_items = sampler.choice(unseen_items, num_negative, replace=False)
        negative_frames.append(
            pd.DataFrame({user_col: [u] * num_negative,
                          item_col: sampled_items,
                          'review': [0] * num_negative})
        )

    return pd.concat([positives] + negative_frames, axis=0, sort=False)

In [20]:
def pivot_count(df: pd.DataFrame,
                pivot_col: str,
                col_name: str,
                ) -> pd.DataFrame:
    """
    Add a column holding how often each row's `pivot_col` value occurs.

    When `df` has a 'review' column only rows with review == 1 are counted;
    otherwise every row is counted. Values that are never counted get NaN.

    Args:
        df (pd.DataFrame): frame containing `pivot_col`; it is modified in place
        pivot_col (str): column whose values are counted
        col_name (str): name of the column that receives the counts

    Returns:
        pd.DataFrame: the same `df` object, with `col_name` added.
    """

    # Restrict the count to positive interactions when a label column is present
    counted_rows = df[df["review"] == 1] if 'review' in df.columns else df
    occurrences = counted_rows[pivot_col].value_counts()

    # Look up the count of every row's pivot value
    df[col_name] = df[pivot_col].map(occurrences)

    return df