In [1]:
from utils import Dataloader
import pandas as pd
import numpy as np
import os

In [2]:
def load_ratings(path):
    COL_NAME = ['userId','movieId','rating','timestamp']
    df = pd.read_csv(os.path.join(path,"ratings.dat"),sep='::', header=None, engine='python', names=COL_NAME)
    return df

def load_movies(path):
    COL_NAME = ['movieId','title','genres']
    df = pd.read_csv(os.path.join(path,"movies.dat"),sep='::', header=None, engine='python', names=COL_NAME, encoding = 'ISO-8859-1' )
    return df

def load_users(path):
    COL_NAME = ['userId','gender','age','Occupation','zip_code']
    df = pd.read_csv(os.path.join(path,"users.dat"),sep='::', header=None, engine='python', names=COL_NAME)
    return df

In [3]:
#데이터 폴더 경로
DIR_PATH = "./datasets/"

#데이터 호출
users_df = Dataloader.load_users(DIR_PATH)
ratings_df = Dataloader.load_ratings(DIR_PATH)
movies_df = Dataloader.load_movies(DIR_PATH)

In [4]:
# users_df와 rating_df를 userid를 기준으로 병합
merged_df = pd.merge(ratings_df, users_df, on='userId', how='left')

# rating_df와 movie_df를 movieid를 기준으로 병합
final_merged_df = pd.merge(merged_df, movies_df, on='movieId', how='left')

In [5]:
final_merged_df['genres'] = final_merged_df['genres'].str.split('|')
final_merged_df

Unnamed: 0,userId,movieId,rating,timestamp,gender,age,Occupation,zip_code,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),[Drama]
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),"[Animation, Children's, Musical]"
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),"[Musical, Romance]"
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),[Drama]
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)","[Animation, Children's, Comedy]"
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,M,25,6,11106,Weekend at Bernie's (1989),[Comedy]
1000205,6040,1094,5,956704887,M,25,6,11106,"Crying Game, The (1992)","[Drama, Romance, War]"
1000206,6040,562,5,956704746,M,25,6,11106,Welcome to the Dollhouse (1995),"[Comedy, Drama]"
1000207,6040,1096,4,956715648,M,25,6,11106,Sophie's Choice (1982),[Drama]


In [6]:
final_merged_df_expanded = final_merged_df.explode('genres')
final_merged_df_expanded

Unnamed: 0,userId,movieId,rating,timestamp,gender,age,Occupation,zip_code,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Children's
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical
...,...,...,...,...,...,...,...,...,...,...
1000207,6040,1096,4,956715648,M,25,6,11106,Sophie's Choice (1982),Drama
1000208,6040,1097,4,956715569,M,25,6,11106,E.T. the Extra-Terrestrial (1982),Children's
1000208,6040,1097,4,956715569,M,25,6,11106,E.T. the Extra-Terrestrial (1982),Drama
1000208,6040,1097,4,956715569,M,25,6,11106,E.T. the Extra-Terrestrial (1982),Fantasy


In [7]:
# 'year' 열에 연도 정보 추출하여 추가
final_merged_df_expanded['year'] = final_merged_df_expanded['title'].str.extract(r'\((\d{4})\)')

# 연도 정보가 포함된 제목 수정
final_merged_df_expanded['title'] = final_merged_df_expanded['title'].str.replace(r'\s*\(\d{4}\)', '')
final_merged_df_expanded

Unnamed: 0,userId,movieId,rating,timestamp,gender,age,Occupation,zip_code,title,genres,year
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,1975
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation,1996
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Children's,1996
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Musical,1996
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical,1964
...,...,...,...,...,...,...,...,...,...,...,...
1000207,6040,1096,4,956715648,M,25,6,11106,Sophie's Choice (1982),Drama,1982
1000208,6040,1097,4,956715569,M,25,6,11106,E.T. the Extra-Terrestrial (1982),Children's,1982
1000208,6040,1097,4,956715569,M,25,6,11106,E.T. the Extra-Terrestrial (1982),Drama,1982
1000208,6040,1097,4,956715569,M,25,6,11106,E.T. the Extra-Terrestrial (1982),Fantasy,1982


In [8]:
final_merged_df_expanded.to_csv('merged_data.csv', index=False)