In [63]:
from utils import Dataloader
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder

In [64]:
def load_ratings(path):
    COL_NAME = ['userId','movieId','rating','timestamp']
    df = pd.read_csv(os.path.join(path,"ratings.dat"),sep='::', header=None, engine='python', names=COL_NAME)
    return df

def load_movies(path):
    COL_NAME = ['movieId','title','genres']
    df = pd.read_csv(os.path.join(path,"movies.dat"),sep='::', header=None, engine='python', names=COL_NAME, encoding = 'ISO-8859-1' )
    return df

def load_users(path):
    COL_NAME = ['userId','gender','age','Occupation','zip_code']
    df = pd.read_csv(os.path.join(path,"users.dat"),sep='::', header=None, engine='python', names=COL_NAME)
    return df

In [65]:
#데이터 폴더 경로
DIR_PATH = "./data/"

#데이터 호출
users_df = Dataloader.load_users(DIR_PATH)
ratings_df = Dataloader.load_ratings(DIR_PATH)
movies_df = Dataloader.load_movies(DIR_PATH)

In [66]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [67]:
# 장르 분할
movies_df['genres'] = movies_df['genres'].str.split('|')
edited_movies_df = movies_df.explode('genres')
edited_movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children's
0,1,Toy Story (1995),Comedy
1,2,Jumanji (1995),Adventure
1,2,Jumanji (1995),Children's
...,...,...,...
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama


In [68]:
# 장르를 label encoding
genres = edited_movies_df['genres']
# LabelEncoder 객체 생성
encoder = LabelEncoder()

# 범주형 데이터에 레이블 인코딩 적용
encoded_genres = encoder.fit_transform(genres)
encoded_genres = pd.DataFrame(encoded_genres, columns=['genres'])
print(encoded_genres)
edited_movies_df['genres'] = encoded_genres

      genres
0          2
1          3
2          4
3          1
4          3
...      ...
6403       7
6404       7
6405       7
6406       7
6407      15

[6408 rows x 1 columns]


In [69]:
edited_movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),2
0,1,Toy Story (1995),2
0,1,Toy Story (1995),2
1,2,Jumanji (1995),3
1,2,Jumanji (1995),3
...,...,...,...
3879,3949,Requiem for a Dream (2000),15
3880,3950,Tigerland (2000),4
3881,3951,Two Family House (2000),13
3882,3952,"Contender, The (2000)",13


In [70]:
# 'year' 열에 연도 정보 추출하여 추가
edited_movies_df['year'] = edited_movies_df['title'].str.extract(r'\((\d{4})\)')

# 연도 정보가 포함된 제목 수정
edited_movies_df['title'] = edited_movies_df['title'].str.replace(pat=r'[^\w\s]', repl=r'', regex=True)
edited_movies_df['title'] = edited_movies_df['title'].str.replace(pat=r'[0-9]', repl=r'', regex=True)

# 연대 컬럼 삽입
edited_movies_df["year_term"] = edited_movies_df["year"].apply(lambda x : x[-4 :-1] + "0")
edited_movies_df

edited_movies_df

Unnamed: 0,movieId,title,genres,year,year_term
0,1,Toy Story,2,1995,1990
0,1,Toy Story,2,1995,1990
0,1,Toy Story,2,1995,1990
1,2,Jumanji,3,1995,1990
1,2,Jumanji,3,1995,1990
...,...,...,...,...,...
3879,3949,Requiem for a Dream,15,2000,2000
3880,3950,Tigerland,4,2000,2000
3881,3951,Two Family House,13,2000,2000
3882,3952,Contender The,13,2000,2000


In [71]:
edited_movies_df[['year', 'year_term']] = edited_movies_df[['year', 'year_term']].astype(int)
edited_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6408 entries, 0 to 3882
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movieId    6408 non-null   int64 
 1   title      6408 non-null   object
 2   genres     6408 non-null   int64 
 3   year       6408 non-null   int64 
 4   year_term  6408 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 300.4+ KB


In [72]:
users_df

Unnamed: 0,userId,gender,age,Occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [73]:
# 성별을 0, 1 변환
gender = users_df['gender']
encoded_gender = encoder.fit_transform(gender)
encoded_gender = pd.DataFrame(encoded_gender, columns=['gender'])

# 인코딩된 결과 출력
print(encoded_gender)

# 인코딩한 결과 데이터프레임에 반영
users_df['gender'] = encoded_gender
users_df

      gender
0          0
1          1
2          1
3          1
4          1
...      ...
6035       0
6036       0
6037       0
6038       0
6039       1

[6040 rows x 1 columns]


Unnamed: 0,userId,gender,age,Occupation,zip_code
0,1,0,1,10,48067
1,2,1,56,16,70072
2,3,1,25,15,55117
3,4,1,45,7,02460
4,5,1,25,20,55455
...,...,...,...,...,...
6035,6036,0,25,15,32603
6036,6037,0,45,1,76006
6037,6038,0,56,1,14706
6038,6039,0,45,0,01060


In [74]:
# State 컬럼 추가
# zip_code의 첫번째 자릿수 혹은 하이픈(-) 전의 숫자가 State를 의미
users_df['state'] = users_df['zip_code'].apply(lambda x: x.split('-')[0] if '-' in x else x[0])
users_df

Unnamed: 0,userId,gender,age,Occupation,zip_code,state
0,1,0,1,10,48067,4
1,2,1,56,16,70072,7
2,3,1,25,15,55117,5
3,4,1,45,7,02460,0
4,5,1,25,20,55455,5
...,...,...,...,...,...,...
6035,6036,0,25,15,32603,3
6036,6037,0,45,1,76006,7
6037,6038,0,56,1,14706,1
6038,6039,0,45,0,01060,0


In [75]:
users_df['state'].value_counts()

state
9        1455
0         657
5         653
1         653
4         602
         ... 
88005       1
95451       1
80004       1
50265       1
78705       1
Name: count, Length: 73, dtype: int64

In [76]:
users_df['state'] = users_df['state'].astype(int)
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   userId      6040 non-null   int64 
 1   gender      6040 non-null   int64 
 2   age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   zip_code    6040 non-null   object
 5   state       6040 non-null   int64 
dtypes: int64(5), object(1)
memory usage: 283.3+ KB


In [77]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   userId     1000209 non-null  int64
 1   movieId    1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [79]:
users_df.to_csv('./data/users.csv', index=False)
edited_movies_df.to_csv('./data/movies.csv', index=False)
ratings_df.to_csv('./data/ratings.csv', index=False)