In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
from functools import reduce
import re

## 1. 데이터 로딩

In [3]:
directors = pd.read_csv("./data/train/directors.tsv", delimiter="\t")
genres = pd.read_csv("./data/train/genres.tsv", delimiter="\t")
titles = pd.read_csv("./data/train/titles.tsv", delimiter="\t")
writers = pd.read_csv("./data/train/writers.tsv", delimiter="\t")
years = pd.read_csv("./data/train/years.tsv", delimiter="\t")

train = pd.read_csv("./data/train/train_ratings.csv")

### 1-1 감독, 장르, 제목, 작가, 년도로 나눠진 데이터 합치기

In [4]:
movie = reduce(
    lambda x, y: pd.merge(x, y, on="item", how="outer"),
    [directors, genres, titles, writers, years],
)
movie

Unnamed: 0,item,director,genre,title,writer,year
0,1237,nm0000005,Drama,"Seventh Seal, The (Sjunde inseglet, Det) (1957)",nm0000005,1957.0
1,5147,nm0000005,Drama,Wild Strawberries (Smultronstället) (1957),nm0000005,1957.0
2,7327,nm0000005,Drama,Persona (1966),nm0000005,1966.0
3,2068,nm0000005,Drama,Fanny and Alexander (Fanny och Alexander) (1982),nm0000005,1982.0
4,2068,nm0000005,Fantasy,Fanny and Alexander (Fanny och Alexander) (1982),nm0000005,1982.0
...,...,...,...,...,...,...
37669,100302,,Romance,Upside Down (2012),,2012.0
37670,100302,,Sci-Fi,Upside Down (2012),,2012.0
37671,65193,,Drama,Wild Child (2008),,2008.0
37672,65193,,Romance,Wild Child (2008),,2008.0


### 1-2 합쳐진 데이터를 영화의 item id로 압축하기

In [5]:
grouped_movie = (
    movie.fillna(0)
    .groupby("item")
    .agg(set)
    .reset_index()
    .applymap(
        lambda x: (
            (list(x)[0] if len(list(x)) == 1 else list(x)) if isinstance(x, set) else x
        )
    )
)
grouped_movie = grouped_movie.replace([0], np.NaN)
grouped_movie["year"] = grouped_movie.year.fillna(0).astype("Int64").replace(0, np.NaN)
grouped_movie

Unnamed: 0,item,director,genre,title,writer,year
0,1,nm0005124,"[Animation, Comedy, Children, Adventure, Fantasy]",Toy Story (1995),"[nm0005124, nm0004056, nm0230032, nm0710020, n...",1995
1,2,nm0002653,"[Children, Adventure, Fantasy]",Jumanji (1995),"[nm0378144, nm0885575, nm0852430]",1995
2,3,nm0222043,"[Romance, Comedy]",Grumpier Old Men (1995),nm0425756,1995
3,4,nm0001845,"[Drama, Romance, Comedy]",Waiting to Exhale (1995),nm0060103,1995
4,5,nm0796124,Comedy,Father of the Bride Part II (1995),"[nm0583600, nm0352443, nm0796124, nm0329304]",1995
...,...,...,...,...,...,...
6802,118700,,Drama,Selma (2014),,2014
6803,118900,nm0885249,Drama,Wild (2014),nm0394984,2014
6804,118997,nm0551128,"[Children, Musical, Comedy, Fantasy]",Into the Woods (2014),nm0487567,2014
6805,119141,"[nm1698571, nm0736622]","[Action, Comedy]",The Interview (2014),"[nm1698571, nm0736622]",2014


## 2. 결측치 탐색

In [6]:
isna = grouped_movie.isna().sum()
isna

item           0
director    1304
genre          0
title          0
writer      1159
year           8
dtype: int64

In [7]:
tmp = grouped_movie[grouped_movie["director"].isna()]
tmp = tmp.isna().sum()
tmp

item           0
director    1304
genre          0
title          0
writer       629
year           2
dtype: int64

### 2-1 년도 결측치 채우기

In [8]:
year_isna = grouped_movie[grouped_movie["year"].isna()]
year_isna

Unnamed: 0,item,director,genre,title,writer,year
1847,3310,nm0000122,"[Drama, Comedy]","Kid, The (1921)",nm0000122,
3455,6987,,"[Crime, Fantasy, Horror]","Cabinet of Dr. Caligari, The (Cabinet des Dr. ...",nm0562346,
3456,6988,nm0000428,"[Drama, Romance]",Broken Blossoms or The Yellow Man and the Girl...,nm0000428,
3513,7065,nm0000428,"[Drama, War]","Birth of a Nation, The (1915)","[nm0000428, nm0940488]",
3605,7243,nm0000428,Drama,Intolerance: Love's Struggle Throughout the Ag...,"[nm0000428, nm0002616, nm0115218, nm0940488]",
3948,8511,nm0000122,Comedy,"Immigrant, The (1917)",nm0000122,
4595,32898,,"[Adventure, Sci-Fi, Action, Fantasy]","Trip to the Moon, A (Voyage dans la lune, Le) ...","[nm0920229, nm0894523]",
6806,119145,nm0891216,"[Adventure, Crime, Action, Comedy]",Kingsman: The Secret Service (2015),"[nm0963359, nm2092839, nm1733301, nm0891216]",


In [9]:
grouped_movie["year"].fillna(
    grouped_movie["title"].apply(
        lambda x: (
            re.search(r"(\d{4})", x).group()
            if re.search(r"(\d{4})", x) and "year" in grouped_movie.columns
            else np.NaN
        )
    ),
    inplace=True,
)
grouped_movie

Unnamed: 0,item,director,genre,title,writer,year
0,1,nm0005124,"[Animation, Comedy, Children, Adventure, Fantasy]",Toy Story (1995),"[nm0005124, nm0004056, nm0230032, nm0710020, n...",1995
1,2,nm0002653,"[Children, Adventure, Fantasy]",Jumanji (1995),"[nm0378144, nm0885575, nm0852430]",1995
2,3,nm0222043,"[Romance, Comedy]",Grumpier Old Men (1995),nm0425756,1995
3,4,nm0001845,"[Drama, Romance, Comedy]",Waiting to Exhale (1995),nm0060103,1995
4,5,nm0796124,Comedy,Father of the Bride Part II (1995),"[nm0583600, nm0352443, nm0796124, nm0329304]",1995
...,...,...,...,...,...,...
6802,118700,,Drama,Selma (2014),,2014
6803,118900,nm0885249,Drama,Wild (2014),nm0394984,2014
6804,118997,nm0551128,"[Children, Musical, Comedy, Fantasy]",Into the Woods (2014),nm0487567,2014
6805,119141,"[nm1698571, nm0736622]","[Action, Comedy]",The Interview (2014),"[nm1698571, nm0736622]",2014


In [10]:
check_year = pd.DataFrame()
for i in year_isna["item"].tolist():
    check_year = pd.concat(
        [check_year.loc[:], grouped_movie[grouped_movie["item"] == i]]
    )
check_year

Unnamed: 0,item,director,genre,title,writer,year
1847,3310,nm0000122,"[Drama, Comedy]","Kid, The (1921)",nm0000122,1921
3455,6987,,"[Crime, Fantasy, Horror]","Cabinet of Dr. Caligari, The (Cabinet des Dr. ...",nm0562346,1920
3456,6988,nm0000428,"[Drama, Romance]",Broken Blossoms or The Yellow Man and the Girl...,nm0000428,1919
3513,7065,nm0000428,"[Drama, War]","Birth of a Nation, The (1915)","[nm0000428, nm0940488]",1915
3605,7243,nm0000428,Drama,Intolerance: Love's Struggle Throughout the Ag...,"[nm0000428, nm0002616, nm0115218, nm0940488]",1916
3948,8511,nm0000122,Comedy,"Immigrant, The (1917)",nm0000122,1917
4595,32898,,"[Adventure, Sci-Fi, Action, Fantasy]","Trip to the Moon, A (Voyage dans la lune, Le) ...","[nm0920229, nm0894523]",1902
6806,119145,nm0891216,"[Adventure, Crime, Action, Comedy]",Kingsman: The Secret Service (2015),"[nm0963359, nm2092839, nm1733301, nm0891216]",2015


## 3. 데이터 저장

### 3-1 train_ratings.csv에 grouped_movie 정보 merge

In [15]:
trains = pd.merge(train, grouped_movie, on="item", how="inner")
trains = trains.sort_values(by="user")
trains

Unnamed: 0,user,item,time,director,genre,title,writer,year
0,11,4643,1230782529,nm0000318,"[Adventure, Drama, Sci-Fi, Action]",Planet of the Apes (2001),"[nm0742797, nm0099541, nm0115310, nm0465199]",2001
324906,11,37830,1230788438,,"[Animation, Sci-Fi, Action, Adventure, Fantasy]",Final Fantasy VII: Advent Children (2004),nm0756983,2004
325641,11,60040,1230788448,nm0504642,"[Sci-Fi, Action]","Incredible Hulk, The (2008)","[nm0456158, nm0672015, nm0498278]",2008
327148,11,34319,1230788451,nm0000881,"[Sci-Fi, Thriller, Action]","Island, The (2005)","[nm0649460, nm1047021, nm0476064]",2005
329565,11,8644,1230788473,nm0001639,"[Adventure, Sci-Fi, Thriller, Action]","I, Robot (2004)","[nm0001920, nm0326040, nm0899113]",2004
...,...,...,...,...,...,...,...,...
2913055,138493,2085,1256750533,"[nm0314671, nm0718627, nm0527217]","[Children, Adventure, Animation]",101 Dalmatians (One Hundred and One Dalmatians...,"[nm0670328, nm0807977]",1961
477340,138493,8636,1258135133,nm0000600,"[Adventure, Sci-Fi, Action]",Spider-Man 2 (2004),"[nm0228492, nm0149290, nm0498278, nm0765091, n...",2004
1490967,138493,44022,1260209449,nm0757858,"[Children, Comedy, Adventure, Animation]",Ice Age 2: The Meltdown (2006),"[nm0310087, nm5022110, nm0841532]",2006
369728,138493,1748,1255805576,nm0001639,"[Film-Noir, Sci-Fi, Thriller, Adventure]",Dark City (1998),"[nm0275286, nm0229644]",1998


### 3-2 trains의 결측치

In [12]:
train_isna = trains.isna().sum()
train_isna

user             0
item             0
time             0
director    301445
genre            0
title            0
writer      312692
year             0
dtype: int64

In [13]:
tmp = trains[trains["writer"].isna()]
tmp = tmp.isna().sum()
tmp

user             0
item             0
time             0
director    118851
genre            0
title            0
writer      312692
year             0
dtype: int64

### 3-3 movie 정보와 user_movie interaction 정보 저장

In [14]:
grouped_movie.to_csv("./data/train/movie.csv", index=False)
trains.to_csv("./data/train/user_movie.csv", index=False)