In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

In [3]:
print('df_train 칼럼 :', df_train.columns)
print('df_test 칼럼 :', df_test.columns)

df_train 칼럼 : Index(['title', 'year', 'month', 'type', 'runtime', 'season_count',
       'episode_count', 'genre', 'director', 'cast', 'country', 'language',
       'production_company', 'synopsis', 'tags', 'imdb_rating',
       'imdb_review_count', 'text_input', 'production_company_is_missing',
       'director_is_missing', 'cast_is_missing', 'cast_award_score',
       'director_top_score', 'actor_top_score',
       'director_productivity_success', 'success_label'],
      dtype='object')
df_test 칼럼 : Index(['title', 'year', 'month', 'type', 'runtime', 'season_count',
       'episode_count', 'genre', 'director', 'cast', 'country', 'language',
       'production_company', 'synopsis', 'tags', 'imdb_rating',
       'imdb_review_count', 'text_input', 'production_company_is_missing',
       'cast_is_missing', 'director_is_missing', 'cast_award_score',
       'director_top_score', 'actor_top_score',
       'director_productivity_success'],
      dtype='object')


# 모델링 칼럼 설정

In [4]:
df_train = df_train[['title', 'month', 'type', 'runtime', 'season_count',
       'episode_count', 'genre', 'country', 'language', 'tags', 'production_company_is_missing',
       'director_is_missing', 'cast_is_missing', 'cast_award_score',
       'director_top_score', 'actor_top_score',
       'director_productivity_success', 'success_label']]

df_test = df_test[['title', 'month', 'type', 'runtime', 'season_count',
       'episode_count', 'genre', 'country', 'language', 'tags', 'production_company_is_missing',
       'director_is_missing', 'cast_is_missing', 'cast_award_score',
       'director_top_score', 'actor_top_score',
       'director_productivity_success']]

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63278 entries, 0 to 63277
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   title                          63278 non-null  object 
 1   month                          63278 non-null  int64  
 2   type                           63278 non-null  object 
 3   runtime                        63278 non-null  float64
 4   season_count                   63278 non-null  float64
 5   episode_count                  63278 non-null  float64
 6   genre                          63278 non-null  object 
 7   country                        63278 non-null  object 
 8   language                       63278 non-null  object 
 9   tags                           63278 non-null  object 
 10  production_company_is_missing  63278 non-null  int64  
 11  director_is_missing            63278 non-null  int64  
 12  cast_is_missing                63278 non-null 

In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1477 entries, 0 to 1476
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   title                          1477 non-null   object 
 1   month                          1477 non-null   int64  
 2   type                           1477 non-null   object 
 3   runtime                        1477 non-null   float64
 4   season_count                   1477 non-null   float64
 5   episode_count                  1477 non-null   float64
 6   genre                          1477 non-null   object 
 7   country                        1477 non-null   object 
 8   language                       1477 non-null   object 
 9   tags                           1477 non-null   object 
 10  production_company_is_missing  1477 non-null   int64  
 11  director_is_missing            1477 non-null   int64  
 12  cast_is_missing                1477 non-null   i

In [7]:
df_train.head()

Unnamed: 0,title,month,type,runtime,season_count,episode_count,genre,country,language,tags,production_company_is_missing,director_is_missing,cast_is_missing,cast_award_score,director_top_score,actor_top_score,director_productivity_success,success_label
0,Stepmom's Desire,5,movie,78.0,1.0,1.0,"Drama, Romance",KR,ko,softcore,1,0,0,0.0,0.0,0.0,0.0,1
1,Pizza Dare 1,4,movie,68.0,1.0,1.0,Romance,KR,ko,softcore,1,0,0,0.0,0.0,0.0,0.0,0
2,Bosomy Mom,5,movie,69.0,1.0,1.0,"Drama, Romance",KR,ko,softcore,1,0,0,0.0,0.0,0.0,0.0,0
3,Sonic the Hedgehog,2,movie,99.0,1.0,1.0,"Action, Science Fiction, Comedy, Family",US,en,"friendship, video game, san francisco, califor...",0,0,0,0.0,0.0,0.0,0.0,1
4,Demon Slayer -Kimetsu no Yaiba- The Movie: Mug...,10,movie,117.0,1.0,1.0,"Animation, Action, Fantasy, Thriller",JP,ja,"fight, magic, supernatural, psychology, gore, ...",0,0,0,0.0,0.0,0.0,0.0,1


# df_train

## type - onehot encoding

In [8]:
df_train['type'].value_counts(dropna=False)

type
movie    47290
tv       15988
Name: count, dtype: int64

In [9]:
import pandas as pd

# 원핫인코딩
type_encoded = pd.get_dummies(df_train['type'], prefix='type')

# True/False가 나왔을 때 1/0으로 변환 (대부분 불필요)
type_encoded = type_encoded.astype(int)

# 원본 데이터에 붙이기
df_train = pd.concat([df_train, type_encoded], axis=1)
df_train = df_train.drop(columns=['type'])


In [10]:
df_train.head()

Unnamed: 0,title,month,runtime,season_count,episode_count,genre,country,language,tags,production_company_is_missing,director_is_missing,cast_is_missing,cast_award_score,director_top_score,actor_top_score,director_productivity_success,success_label,type_movie,type_tv
0,Stepmom's Desire,5,78.0,1.0,1.0,"Drama, Romance",KR,ko,softcore,1,0,0,0.0,0.0,0.0,0.0,1,1,0
1,Pizza Dare 1,4,68.0,1.0,1.0,Romance,KR,ko,softcore,1,0,0,0.0,0.0,0.0,0.0,0,1,0
2,Bosomy Mom,5,69.0,1.0,1.0,"Drama, Romance",KR,ko,softcore,1,0,0,0.0,0.0,0.0,0.0,0,1,0
3,Sonic the Hedgehog,2,99.0,1.0,1.0,"Action, Science Fiction, Comedy, Family",US,en,"friendship, video game, san francisco, califor...",0,0,0,0.0,0.0,0.0,0.0,1,1,0
4,Demon Slayer -Kimetsu no Yaiba- The Movie: Mug...,10,117.0,1.0,1.0,"Animation, Action, Fantasy, Thriller",JP,ja,"fight, magic, supernatural, psychology, gore, ...",0,0,0,0.0,0.0,0.0,0.0,1,1,0


In [11]:
df_train.columns

Index(['title', 'month', 'runtime', 'season_count', 'episode_count', 'genre',
       'country', 'language', 'tags', 'production_company_is_missing',
       'director_is_missing', 'cast_is_missing', 'cast_award_score',
       'director_top_score', 'actor_top_score',
       'director_productivity_success', 'success_label', 'type_movie',
       'type_tv'],
      dtype='object')

In [12]:
df_train = df_train[['title', 'month', 'type_movie','type_tv', 'runtime', 'season_count', 'episode_count', 'genre',
       'country', 'language', 'tags', 'production_company_is_missing', 'director_is_missing', 'cast_is_missing', 'cast_award_score',
       'director_top_score', 'actor_top_score', 'director_productivity_success', 'success_label']]
df_train.head()

Unnamed: 0,title,month,type_movie,type_tv,runtime,season_count,episode_count,genre,country,language,tags,production_company_is_missing,director_is_missing,cast_is_missing,cast_award_score,director_top_score,actor_top_score,director_productivity_success,success_label
0,Stepmom's Desire,5,1,0,78.0,1.0,1.0,"Drama, Romance",KR,ko,softcore,1,0,0,0.0,0.0,0.0,0.0,1
1,Pizza Dare 1,4,1,0,68.0,1.0,1.0,Romance,KR,ko,softcore,1,0,0,0.0,0.0,0.0,0.0,0
2,Bosomy Mom,5,1,0,69.0,1.0,1.0,"Drama, Romance",KR,ko,softcore,1,0,0,0.0,0.0,0.0,0.0,0
3,Sonic the Hedgehog,2,1,0,99.0,1.0,1.0,"Action, Science Fiction, Comedy, Family",US,en,"friendship, video game, san francisco, califor...",0,0,0,0.0,0.0,0.0,0.0,1
4,Demon Slayer -Kimetsu no Yaiba- The Movie: Mug...,10,1,0,117.0,1.0,1.0,"Animation, Action, Fantasy, Thriller",JP,ja,"fight, magic, supernatural, psychology, gore, ...",0,0,0,0.0,0.0,0.0,0.0,1


In [13]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63278 entries, 0 to 63277
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   title                          63278 non-null  object 
 1   month                          63278 non-null  int64  
 2   type_movie                     63278 non-null  int32  
 3   type_tv                        63278 non-null  int32  
 4   runtime                        63278 non-null  float64
 5   season_count                   63278 non-null  float64
 6   episode_count                  63278 non-null  float64
 7   genre                          63278 non-null  object 
 8   country                        63278 non-null  object 
 9   language                       63278 non-null  object 
 10  tags                           63278 non-null  object 
 11  production_company_is_missing  63278 non-null  int64  
 12  director_is_missing            63278 non-null 

In [14]:
# 쉼표 기준으로 split 후 모두 펼치기
all_genres = df_train['genre'].str.split(', ').explode()

# 고유 장르 종류 수
num_unique_genres = all_genres.nunique()
print(f"장르 종류 수: {num_unique_genres}")

# 고유 장르 목록
unique_genres = all_genres.unique()
print("장르 목록:", unique_genres)


장르 종류 수: 27
장르 목록: ['Drama' 'Romance' 'Action' 'Science Fiction' 'Comedy' 'Family'
 'Animation' 'Fantasy' 'Thriller' 'Adventure' 'Crime' 'War' 'Horror'
 'Documentary' 'Mystery' 'Music' 'History' 'TV Movie' 'Western' 'Soap'
 'Talk' 'Reality' 'Kids' 'News' 'Sci-Fi & Fantasy' 'Action & Adventure'
 'War & Politics']


In [15]:
# 쉼표 기준으로 split 후 모두 펼치기
all_genres = df_train['tags'].str.split(', ').explode()

# 고유 장르 종류 수
num_unique_genres = all_genres.nunique()
print(f"tags 종류 수: {num_unique_genres}")

# 고유 장르 목록
unique_genres = all_genres.unique()
print("tags 목록:", unique_genres)


tags 종류 수: 21367
tags 목록: ['softcore' 'friendship' 'video game' ... 'pickles' 'ben vasion' 'vasion']


- tags 차원 축소 필요(CountVectorizer)

In [16]:
# 쉼표 기준으로 split 후 모두 펼치기
all_genres = df_train['language'].str.split(', ').explode()

# 고유 장르 종류 수
num_unique_genres = all_genres.nunique()
print(f"언어 종류 수: {num_unique_genres}")

# 고유 장르 목록
unique_genres = all_genres.unique()
print("언어 목록:", unique_genres)


언어 종류 수: 127
언어 목록: ['ko' 'en' 'ja' 'pl' 'ru' 'da' 'zh' 'fr' 'te' 'de' 'es' 'it' 'cn' 'hi'
 'ta' 'mr' 'nl' 'ar' 'no' 'pa' 'th' 'sv' 'id' 'hu' 'pt' 'fi' 'kn' 'eu'
 'tr' 'hr' 'ml' 'uk' 'sr' 'cs' 'fa' 'ca' 'ka' 'tl' 'xx' 'bn' 'xh' 'lt'
 'ku' 'mk' 'is' 'af' 'st' 'el' 'kk' 'ro' 'sl' 'ms' 'sk' 'et' 'lv' 'he'
 'wo' 'bg' 'my' 'si' 'hy' 'vi' 'ne' 'gl' 'lo' 'az' 'gu' 'sw' 'ga' 'mi'
 'uz' 'ln' 'nb' 'ug' 'ak' 'ha' 'jv' 'mn' 'or' 'be' 'yo' 'ky' 'ki' 'dz'
 'km' 'bs' 'bo' 'ur' 'la' 'am' 'zu' 'sq' 'cy' 'mt' 'ps' 'os' 'om' 'tg'
 'ab' 'nn' 'se' 'su' 'nd' 'ht' 'gn' 'fy' 'kl' 'ho' 'tt' 'lb' 'yi' 'ff'
 'eo' 'ig' 'qu' 'mg' 'ks' 'sh' 'as' 'so' 'mo' 'rm' 'gd' 'sd' 'fo' 'tn'
 'sn']


In [17]:
# 쉼표 기준으로 split 후 모두 펼치기
all_genres = df_train['country'].str.split(', ').explode()

# 고유 장르 종류 수
num_unique_genres = all_genres.nunique()
print(f"나라 종류 수: {num_unique_genres}")

# 고유 장르 목록
unique_genres = all_genres.unique()
print("나라 목록:", unique_genres)


나라 종류 수: 218
나라 목록: ['KR' 'US' 'JP' 'GB' 'CA' 'PL' 'RU' 'CN' 'SG' 'AU' 'DE' 'DK' 'FR' 'NZ'
 'IN' 'FI' 'SE' 'IE' 'LU' 'IT' 'TW' 'BE' 'ES' 'CL' 'AT' 'CH' 'NO' 'HU'
 'BY' 'HK' 'AR' 'NL' 'AE' 'ZA' 'SA' 'UA' 'TH' 'ID' 'MU' 'PT' 'MX' 'BR'
 'TR' 'HR' 'IS' 'RS' 'GE' 'AO' 'CZ' 'IR' 'EG' 'TN' 'PR' 'LK' 'PH' 'YU'
 'BD' 'PS' 'EE' 'LT' 'MK' 'GH' 'GR' 'KZ' 'UY' 'RO' 'SI' 'MY' 'SD' 'IL'
 'ST' 'BG' 'CM' 'SK' 'kn' 'LV' 'CO' 'SV' 'MA' 'NG' 'GT' 'MM' 'AM' 'PE'
 'VE' 'EC' 'VN' 'NE' 'KH' 'LA' 'ET' 'KW' 'XH' 'AN' 'JO' 'ZW' 'UG' 'XI'
 'UZ' 'PK' 'LN' 'QA' 'LB' 'DO' 'CU' 'MN' 'DZ' 'BH' 'en' 'KG' 'KM' 'XK'
 'AZ' 'NP' 'BA' 'AF' 'xx' 'MC' 'BO' 'GU' 'hi' 'GI' 'zh' 'ja' 'IQ' 'ko'
 'ar' 'lv' 'fi' 'it' 'te' 'TO' 'fr' 'SY' 'IO' 'ru' 'tr' 'de' 'pt' 'fa'
 'es' 'nl' 'eu' 'th' 'is' 'CY' 'sv' 'MD' 'no' 'da' 'CI' 'AL' 'HN' 'TD'
 'MT' 'PA' 'CR' 'bn' 'ur' 'MO' 'TJ' 'LY' 'TT' 'UR' 'MW' 'CD' 'BT' 'TZ'
 'nn' 'KE' 'OM' 'SL' 'FO' 'el' 'CG' 'GA' 'ro' 'SN' 'KP' 'KN' 'PY' 'HT'
 'BB' 'ML' 'SS' 'TG' 'NI' 'GL' 'ME' 'ms' 'BF' 'RE' 'sr' '

- 상위 n개 나누거나 grouping 고려
  - 상위 20개 + other 통일...?

# df_test

## type - onehotencoding

In [18]:
df_test['type'].value_counts(dropna=False)

type
movie    1404
tv         73
Name: count, dtype: int64

In [19]:
# 원핫인코딩
type_encoded = pd.get_dummies(df_test['type'], prefix='type')

# True/False가 나왔을 때 1/0으로 변환 (대부분 불필요)
type_encoded = type_encoded.astype(int)

# 원본 데이터에 붙이기
df_test = pd.concat([df_test, type_encoded], axis=1)
df_test = df_test.drop(columns=['type'])

df_test.head()

Unnamed: 0,title,month,runtime,season_count,episode_count,genre,country,language,tags,production_company_is_missing,director_is_missing,cast_is_missing,cast_award_score,director_top_score,actor_top_score,director_productivity_success,type_movie,type_tv
0,STRAW,6,105.0,1.0,1.0,"Thriller, Drama, Crime",US,en,"angry, aggressive, hopeless, anxious, provocat...",0,0,0,0.0,0.0,0.0,8.0,1,0
1,Predator: Killer of Killers,6,85.0,1.0,1.0,"Animation, Action, Science Fiction",US,en,"world war ii, pilot, vikings (norsemen), antho...",0,0,0,0.0,0.0,0.2,1.0,1,0
2,How to Train Your Dragon,6,125.0,1.0,1.0,"Action, Family, Fantasy",US,en,"flying, vikings (norsemen), remake, dragon, ba...",0,0,0,0.0,0.0,0.0,0.0,1,0
3,Deep Cover,6,99.0,1.0,1.0,"Action, Comedy, Crime",US,en,"london, england, sting operation, met police, ...",0,0,0,0.0,0.0,0.2,0.0,1,0
4,K.O.,6,84.0,1.0,1.0,"Action, Drama, Adventure",FR,fr,"action, thriller, startup, drama, historical, ...",0,0,0,0.0,0.0,0.0,0.0,1,0


In [20]:
df_test.columns

Index(['title', 'month', 'runtime', 'season_count', 'episode_count', 'genre',
       'country', 'language', 'tags', 'production_company_is_missing',
       'director_is_missing', 'cast_is_missing', 'cast_award_score',
       'director_top_score', 'actor_top_score',
       'director_productivity_success', 'type_movie', 'type_tv'],
      dtype='object')

In [21]:
df_test = df_test[['title', 'month', 'type_movie','type_tv', 'runtime', 'season_count', 'episode_count', 'genre',
       'country', 'language', 'tags', 'production_company_is_missing', 'director_is_missing', 'cast_is_missing', 'cast_award_score',
       'director_top_score', 'actor_top_score', 'director_productivity_success']]
df_test.head()

Unnamed: 0,title,month,type_movie,type_tv,runtime,season_count,episode_count,genre,country,language,tags,production_company_is_missing,director_is_missing,cast_is_missing,cast_award_score,director_top_score,actor_top_score,director_productivity_success
0,STRAW,6,1,0,105.0,1.0,1.0,"Thriller, Drama, Crime",US,en,"angry, aggressive, hopeless, anxious, provocat...",0,0,0,0.0,0.0,0.0,8.0
1,Predator: Killer of Killers,6,1,0,85.0,1.0,1.0,"Animation, Action, Science Fiction",US,en,"world war ii, pilot, vikings (norsemen), antho...",0,0,0,0.0,0.0,0.2,1.0
2,How to Train Your Dragon,6,1,0,125.0,1.0,1.0,"Action, Family, Fantasy",US,en,"flying, vikings (norsemen), remake, dragon, ba...",0,0,0,0.0,0.0,0.0,0.0
3,Deep Cover,6,1,0,99.0,1.0,1.0,"Action, Comedy, Crime",US,en,"london, england, sting operation, met police, ...",0,0,0,0.0,0.0,0.2,0.0
4,K.O.,6,1,0,84.0,1.0,1.0,"Action, Drama, Adventure",FR,fr,"action, thriller, startup, drama, historical, ...",0,0,0,0.0,0.0,0.0,0.0


In [22]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1477 entries, 0 to 1476
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   title                          1477 non-null   object 
 1   month                          1477 non-null   int64  
 2   type_movie                     1477 non-null   int32  
 3   type_tv                        1477 non-null   int32  
 4   runtime                        1477 non-null   float64
 5   season_count                   1477 non-null   float64
 6   episode_count                  1477 non-null   float64
 7   genre                          1477 non-null   object 
 8   country                        1477 non-null   object 
 9   language                       1477 non-null   object 
 10  tags                           1477 non-null   object 
 11  production_company_is_missing  1477 non-null   int64  
 12  director_is_missing            1477 non-null   i

In [23]:
# 쉼표 기준으로 split 후 모두 펼치기
all_genres = df_test['genre'].str.split(', ').explode()

# 고유 장르 종류 수
num_unique_genres = all_genres.nunique()
print(f"장르 종류 수: {num_unique_genres}")

# 고유 장르 목록
unique_genres = all_genres.unique()
print("장르 목록:", unique_genres)


장르 종류 수: 25
장르 목록: ['Thriller' 'Drama' 'Crime' 'Animation' 'Action' 'Science Fiction'
 'Family' 'Fantasy' 'Comedy' 'Adventure' 'Romance' 'Horror' 'Mystery'
 'Documentary' 'Western' 'History' 'Music' 'TV Movie' 'War'
 'Action & Adventure' 'Sci-Fi & Fantasy' 'War & Politics' 'Soap' 'Reality'
 'Kids']


In [24]:
# 쉼표 기준으로 split 후 모두 펼치기
all_genres = df_test['tags'].str.split(', ').explode()

# 고유 장르 종류 수
num_unique_genres = all_genres.nunique()
print(f"tags 종류 수: {num_unique_genres}")

# 고유 장르 목록
unique_genres = all_genres.unique()
print("tags 목록:", unique_genres)


tags 종류 수: 2079
tags 목록: ['angry' 'aggressive' 'hopeless' ... 'solo knacks' 'studio feedfront'
 'headcrackin']


In [25]:
# 쉼표 기준으로 split 후 모두 펼치기
all_genres = df_test['language'].str.split(', ').explode()

# 고유 장르 종류 수
num_unique_genres = all_genres.nunique()
print(f"언어 종류 수: {num_unique_genres}")

# 고유 장르 목록
unique_genres = all_genres.unique()
print("언어 목록:", unique_genres)


언어 종류 수: 60
언어 목록: ['en' 'fr' 'bn' 'es' 'ar' 'hi' 'ta' 'id' 'zh' 'ko' 'tr' 'te' 'ur' 'tl'
 'no' 'th' 'sv' 'fa' 'de' 'or' 'ja' 'ml' 'pt' 'pa' 'ru' 'ms' 'sr' 'cn'
 'he' 'et' 'da' 'ka' 'ro' 'nl' 'vi' 'el' 'it' 'lb' 'cs' 'sq' 'uk' 'ca'
 'fi' 'kn' 'mn' 'sk' 'pl' 'lv' 'hy' 'xx' 'lt' 'ne' 'mi' 'kk' 'eu' 'is'
 'as' 'nb' 'km' 'fj']


In [26]:
# 쉼표 기준으로 split 후 모두 펼치기
all_genres = df_test['country'].str.split(', ').explode()

# 고유 장르 종류 수
num_unique_genres = all_genres.nunique()
print(f"나라 종류 수: {num_unique_genres}")

# 고유 장르 목록
unique_genres = all_genres.unique()
print("나라 목록:", unique_genres)

나라 종류 수: 91
나라 목록: ['US' 'FR' 'BD' 'MX' 'GB' 'SA' 'IN' 'ID' 'CN' 'HK' 'KR' 'TR' 'PK' 'ES'
 'PH' 'NO' 'TH' 'SE' 'IR' 'DE' 'CA' 'JP' 'BR' 'EE' 'IT' 'RU' 'MY' 'BE'
 'SG' 'IE' 'LU' 'AR' 'DK' 'FI' 'DZ' 'RS' 'AU' 'NZ' 'TW' 'en' 'CH' 'EG'
 'NG' 'JO' 'ZA' 'CY' 'IL' 'PT' 'CO' 'PE' 'UY' 'DO' 'NL' 'AT' 'GE' 'RO'
 'CZ' 'CL' 'GA' 'VN' 'PS' 'GR' 'QA' 'SK' 'XK' 'MK' 'PL' 'UA' 'HR' 'GY'
 'LT' 'LV' 'MN' 'ZW' 'AM' 'GT' 'PR' 'NP' 'IS' 'EC' 'KZ' 'SI' 'CR' 'fa'
 'MO' 'IQ' 'HN' 'ME' 'UG' 'FJ' 'SN']


In [27]:
# 1. null 처리
df_train['genre'] = df_train['genre'].fillna('')
df_test['genre'] = df_test['genre'].fillna('')

# 2. 리스트 변환
df_train['genre_list'] = df_train['genre'].apply(lambda x: x.split(',') if x != '' else [])
df_test['genre_list'] = df_test['genre'].apply(lambda x: x.split(',') if x != '' else [])

# 3. MultiLabelBinarizer
mlb = MultiLabelBinarizer()
genre_encoded_train = mlb.fit_transform(df_train['genre_list'])
genre_encoded_test = mlb.transform(df_test['genre_list'])

# 4. 컬럼명 생성 및 DataFrame 변환
genre_colnames = [f'genre_{g}' for g in mlb.classes_]
df_genre_train = pd.DataFrame(genre_encoded_train, columns=genre_colnames, index=df_train.index)
df_genre_test = pd.DataFrame(genre_encoded_test, columns=genre_colnames, index=df_test.index)

# 5. 붙이기
df_train = pd.concat([df_train, df_genre_train], axis=1)
df_test = pd.concat([df_test, df_genre_test], axis=1)

# 6. 기존 컬럼 제거
df_train = df_train.drop(columns=['genre', 'genre_list'])
df_test = df_test.drop(columns=['genre', 'genre_list'])


In [28]:
df_train.shape

(63278, 72)

In [29]:
df_test.shape

(1477, 71)

In [30]:
# 1. tags 컬럼 결측값 처리 (빈 문자열로)
df_train['tags'] = df_train['tags'].fillna('')
df_test['tags'] = df_test['tags'].fillna('')

# 2. CountVectorizer 생성 (필요에 따라 max_features 조절 가능)
vectorizer = CountVectorizer(max_features=5000)  # 예: 최대 5000개 단어 사용

# 3. train tags 텍스트를 CountVectorizer로 fit_transform
tags_train_counts = vectorizer.fit_transform(df_train['tags'])

# 4. test tags 텍스트는 transform만 수행
tags_test_counts = vectorizer.transform(df_test['tags'])

# 5. TruncatedSVD 생성 (차원 수는 적절히 조절, 예: 100)
svd = TruncatedSVD(n_components=100, random_state=42)

# 6. train tags 차원 축소
tags_train_svd = svd.fit_transform(tags_train_counts)

# 7. test tags 차원 축소
tags_test_svd = svd.transform(tags_test_counts)

# 8. SVD 결과를 DataFrame으로 변환 (컬럼명 지정)
svd_colnames = [f'tags_svd_{i}' for i in range(tags_train_svd.shape[1])]
df_tags_train = pd.DataFrame(tags_train_svd, columns=svd_colnames, index=df_train.index)
df_tags_test = pd.DataFrame(tags_test_svd, columns=svd_colnames, index=df_test.index)

# 9. 원본 데이터프레임에 붙이기
df_train = pd.concat([df_train, df_tags_train], axis=1)
df_test = pd.concat([df_test, df_tags_test], axis=1)

# 10. 필요하면 원본 'tags' 컬럼 제거
df_train = df_train.drop(columns=['tags'])
df_test = df_test.drop(columns=['tags'])


In [31]:
df_train.shape

(63278, 171)

In [32]:
df_test.shape

(1477, 170)

In [33]:
import pandas as pd

# 결측값 처리
df_train['country'] = df_train['country'].fillna('')
df_test['country'] = df_test['country'].fillna('')

# 상위 30개 country 추출
top_30_countries = df_train['country'].value_counts().nlargest(30).index.tolist()

def map_country(c):
    return c if c in top_30_countries else 'others'

# 매핑 적용
df_train['country_mapped'] = df_train['country'].apply(map_country)
df_test['country_mapped'] = df_test['country'].apply(map_country)

# 원핫 인코딩
country_dummies_train = pd.get_dummies(df_train['country_mapped'], prefix='country')
country_dummies_test = pd.get_dummies(df_test['country_mapped'], prefix='country')

# df_train에 없는 컬럼이 test에 없을 수 있으니, test에 train 컬럼을 맞춤
missing_cols = set(country_dummies_train.columns) - set(country_dummies_test.columns)
for c in missing_cols:
    country_dummies_test[c] = 0

# 컬럼 순서 train 기준으로 맞추기
country_dummies_test = country_dummies_test[country_dummies_train.columns]

# 데이터프레임에 붙이기
df_train = pd.concat([df_train, country_dummies_train], axis=1)
df_test = pd.concat([df_test, country_dummies_test], axis=1)

# 기존 컬럼 제거
df_train = df_train.drop(columns=['country', 'country_mapped'])
df_test = df_test.drop(columns=['country', 'country_mapped'])


In [34]:
print(df_train.shape)
print(df_test.shape)

(63278, 201)
(1477, 200)


In [35]:
success = df_train['success_label']
df_train = df_train.drop(columns=['success_label'])
df_train['success_label'] = success

In [38]:
# 모든 bool 타입 컬럼을 int로 변환
bool_cols = df_train.select_dtypes(include='bool').columns
df_train[bool_cols] = df_train[bool_cols].astype(int)

bool_cols_test = df_test.select_dtypes(include='bool').columns
df_test[bool_cols_test] = df_test[bool_cols_test].astype(int)

In [40]:
df_train.to_csv('./data/df_train.csv', index=False)
df_test.to_csv('./data/df_test.csv', index=False)

In [39]:
df_train.head()

Unnamed: 0,title,month,type_movie,type_tv,runtime,season_count,episode_count,language,production_company_is_missing,director_is_missing,...,country_PL,country_RU,country_SE,country_TH,country_TR,country_TW,country_US,"country_US, GB",country_others,success_label
0,Stepmom's Desire,5,1,0,78.0,1.0,1.0,ko,1,0,...,0,0,0,0,0,0,0,0,0,1
1,Pizza Dare 1,4,1,0,68.0,1.0,1.0,ko,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Bosomy Mom,5,1,0,69.0,1.0,1.0,ko,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Sonic the Hedgehog,2,1,0,99.0,1.0,1.0,en,0,0,...,0,0,0,0,0,0,1,0,0,1
4,Demon Slayer -Kimetsu no Yaiba- The Movie: Mug...,10,1,0,117.0,1.0,1.0,ja,0,0,...,0,0,0,0,0,0,0,0,0,1
