## Disney+ Movies and TV Shows
+ Dataset Source - Kaggle Dataset

#### Content-based filtering recommender systems implemented

- 데이터 로드

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('./disney_plus_titles.csv')

- 데이터 정보 확인

In [3]:
data.head(n=3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Duck the Halls: A Mickey Mouse Christmas Special,"Alonso Ramirez Ramos, Dave Wasson","Chris Diamantopoulos, Tony Anselmo, Tress MacN...",,"November 26, 2021",2016,TV-G,23 min,"Animation, Family",Join Mickey and the gang as they duck the halls!
1,s2,Movie,Ernest Saves Christmas,John Cherry,"Jim Varney, Noelle Parker, Douglas Seale",,"November 26, 2021",1988,PG,91 min,Comedy,Santa Claus passes his magic bag to a new St. ...
2,s3,Movie,Ice Age: A Mammoth Christmas,Karen Disher,"Raymond Albert Romano, John Leguizamo, Denis L...",United States,"November 26, 2021",2011,TV-G,23 min,"Animation, Comedy, Family",Sid the Sloth is on Santa's naughty list.


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450 entries, 0 to 1449
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       1450 non-null   object
 1   type          1450 non-null   object
 2   title         1450 non-null   object
 3   director      977 non-null    object
 4   cast          1260 non-null   object
 5   country       1231 non-null   object
 6   date_added    1447 non-null   object
 7   release_year  1450 non-null   int64 
 8   rating        1447 non-null   object
 9   duration      1450 non-null   object
 10  listed_in     1450 non-null   object
 11  description   1450 non-null   object
dtypes: int64(1), object(11)
memory usage: 136.1+ KB


In [5]:
data.isna().sum()

show_id           0
type              0
title             0
director        473
cast            190
country         219
date_added        3
release_year      0
rating            3
duration          0
listed_in         0
description       0
dtype: int64

In [6]:
data.shape

(1450, 12)

In [7]:
data.nunique()

show_id         1450
type               2
title           1450
director         609
cast            1193
country           89
date_added       167
release_year      90
rating             9
duration         158
listed_in        329
description     1448
dtype: int64

In [8]:
data.rename(columns = {'listed_in' : 'genre'}, inplace=True)

- column 추출
 - 추천의 기반이 되는 column 추출
 - 선택한 column에 null 값이 있을 시 해당 행 제거

In [45]:
def data_null_check(data=None, choice_columns=None):
    data_copy = data[choice_columns]
    
    if False in sum(data_copy.isna().values.tolist(), []):
        data_copy = data_copy.dropna().reset_index(drop=True).copy()
    else:
        data_copy = data_copy.copy()
    
    return data_copy

#### 장르, 컨텐츠 설명 기반 추천
 - genre, description column 추출

In [37]:
data_copy = data_null_check(data=data, choice_columns=['title','genre', 'description'])
data_copy.head(n=3)

Unnamed: 0,title,genre,description
0,Duck the Halls: A Mickey Mouse Christmas Special,"Animation, Family",Join Mickey and the gang as they duck the halls!
1,Ernest Saves Christmas,Comedy,Santa Claus passes his magic bag to a new St. ...
2,Ice Age: A Mammoth Christmas,"Animation, Comedy, Family",Sid the Sloth is on Santa's naughty list.


- 카테고리 데이터 인코딩
  - genre

In [38]:
import numpy as np

def to_vector(data=None, choice_category=None):
    
    data_copy = data[choice_category]
    return_df = pd.DataFrame()
    
    for category in choice_category:
        location = list(set(sum(list(data_copy[category].str.split(', ')), [])))
        category_to_vec = np.zeros((len(data_copy) , len(location)))

        for i, key in enumerate(list(data_copy[category].str.split(', '))):
            for k in key:
                category_to_vec[i][location.index(k)] = 1

        vector_df = pd.DataFrame(columns=location, data=category_to_vec)
        return_df = pd.concat([return_df, vector_df], axis=1)
    
    return return_df

In [39]:
genre_df = to_vector(data=data_copy, choice_category=['genre'])
genre_df.head(n=3)

Unnamed: 0,Superhero,Romantic Comedy,Family,Action-Adventure,Crime,Animals & Nature,Talk Show,Coming of Age,Police/Cop,Dance,...,Fantasy,Science Fiction,Buddy,Survival,Travel,Thriller,Animation,Kids,Game Show / Competition,Anime
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


- description 인코딩
 - description 문자열 내 숫자 제거

In [40]:
import re
from string import digits
from sklearn.feature_extraction.text import TfidfVectorizer

def description_encoder(data=None):
    for cnt in range(len(data)):
        temp_str = data.description[cnt:cnt+1].str.lower().str.cat(sep=' ')
        temp_str = re.sub('[^A-Za-z]+', ' ', temp_str)

        newstring = temp_str.translate(digits)

        data.loc[cnt, 'description']= newstring

    tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', token_pattern=r"(?u)\b\w\w+\b")
    tfidf_vectorizer.fit(data.description.values)
    tfidf_feature = tfidf_vectorizer.get_feature_names()
    tfidf_data = tfidf_vectorizer.transform(data.description.values).todense()
    tfidf_df = pd.DataFrame(columns=[tfidf_name for tfidf_name in tfidf_feature], data=tfidf_data)
    
    return tfidf_df

In [41]:
tfidf_df = description_encoder(data=data_copy)
tfidf_df.head(n=3)

Unnamed: 0,abducted,abduction,abilities,ability,aboard,abominable,absorbs,academy,accept,accepted,...,zombies,zombietown,zoo,zookeeper,zoologist,zooms,zoos,zootampa,zorro,zuzo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
renewal_data = pd.concat([genre_df, tfidf_df], axis=1)
renewal_data.head(n=3)

Unnamed: 0,Superhero,Romantic Comedy,Family,Action-Adventure,Crime,Animals & Nature,Talk Show,Coming of Age,Police/Cop,Dance,...,zombies,zombietown,zoo,zookeeper,zoologist,zooms,zoos,zootampa,zorro,zuzo
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- 선택한 컨텐츠 유사도 비교
 - cosine_similarity 사용

In [43]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similarity_content(choice_title=None, data_copy=None, values_data=None):
    cosine_sim = cosine_similarity(values_data.values, values_data.values)
    title_to_index = dict(zip(data_copy['title'], data_copy.index))
    title_idx = title_to_index[choice_title]
    
    sim_score = list(enumerate(cosine_sim[title_idx]))
    sim_score = sorted(sim_score, key=lambda x: x[1], reverse=True)

    sim_movie_idx = [sim_idx[0] for sim_idx in sim_score[:11]]
    
    result = data_copy.iloc[sim_movie_idx]
    
    choice_content = np.array(result[result.title == choice_title])
    print('Choice content\n title : {title} \n {category_1} : {value_1} \n {category_2} : {value_2}'.format(
        title=choice_content[0][0], category_1 = list(data_copy)[1], category_2 = list(data_copy)[2],
        value_1=choice_content[0][1], value_2=choice_content[0][2]))
    
    return result[result.title != choice_title]

In [53]:
choice_title = input()

X-Men Origins: Wolverine


In [54]:
get_similarity_content(choice_title=choice_title, data_copy=data_copy, values_data=renewal_data)

Choice content
 title : X-Men Origins: Wolverine 
 genre : Action-Adventure, Family, Science Fiction 
 description : wolverine unites with legendary x men to fight against forces determined to eliminate mutants 


Unnamed: 0,title,genre,description
131,X-Men,"Action-Adventure, Family, Science Fiction",the x men a small band of outcast mutants figh...
477,X-Men: Days of Future Past,"Action-Adventure, Family, Science Fiction",characters from the x men trilogy join their y...
130,Fantastic Four,"Action-Adventure, Family, Science Fiction",four astronauts develop superpowers and must j...
117,X-Men: Dark Phoenix,"Action-Adventure, Family, Science Fiction",when jean grey transforms into the dark phoeni...
396,X2,"Action-Adventure, Family, Science Fiction",x men newcomer night crawler joins magneto to ...
377,The Mandalorian,"Action-Adventure, Family, Science Fiction",after the fall of the empire a lone gunfighter...
483,Race to Witch Mountain,"Action-Adventure, Family, Science Fiction",a las vegas cab driver helps two aliens disgui...
712,"20,000 Leagues Under the Sea","Action-Adventure, Family, Science Fiction",climb aboard the nautilus and into an undersea...
225,X-Men: The Last Stand,"Action-Adventure, Family, Science Fiction",when professor x clashes with magneto the batt...
227,Fantastic Four: Rise of the Silver Surfer,"Action-Adventure, Family, Science Fiction",the fantastic four meet the silver surfer who ...


#### 감독, 출연진 기반 추천
 - director, cast column 추출

In [34]:
direc_cast_data = data_null_check(data=data, choice_columns=['title','director', 'cast'])
direc_cast_data.head(n=3)

Unnamed: 0,title,director,cast
0,Duck the Halls: A Mickey Mouse Christmas Special,"Alonso Ramirez Ramos, Dave Wasson","Chris Diamantopoulos, Tony Anselmo, Tress MacN..."
1,Ernest Saves Christmas,John Cherry,"Jim Varney, Noelle Parker, Douglas Seale"
2,Ice Age: A Mammoth Christmas,Karen Disher,"Raymond Albert Romano, John Leguizamo, Denis L..."


- 카테고리 데이터 인코딩
 - director, cast

In [35]:
direc_cast_df = to_vector(data=data_copy, choice_category=['director', 'cast'])
direc_cast_df.head(n=3)

Unnamed: 0,James Hayman,Angus MacLane,Sidney Beaumont,Dave Wasson,Bobs Gannaway,Maggie Greenwald,Jay Karas,Randal Kleiser,Riley Thomas,Alan Zaslove,...,Jerome Ranft,Matt Cornett,Bill Paxton,Laura Dern,Phillip Van Dyke,Scott Glenn,Jeanie Roberts,Kylie Leydon,Daveigh Chase,Andrew Allen
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- 선택한 컨텐츠 유사도 비교
 - cosine_similarity 사용

In [55]:
choice_title = input()

Ice Age: A Mammoth Christmas


In [36]:
get_similarity_content(choice_title=choice_title, data_copy=direc_cast_data, values_data=direc_cast_df)

Choice content
 title : Ice Age: A Mammoth Christmas 
 director : Karen Disher 
 cast : Raymond Albert Romano, John Leguizamo, Denis Leary, Queen Latifah


Unnamed: 0,title,director,cast
515,Ice Age: The Great Egg-Scapade,Ricardo Curtis,"Raymond Albert Romano, John Leguizamo, Denis L..."
71,Ice Age: The Meltdown,Carlos Saldanha,"Ray Romano, John Leguizamo, Denis Leary, Seann..."
225,Ice Age: Collision Course,Mike Thurmeier,"Ray Romano, John Leguizamo, Denis Leary, Adam ..."
874,Ice Age: Dawn Of The Dinosaurs,Carlos Saldanha,"Ray Romano, John Leguizamo, Denis Leary, Simon..."
346,The Wonderful World of Disney Presents The Lit...,Hamish Hamilton,"Auli'i Cravalho, Queen Latifah, Shaggy , John ..."
615,Operation Dumbo Drop,Simon Wincer,"Danny Glover, Ray Liotta, Denis Leary, Doug Do..."
74,Walking with Dinosaurs: The Movie,"Barry Cook, Neil Nightingale","John Leguizamo, Justin Long, Tiya Sircar, Skyl..."
0,Duck the Halls: A Mickey Mouse Christmas Special,"Alonso Ramirez Ramos, Dave Wasson","Chris Diamantopoulos, Tony Anselmo, Tress MacN..."
1,Ernest Saves Christmas,John Cherry,"Jim Varney, Noelle Parker, Douglas Seale"
3,The Queen Family Singalong,Hamish Hamilton,"Darren Criss, Adam Lambert, Derek Hough, Alexa..."
