<a href="https://colab.research.google.com/github/artist-code/homework/blob/main/movie_homework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 데이터 로딩
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast

# Source CSVs: url1 is the credits table, url2 is the movie metadata table.
url1 = 'https://raw.githubusercontent.com/sd12832/MoViZ/master/src/tmdb_5000_credits.csv'
url2 = 'https://raw.githubusercontent.com/sd12832/MoViZ/master/src/tmdb_5000_movies.csv'

df1 = pd.read_csv(url1)
df2 = pd.read_csv(url2)

# Rename the four credits columns positionally so the key column is called
# `id`, like in the movies table.
# NOTE(review): assumes the first credits column is the movie id - confirm.
df1.columns = ['id', 'title', 'cast', 'crew']

# Join on the movie id, not on the title. Titles are not unique, so a title
# join pairs every same-titled movie with every same-titled credits row
# (more rows than movies) and also splits `id` into `id_x` / `id_y`.
# Only `cast` and `crew` are taken from the credits table, so the movies
# table keeps its own single `title` column.
df = df2.merge(df1[['id', 'cast', 'crew']], on='id')
df.shape

(4809, 23)

In [None]:
# C: mean of `vote_average` over all rows of df.
C = df['vote_average'].mean()
# m: 90th percentile of `vote_count` over all rows of df.
m = df['vote_count'].quantile(0.9)


def weighted_rating(x, m=m, C=C):
    """Blend an item's own rating with the global mean, weighted by vote count.

    Computes ``v/(v+m) * R + m/(v+m) * C`` where ``v`` is ``x['vote_count']``
    and ``R`` is ``x['vote_average']``.

    Parameters
    ----------
    x : mapping-like
        Anything indexable by ``'vote_count'`` and ``'vote_average'``
        (for example a DataFrame row).
    m : float, optional
        Vote-count weight given to the global mean. Defaults to the
        module-level ``m`` as it was when this function was defined.
    C : float, optional
        Global mean rating. Defaults to the module-level ``C`` as it was
        when this function was defined.

    Returns
    -------
    The weighted score.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    return (votes / total) * rating + (m / total) * C

# weighted_rating is plain arithmetic on two columns, so it is evaluated on
# the whole frame at once instead of row by row through apply(axis=1).
df['score'] = weighted_rating(df)


def _top_by_score(frame, n=10):
    """Return the `n` highest-scored rows of `frame`, keeping title and score only."""
    return frame.sort_values('score', ascending=False).head(n)[['title', 'score']]


# True where the original language is English; rows with a missing language
# count as non-English, exactly as a direct `!= 'en'` comparison would.
is_english = df['original_language'] == 'en'

# English-language films
top_en = _top_by_score(df[is_english])
print('영어권 추천:')
display(top_en)

# Non-English-language films
top_non_en = _top_by_score(df[~is_english])
print('비영어권 추천:')
display(top_non_en)

영어권 추천:


Unnamed: 0,title,score
1883,The Shawshank Redemption,8.058576
662,Fight Club,7.938689
65,The Dark Knight,7.919564
3235,Pulp Fiction,7.904036
96,Inception,7.862848
3340,The Godfather,7.850454
95,Interstellar,7.809015
809,Forrest Gump,7.802587
329,The Lord of the Rings: The Return of the King,7.726679
1992,The Empire Strikes Back,7.697175


비영어권 추천:


Unnamed: 0,title,score
2296,Spirited Away,7.584321
4308,"The Good, the Bad and the Ugly",7.209556
1261,Amélie,7.189478
1989,Howl's Moving Castle,7.187162
2249,Princess Mononoke,7.185044
3871,City of God,7.088515
3945,Oldboy,7.072145
2467,Pan's Labyrinth,7.031297
3625,Once Upon a Time in the West,6.854902
4541,Seven Samurai,6.772748


In [None]:
# Missing overviews become empty strings so the vectorizer gets text for every row.
df['overview'] = df['overview'].fillna('')

# TF-IDF vectors of the overviews, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])

# Pairwise cosine similarity between all overview vectors (rows follow df's row order).
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Title -> row lookup. De-duplication has to happen on the index (the titles):
# Series.drop_duplicates() compares the values, which here are the row labels,
# so it cannot remove a repeated title. Keep the first row for each title so a
# lookup always yields a single row.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Exact movie title, looked up in the module-level `indices` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix indexed by row position of `df`.
        Defaults to the matrix that existed when this function was defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Up to `top_n` titles from `df['title']`, most similar first.
        The queried row itself is never part of the result.

    Raises
    ------
    KeyError
        If `title` is not a label of `indices`.
    """
    idx = indices[title]
    # A title shared by several rows makes the lookup return a Series of
    # positions; use the first match so `idx` is always a single position.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Drop the queried row by position instead of assuming it sorts first:
    # a row with an identical feature vector can tie with it, and a row whose
    # vector is all zeros has similarity 0 even with itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # Stable sort: equally similar movies keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:top_n]]
    return df['title'].iloc[movie_indices]

# Example: recommendations for one title using the default similarity matrix
get_recommendations('The Dark Knight')

Unnamed: 0,title
3,The Dark Knight Rises
428,Batman Returns
3859,"Batman: The Dark Knight Returns, Part 2"
299,Batman Forever
1360,Batman
1361,Batman
119,Batman Begins
1182,JFK
9,Batman v Superman: Dawn of Justice
2509,Slow Burn


In [None]:
# Fault-tolerant literal_eval
def safe_literal_eval(x):
    """Parse a Python-literal string, falling back to an empty list.

    Parameters
    ----------
    x : object
        Usually a string holding a Python literal. A value that is already
        a list is returned unchanged.

    Returns
    -------
    object
        The evaluated literal; `x` itself when it is already a list; or `[]`
        when `ast.literal_eval` rejects the input.
    """
    if isinstance(x, list):
        # Already parsed (e.g. the cell was run a second time). literal_eval
        # raises ValueError for a list object, and the fallback below would
        # then silently replace real data with [].
        return x
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers literals that parse but cannot be built,
        # such as a dict with an unhashable key.
        return []

# Preprocessing helpers
def clean_data(x):
    """Normalise a list of strings: remove spaces, then lower-case.

    Returns an empty list when `x` is not a list.
    """
    if not isinstance(x, list):
        return []
    return [item.replace(" ", "").lower() for item in x]

def get_director(x):
    """Return the normalised name of the first entry whose job is 'Director'.

    `x` is iterated in order. For the first element with
    ``get('job') == 'Director'`` its ``name`` (empty string if the key is
    absent) is lower-cased and stripped of spaces. Returns ``''`` when no
    element matches.
    """
    director_names = (
        member.get('name', '').lower().replace(" ", "")
        for member in x
        if member.get('job') == 'Director'
    )
    return next(director_names, '')

# Parse the stringified list columns
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    # Values that are already lists are kept as they are, so re-running this
    # cell does not push parsed data back through the parser.
    df[feature] = df[feature].apply(
        lambda value: value if isinstance(value, list) else safe_literal_eval(value)
    )

df['director'] = df['crew'].apply(get_director)


def _extract_names(entries):
    """Return the 'name' of each dict in `entries`; non-list input gives [].

    Elements that are already plain strings are passed through unchanged,
    which keeps the cell re-runnable after the columns have been reduced
    to names.
    """
    if not isinstance(entries, list):
        return []
    return [entry['name'] if isinstance(entry, dict) else entry for entry in entries]


# NOTE(review): every cast member is used, not only the leading few - confirm
# this is intended; long cast lists may dilute the similarity scores.
for feature in ['cast', 'keywords', 'genres']:
    df[feature] = df[feature].apply(_extract_names)
    df[feature] = df[feature].apply(clean_data)
df['director'] = df['director'].fillna('')

# One space-separated token string per movie: keywords + cast + genres + director.
df['soup'] = df['keywords'] + df['cast'] + df['genres'] + df['director'].apply(lambda x: [x])
df['soup'] = df['soup'].apply(lambda x: ' '.join(x))

# Cosine similarity on raw token counts
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# drop=True: a bare reset_index() would insert the old index as an extra
# `index` column (and `level_0` on the next run) without changing row order.
df = df.reset_index(drop=True)

# Title -> row lookup, keeping the first row for any repeated title so a
# lookup always yields a single row.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

# Example
get_recommendations('Inception', cosine_sim2)

Unnamed: 0,title
4407,The Helix... Loaded
1934,Sheena
607,Sky Captain and the World of Tomorrow
1449,Knock Off
1002,Street Fighter: The Legend of Chun-Li
4595,Fabled
1274,Extreme Ops
1717,Timecop
4740,Echo Dr.
495,Journey 2: The Mysterious Island
