In [1]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

def cos_sim(A, B):
  """Return the cosine similarity between vectors A and B.

  Computed as dot(A, B) / (norm(A) * norm(B)).

  Args:
    A: First vector (anything numpy's `dot` and `norm` accept).
    B: Second vector, compatible in length with A.

  Returns:
    The cosine similarity, or 0.0 when either vector has zero norm.
  """
  denominator = norm(A) * norm(B)
  # A zero vector has no direction, so the ratio would be 0/0 (nan plus a
  # RuntimeWarning). Return 0.0 instead, which is also what sklearn's
  # cosine_similarity (used later in this notebook) yields for zero rows.
  if denominator == 0:
    return 0.0
  return dot(A, B) / denominator

# Toy document vectors. doc3 is doc2 multiplied by 2, so both point in the
# same direction and differ only in magnitude.
doc1 = np.array([0, 1, 1, 1])
doc2 = np.array([1, 0, 1, 1])
doc3 = np.array([2, 0, 2, 2])

# Print the cosine similarity of each pair, one value per line.
print(
    *(cos_sim(left, right)
      for left, right in ((doc1, doc2), (doc1, doc3), (doc2, doc3))),
    sep='\n',
)

0.6666666666666667
0.6666666666666667
1.0000000000000002


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie metadata CSV into a DataFrame. low_memory=False asks pandas
# to process the file as a whole rather than in internal chunks, which avoids
# mixed-type inference within a column.
data = pd.read_csv('dataset/movies/movies_metadata.csv', low_memory=False)
# Preview the first two rows.
data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [4]:
# Total number of rows, then how many of them have a missing overview.
print(data.shape[0])
print(data['overview'].isna().sum())

45466
954


In [5]:
# Replace missing overviews with an empty string so the text vectorizer
# receives a string for every row, then confirm that no nulls remain.
data['overview'] = data['overview'].fillna(value='')
print(data['overview'].isna().sum())

0


In [6]:
# Fit a TF-IDF vectorizer on the overviews (English stop words removed) and
# transform every overview into one row of the resulting matrix.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['overview'])
# Shape is (number of overviews, size of the learned vocabulary).
print(tfidf_matrix.shape)

(45466, 75827)


In [8]:
# Pairwise cosine similarity between all overviews. With a single argument,
# cosine_similarity compares the rows of the matrix against themselves.
# NOTE(review): the result is a dense n x n array; at the size printed below
# that is presumably many gigabytes if the dtype is float64 -- confirm the
# kernel has enough memory before re-running.
cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim.shape)

(45466, 45466)


In [12]:
# Map each movie title to its row label in `data`. When a title occurs more
# than once, only the last matching row is kept, because later dictionary
# entries overwrite earlier ones.
title_to_index = {title: row for title, row in zip(data['title'], data.index)}
print(len(title_to_index))
print(list(title_to_index)[:5])
print(list(title_to_index.values())[:5])

# Look up the index of the movie titled "Father of the Bride Part II".
idx = title_to_index['Father of the Bride Part II']
print(idx)

42278
['Toy Story', 'Jumanji', 'Grumpier Old Men', 'Waiting to Exhale', 'Father of the Bride Part II']
[0, 1, 2, 3, 4]
4


In [28]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=20):
    """Return the titles of the movies most similar to `title`.

    Args:
        title: Movie title; must be a key of `title_to_index`.
        cosine_sim: Pairwise similarity matrix in which row i holds the
            similarity of movie i to every movie.
        top_n: Maximum number of recommendations to return. Defaults to 20,
            the value that used to be hard-coded.

    Returns:
        A pandas Series of titles taken from `data['title']`, ordered from
        most to least similar, that never contains the query movie itself.

    Raises:
        KeyError: If `title` is not present in `title_to_index`.
        ValueError: If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative, got {}".format(top_n))

    # Row of the requested movie.
    idx = title_to_index[title]

    # Similarity of the requested movie to every movie.
    scores = np.asarray(cosine_sim[idx])

    # Rank all movies by descending similarity. The stable sort keeps tied
    # scores in ascending index order, the same tie order the previous
    # sorted(..., reverse=True) implementation produced.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query movie by index rather than assuming it is ranked
    # first. That assumption fails when its self-similarity is 0 (an empty
    # overview gives an all-zero vector) or when a lower-indexed movie ties
    # with it, in which case the old `[1:21]` slice returned the query movie
    # as one of its own recommendations.
    ranked = ranked[ranked != idx][:top_n]

    # Titles of the selected movies, most similar first.
    return data['title'].iloc[ranked]

In [29]:
# Ask for the movies most similar to "The Dark Knight Rises" and show both
# the returned titles and the type of the returned object.
recommendations = get_recommendations('The Dark Knight Rises')
print(recommendations)
print(type(recommendations))

12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15511                           Batman: Under the Red Hood
585                                                 Batman
21194    Batman Unmasked: The Psychology of the Dark Kn...
9230                    Batman Beyond: Return of the Joker
18035                                     Batman: Year One
19792              Batman: The Dark Knight Returns, Part 1
3095                          Batman: Mask of the Phantasm
35983                                    Batman: Bad Blood
20232              Batman: The Dark Knight Returns, Part 2
40974    LEGO DC Comics Super Heroes: Batman: Be-Leaguered
10122                                        Batman Begins
41982    Batman Beyond Darwyn Cooke's Batman 75th Anniv...
29536                   Batman Unlimited: Animal Instincts
41976                                The Lego Batman Mov

In [30]:
import numpy as np

def dist(x,y):
    """Return the Euclidean distance between x and y.

    Args:
        x: First point, as a numpy array or any array-like (list, tuple).
        y: Second point, broadcast-compatible with x.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    # Coerce to float arrays so plain lists/tuples are accepted and so that
    # unsigned-integer inputs cannot wrap around during the subtraction or
    # the squaring.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    return np.sqrt(np.sum((x - y) ** 2))

# Three sample document vectors and a query vector to compare them against.
doc1 = np.array([2, 3, 0, 1])
doc2 = np.array([1, 2, 3, 1])
doc3 = np.array([2, 1, 2, 2])
docQ = np.array([1, 1, 0, 1])

# Print the distance from each document to the query, one value per line.
print(*(dist(doc, docQ) for doc in (doc1, doc2, doc3)), sep='\n')

2.23606797749979
3.1622776601683795
2.449489742783178


In [31]:
# Two sample documents that share the words "apple" and "banana".
doc1 = "apple banana everyone like likey watch card holder"
doc2 = "apple banana coupon passport love you"

# Tokenize each document by splitting on whitespace.
tokenized_doc1, tokenized_doc2 = (text.split() for text in (doc1, doc2))

for tokens in (tokenized_doc1, tokenized_doc2):
    print(tokens)
del tokens  # keep the notebook namespace the same as before the loop

['apple', 'banana', 'everyone', 'like', 'likey', 'watch', 'card', 'holder']
['apple', 'banana', 'coupon', 'passport', 'love', 'you']


In [33]:
# Every distinct token that appears in at least one of the two documents.
union = set(tokenized_doc1) | set(tokenized_doc2)
print(union)
print(type(union))

{'card', 'everyone', 'passport', 'banana', 'you', 'watch', 'coupon', 'like', 'holder', 'likey', 'apple', 'love'}
<class 'set'>


In [34]:
# Tokens that appear in both documents.
intersection = set(tokenized_doc1) & set(tokenized_doc2)
print(intersection)
print(type(intersection))

{'apple', 'banana'}
<class 'set'>


In [35]:
# Jaccard similarity: number of shared tokens divided by the number of
# distinct tokens across both documents.
print(len(intersection)/len(union))

0.16666666666666666
