## Word2Vec 알고리즘


In [6]:
#!pip install gensim

In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim

In [15]:
import warnings
warnings.filterwarnings(action='ignore')

In [16]:
# 경로의 경우 각자의 환경에 맞게 설정해주면 됩니다. 
path = './archive/'

In [17]:
movie = pd.read_csv(path + 'ratings.csv', low_memory=False)
movie.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435


In [18]:
movie = movie.sort_values(by='timestamp', ascending=True).reset_index(drop=True)
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,38150,1176,4.0,789652004
1,44717,1079,3.0,789652009
2,44717,47,5.0,789652009
3,44717,21,3.0,789652009
4,190860,21,5.0,822873600


In [19]:
# 영화의 Metadata를 불러와서 movieID에 맞는 TITLE을 구해줍니다. 
meta = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
meta.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [20]:
meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [21]:
meta = meta.rename(columns={'id':'movieId'})
movie['movieId'] = movie['movieId'].astype(str)
meta['movieId'] = meta['movieId'].astype(str)

movie = pd.merge(movie, meta[['movieId', 'original_title']], how='left', on='movieId')

In [22]:
movie = movie[movie['original_title'].notnull()].reset_index(drop=True)

In [23]:
agg = movie.groupby(['userId'])['original_title'].agg({'unique'})
agg.head()

Unnamed: 0_level_0,unique
userId,Unnamed: 1_level_1
1,"[Young and Innocent, Shuang ma lian huan, Cesa..."
2,"[La passion de Jeanne d'Arc, La belle et la bê..."
3,"[I Love You to Death, Once Were Warriors, Mons..."
4,"[Muxmäuschenstill, Batman & Robin, Hidalgo, 12..."
5,"[Star Trek III: The Search for Spock, The Curs..."


In [24]:
movie['original_title'].unique()

array(['The Endless Summer', 'Apocalypse Now', 'Finding Nemo', ...,
       'Lost River', 'Friends & Lovers', 'The Chechahcos'], dtype=object)

Word2vec 적용

In [25]:
# int형식은 Word2vec에서 학습이 안되어서 String으로 변경해줍니다. 
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(map(str, user_sentence)))

In [26]:
# Word2vec의 학습을 진행해줍니다. 
from gensim.models import Word2Vec
embedding_model = Word2Vec(sentence, size=20, window = 5, 
                           min_count=1, workers=4, iter=200, sg=1)

KeyboardInterrupt: 

In [None]:
embedding_model.wv.most_similar(positive=['Spider-Man 2'], topn=10)

## Doc2Vec 적용

![](https://drive.google.com/uc?export=view&id=1g2ausKfoaAT0jMwSatRUG3fiGWfDuysV
)

In [27]:
from gensim.models import doc2vec

In [28]:
meta = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
meta = meta[meta['original_title'].notnull()].reset_index(drop=True)
meta = meta[meta['overview'].notnull()].reset_index(drop=True)

In [29]:
from nltk.corpus import stopwords 
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
import re 
stop_words = set(stopwords.words('english')) 

overview = []
for words in tqdm(meta['overview']):
    word_tokens = word_tokenize(words)
    sentence = re.sub('[^A-Za-z0-9]+', ' ', str(word_tokens))
    sentence = sentence.strip()
    
    sentence_tokens = word_tokenize(sentence)
    result = ''
    for token in sentence_tokens: 
        if token not in stop_words:
            result += ' ' + token 
    result = result.strip().lower()
    overview.append(result)

HBox(children=(FloatProgress(value=0.0, max=44512.0), HTML(value='')))




In [30]:
meta['pre_overview'] = overview

In [32]:
meta['pre_overview'][0]

'led woody andy toys live happily room andy birthday brings buzz lightyear onto scene afraid losing place andy heart woody plots buzz but circumstances separate buzz woody owner duo eventually learns put aside differences'

In [33]:
doc_vectorizer = doc2vec.Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # w2v simultaneous with DBOW d2v / default 0
    window=10,        # distance between the predicted word and context words
    size=100,        # vector size
    alpha=0.025,     # learning-rate
    seed=1234,
    min_count=5,    # ignore with freq lower
    min_alpha=0.025, # min learning-rate
    workers=4,   # multi cpu
    hs = 1,          # hierar chical softmax / default 0
    negative = 10   # negative sampling / default 5
)

In [34]:
from collections import namedtuple

agg = meta[['id', 'original_title', 'pre_overview']]
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument((c), [d]) for d, c in agg[['original_title', 'pre_overview']].values]

In [35]:
doc_vectorizer.build_vocab(tagged_train_docs)
print(str(doc_vectorizer))

Doc2Vec(dbow+w,d100,n10,hs,w10,mc5,s0.001,t4)


In [36]:
# 벡터 문서 학습
from time import time

start = time()

for epoch in tqdm(range(5)):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay

#doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
end = time()
print("During Time: {}".format(end-start))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


During Time: 456.26608324050903


In [37]:
doc_vectorizer.docvecs.most_similar('Toy Story', topn=20)

[('Children in the Surf at Coney Island', 0.7027300596237183),
 ('Kader', 0.6990302205085754),
 ('It Stains the Sands Red', 0.693947434425354),
 ('A Hound for Trouble', 0.6880925893783569),
 ('Letzte Worte', 0.6843924522399902),
 ('Особенности национальной политики', 0.6778054237365723),
 ('Due Amici', 0.6556452512741089),
 ('Once Were Warriors', 0.6540728807449341),
 ('El vendedor de humo', 0.6505322456359863),
 ('По следам бременских музыкантов', 0.6475106477737427),
 ('Live Forever as You Are Now with Alan Resnick', 0.6472793817520142),
 ('Der Sandmann', 0.6463786363601685),
 ('Stryapukha', 0.6445901393890381),
 ('Przechodzien', 0.6437491774559021),
 ('La moutarde me monte au nez', 0.6422765254974365),
 ('Wild Roomies', 0.6408451795578003),
 ("Schindler's List", 0.6339589953422546),
 ('Milk Money', 0.6307691335678101),
 ('Bitter Moon', 0.6303579807281494),
 ('Trois vies et une seule mort', 0.6243611574172974)]

In [38]:
doc_vectorizer.docvecs.most_similar('Harry Potter and the Deathly Hallows: Part 1', topn=20)

[('Just Like Us', 0.6937873363494873),
 ('The Tillman Story', 0.673500120639801),
 ('Never Let Me Go', 0.6648300886154175),
 ('The Great Ecstasy of Robert Carmichael', 0.6527705192565918),
 ('Nickelodeon', 0.6494117975234985),
 ('Bajo las estrellas', 0.6471505165100098),
 ('Fantasma', 0.644270122051239),
 ('$ Dollars', 0.6406192779541016),
 ('The Joneses', 0.6403496861457825),
 ('The Black Rose', 0.6379349827766418),
 ('Classe tous risques', 0.6260645985603333),
 ('Il deserto dei Tartari', 0.6174139380455017),
 ('A londoni férfi', 0.6170987486839294),
 ('Der Räuber', 0.6164187788963318),
 ('Zamilované Maso', 0.6141735315322876),
 ('Bunny and the Bull', 0.6057607531547546),
 ('Day Dreams', 0.604948878288269),
 ('Shadows & Lies', 0.6028825044631958),
 ('Les trois couronnes du matelot', 0.595940887928009),
 ('Skidoo', 0.5928267240524292)]