In [1]:
#https://arxiv.org/abs/1405.4053
#https://radimrehurek.com/gensim/models/doc2vec.html
#https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html
#https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the train and test splits from parquet files using the pyarrow engine.
df_train = pd.read_parquet('./data/movie-genre-prediction/train.parquet', engine='pyarrow')
df_test = pd.read_parquet('./data/movie-genre-prediction/test.parquet', engine='pyarrow')

In [4]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,Entity Project,A director and her friends renting a haunted h...,horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action


In [5]:
# (rows, columns) of the training frame.
df_train.shape

(54000, 4)

In [6]:
# Distinct genre labels found in the training frame, as a plain Python list.
genres_list = df_train["genre"].unique().tolist()
genres_list

['fantasy',
 'horror',
 'family',
 'scifi',
 'action',
 'crime',
 'adventure',
 'mystery',
 'romance',
 'thriller']

In [7]:
# (rows, columns) of the test frame.
df_test.shape

(36000, 4)

In [8]:
# Model input text: lower-cased title and lower-cased synopsis joined by ". ".
df_train["text"] = (
    df_train["movie_name"].str.lower()
    + ". "
    + df_train["synopsis"].str.lower()
)
df_train.head()

Unnamed: 0,id,movie_name,synopsis,genre,text
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy,super me. a young scriptwriter starts bringing...
1,50185,Entity Project,A director and her friends renting a haunted h...,horror,entity project. a director and her friends ren...
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family,behavioral family therapy for serious psychiat...
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi,blood glacier. scientists working in the austr...
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action,apat na anino. buy day - four men widely - apa...


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation, stratified on genre so both parts
# keep the same label proportions; random_state pins the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    df_train["text"],
    df_train["genre"],
    test_size=0.30,
    stratify=df_train["genre"],
    random_state=42,
)

In [10]:
import nltk
# Download NLTK's 'punkt' package (presumably the tokenizer data needed by
# word_tokenize, which is used in the following cells).
# NOTE(review): newer NLTK releases may require 'punkt_tab' instead - verify
# against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hernanamiune/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# Tokenize the first row's text to see what the tokenizer produces.
print(word_tokenize(df_train["text"].iloc[0]))

['super', 'me', '.', 'a', 'young', 'scriptwriter', 'starts', 'bringing', 'valuable', 'objects', 'back', 'from', 'his', 'short', 'nightmares', 'of', 'being', 'chased', 'by', 'a', 'demon', '.', 'selling', 'them', 'makes', 'him', 'rich', '.']


In [56]:
# Wrap every training text as a TaggedDocument whose single tag is the text's
# position (0, 1, 2, ...) within X_train.
X_train_tokens = [
    TaggedDocument(words=word_tokenize(text), tags=[position])
    for position, text in enumerate(X_train)
]

In [57]:
# Inspect the first tagged document (token list plus its integer tag).
X_train_tokens[0]

TaggedDocument(words=['iron', 'man', '.', 'after', 'being', 'held', 'captive', 'in', 'an', 'afghan', 'cave', ',', 'billionaire', 'engineer', 'tony', 'stark', 'creates', 'a', 'unique', 'weaponized', 'suit', 'of', 'armor', 'to', 'fight', 'evil', '.'], tags=[0])

In [58]:
# Doc2Vec configuration: 2000-dimensional document vectors, a context window
# of 20 tokens, 40 training epochs, min_count=2.
# A fixed seed is passed so the random initialisation matches across runs, in
# line with the seeded train/validation split. Per the gensim documentation,
# fully deterministic training additionally needs workers=1 and a fixed
# PYTHONHASHSEED; those are left unchanged here.
model = Doc2Vec(vector_size=2000, min_count=2, epochs=40, window=20, seed=42)

In [59]:
# Build the model vocabulary from the tagged training documents.
model.build_vocab(X_train_tokens)

In [60]:
from gensim.models.callbacks import CallbackAny2Vec
class callback(CallbackAny2Vec):
    """Training callback that prints a progress line as epochs finish.

    Parameters
    ----------
    every : int, default 1
        Print a line only when the (1-based) epoch number is a multiple of
        this value. The default reports every epoch.

    Raises
    ------
    ValueError
        If ``every`` is smaller than 1.
    """
    def __init__(self, every=1):
        if every < 1:
            raise ValueError("every must be a positive integer")
        self.every = every
        # 1-based number of the epoch that will finish next.
        self.epoch = 1
    def on_epoch_end(self, model):
        """Report the epoch that just ended (if due) and advance the counter."""
        if self.epoch % self.every == 0:
            print(f"Epoch {self.epoch} finished")
        self.epoch += 1

In [61]:
# Train on the tagged corpus for model.epochs passes; the callback instance
# prints a line as each epoch ends.
model.train(X_train_tokens,
            total_examples=model.corpus_count, 
            epochs=model.epochs,
            callbacks=[callback()])
print("Train finished")

Epoch 1 finished
Epoch 2 finished
Epoch 3 finished
Epoch 4 finished
Epoch 5 finished
Epoch 6 finished
Epoch 7 finished
Epoch 8 finished
Epoch 9 finished
Epoch 10 finished
Epoch 11 finished
Epoch 12 finished
Epoch 13 finished
Epoch 14 finished
Epoch 15 finished
Epoch 16 finished
Epoch 17 finished
Epoch 18 finished
Epoch 19 finished
Epoch 20 finished
Epoch 21 finished
Epoch 22 finished
Epoch 23 finished
Epoch 24 finished
Epoch 25 finished
Epoch 26 finished
Epoch 27 finished
Epoch 28 finished
Epoch 29 finished
Epoch 30 finished
Epoch 31 finished
Epoch 32 finished
Epoch 33 finished
Epoch 34 finished
Epoch 35 finished
Epoch 36 finished
Epoch 37 finished
Epoch 38 finished
Epoch 39 finished
Epoch 40 finished
Train finished


In [62]:
# Number of document vectors held by the trained model.
len(model.dv)

37800

In [63]:
# Persist the trained model to disk (the file name appears to encode
# vector_size=2000 and window=20), then print a confirmation message.
model.save("d2v_vs2000_w20.model")
print("Modelo Guardado")

Modelo Guardado


In [64]:
# First 10 components of the trained vector for the document tagged 0.
model.dv[0][0:10]

array([ 0.07417507,  0.02801843,  0.15312947,  0.07937381,  0.03501932,
        0.02198828, -0.13884205,  0.0979145 ,  0.08372234, -0.12045726],
      dtype=float32)

In [65]:
# Tokenize the training texts again, this time as bare token lists
# (no TaggedDocument wrapper), ready for infer_vector.
X_train_tokens2 = X_train.apply(word_tokenize)

In [66]:
# Infer one vector per training document with the trained model. These are
# freshly inferred vectors, not the ones stored in model.dv during training.
X_train_vect = X_train_tokens2.apply(model.infer_vector)

In [67]:
# Stack the per-document 1-D vectors into a single 2-D matrix
# (one row per document). Iterating with list() works whether X_train_vect is
# still the Series of vectors or already the stacked matrix from a re-run.
X_train_vect = np.stack(list(X_train_vect))

In [68]:
# Sanity check: (documents, vector dimensions).
X_train_vect.shape

(37800, 2000)

In [69]:
# Disabled code kept as an inert string literal: it would rank each training
# document against its own inferred vector. It references `train_corpus`,
# which is not defined anywhere in this notebook, so it cannot run as written.
"""
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])
"""

'\nranks = []\nsecond_ranks = []\nfor doc_id in range(len(train_corpus)):\n    inferred_vector = model.infer_vector(train_corpus[doc_id].words)\n    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))\n    rank = [docid for docid, sim in sims].index(doc_id)\n    ranks.append(rank)\n\n    second_ranks.append(sims[1])\n'

In [70]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression genre classifier on the inferred document vectors,
# then predict the first two training rows as a smoke test.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vect, y_train)
clf.predict(X_train_vect[:2, :])

array(['scifi', 'crime'], dtype=object)

In [71]:
from sklearn.metrics import accuracy_score
# Accuracy of the classifier on the data it was fitted on.
y_train_pred = clf.predict(X_train_vect)
print("Train accuracy: ", accuracy_score(y_train, y_train_pred))

Train accuracy:  0.3315873015873016


In [78]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training vectors only, then standardize them.
# NOTE(review): X_train_vect is rebound to its scaled version, so this cell is
# not idempotent - running it a second time fits the scaler on data that is
# already standardized.
scaler = StandardScaler().fit(X_train_vect)
X_train_vect = scaler.transform(X_train_vect)

In [80]:
# Refit the classifier on X_train_vect (standardized by the preceding cell)
# with a higher iteration cap, and report accuracy on the same training data.
# This rebinds `clf`, replacing the earlier model.
clf = LogisticRegression(max_iter=5000).fit(X_train_vect, y_train)
y_train_pred = clf.predict(X_train_vect)
print("Train accuracy: ", accuracy_score(y_train, y_train_pred))

Train accuracy:  0.358994708994709


In [72]:
# Tokenize the validation texts into bare token lists.
X_val_tokens = X_val.apply(word_tokenize)

In [73]:
# Infer one vector per validation document with the trained model.
X_val_vect = X_val_tokens.apply(model.infer_vector)

In [74]:
# Stack the per-document 1-D vectors into a single 2-D matrix
# (one row per document). Iterating with list() works whether X_val_vect is
# still the Series of vectors or already the stacked matrix from a re-run.
X_val_vect = np.stack(list(X_val_vect))

In [75]:
# Sanity check: (documents, vector dimensions).
X_val_vect.shape

(16200, 2000)

In [76]:
# Validation accuracy on the raw (unscaled) validation vectors.
# NOTE(review): read top to bottom, `clf` at this point is the model that was
# refit on standardized features, while X_val_vect is still unscaled. The
# execution counts (In [76] here vs In [78]/In [80] for the scaling cells)
# show this cell was originally run before scaling, so the recorded score will
# not reproduce on Restart & Run All.
y_val_pred = clf.predict(X_val_vect)
print("Validation accuracy: ", accuracy_score(y_val, y_val_pred))

Validation accuracy:  0.3052469135802469


In [81]:
# Standardize the validation vectors with the scaler fitted on the training
# vectors, then score the classifier on them. The scaled matrix gets its own
# name so that re-running this cell does not scale X_val_vect a second time.
X_val_scaled = scaler.transform(X_val_vect)
y_val_pred = clf.predict(X_val_scaled)
print("Validation accuracy: ", accuracy_score(y_val, y_val_pred))

Validation accuracy:  0.30493827160493825
