In [1]:
import gensim
gensim.__version__



'3.6.0'

In [2]:
import string
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
import annoy
from gensim.models import Word2Vec, FastText
import pickle
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.metrics import accuracy_score

In [5]:
from preprocess_func import morpher, sw, exclude, preprocess_txt, embed_txt

In [6]:
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

In [7]:
movie_dataset = pd.read_csv("../data/movie_dataset.csv")

In [8]:
movie_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [9]:
movie_dataset.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [10]:
movie_prepared = movie_dataset.drop(labels=['budget', 'homepage', 'id','original_language', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'vote_average', 'vote_count','crew'],
                                    axis=1)

In [11]:
movie_prepared.head(2)

Unnamed: 0,index,genres,keywords,original_title,overview,title,cast,director
0,0,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,Avatar,"In the 22nd century, a paraplegic Marine is di...",Avatar,Sam Worthington Zoe Saldana Sigourney Weaver S...,James Cameron
1,1,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,Johnny Depp Orlando Bloom Keira Knightley Stel...,Gore Verbinski


In [12]:
movie_prepared['text'] = movie_prepared["keywords"] + \
                        " " + movie_prepared["cast"]
movie_prepared['text'] = movie_prepared['text'].apply(lambda x: preprocess_txt(str(x)))
movie_prepared.head(2)

Unnamed: 0,index,genres,keywords,original_title,overview,title,cast,director,text
0,0,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,Avatar,"In the 22nd century, a paraplegic Marine is di...",Avatar,Sam Worthington Zoe Saldana Sigourney Weaver S...,James Cameron,"[culture, clash, future, space, war, space, co..."
1,1,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,Johnny Depp Orlando Bloom Keira Knightley Stel...,Gore Verbinski,"[ocean, drug, abuse, exotic, island, east, ind..."


In [13]:
print(movie_prepared['text'].iloc[2])

['spy', 'based', 'on', 'novel', 'secret', 'agent', 'sequel', 'mi6', 'daniel', 'craig', 'christoph', 'waltz', 'lu00e9a', 'seydoux', 'ralph', 'fiennes', 'monica', 'bellucci']


In [14]:
movie_link = pd.DataFrame({'movie_title': movie_dataset['title'], 'homepage': movie_dataset['homepage']})

In [15]:
movie_link.head(3)

Unnamed: 0,movie_title,homepage
0,Avatar,http://www.avatarmovie.com/
1,Pirates of the Caribbean: At World's End,http://disney.go.com/disneypictures/pirates/
2,Spectre,http://www.sonypictures.com/movies/spectre/


In [16]:
modelFT = FastText(sentences=movie_prepared['text'].values, size=100, min_count=1, window=5) # gensim == 4.3.1 vector_size
modelFT.save("../data/bot_trained/ft_model_imdb")

In [17]:
modelFT = FastText.load("../data/bot_trained/ft_model_imdb")

In [18]:
with open("../data/bot_trained/index_speaker.pkl","rb") as f:
    bin_data = f.read()
    index_loaded = pickle.loads(bin_data)

In [19]:
with open("../data/bot_trained/index_anecd.pkl","rb") as f:
    bin_data = f.read()
    anecd_index_loaded = pickle.loads(bin_data)

In [20]:
vectorizer = CountVectorizer(ngram_range=(1, 2))

In [21]:
len(movie_prepared['text'].values)//2

2401

In [22]:
len(anecd_index_loaded)

10002

In [23]:
idxs = set(np.random.randint(0, len(index_loaded), (len(movie_prepared['text'].values)//2)))
idxs_anecd = set(np.random.randint(0, len(anecd_index_loaded), (len(movie_prepared['text'].values)//2)))
negative_speaker_texts = [" ".join(preprocess_txt(index_loaded[i])) for i in idxs]
negative_anecd_texts = [" ".join(preprocess_txt(anecd_index_loaded[i])) for i in idxs_anecd]
positive_texts = [" ".join(val) for val in movie_prepared['text'].values]

In [24]:
print(len(negative_speaker_texts), len(negative_anecd_texts), len(positive_texts))

2351 2147 4803


In [26]:
dataset = negative_speaker_texts + negative_anecd_texts + positive_texts
labels = np.zeros(len(dataset))
labels[len(negative_speaker_texts) + len(negative_anecd_texts):] = np.ones(len(positive_texts))

In [27]:
print(len(dataset))
print(len(labels))

9301
9301


In [28]:
X_train, X_test, y_train, y_test = train_test_split(dataset, labels, test_size=0.2, stratify=labels,
                                                    random_state=13)

In [29]:
x_train_vec = vectorizer.fit_transform(X_train)
x_test_vec = vectorizer.transform(X_test)


In [30]:
X_train[9]

'биография заря свой юность борис моисеев вставать петух'

In [31]:
x_test_vec.shape

(1861, 158093)

In [32]:
lr_imdb = LogisticRegression(max_iter=100).fit(x_train_vec, y_train)

In [33]:
accuracy_score(y_true=y_test, y_pred=lr_imdb.predict(x_test_vec))

0.9854916711445459

In [34]:
# vectorizing for annoy
tfidf_vect_imdb = TfidfVectorizer().fit(positive_texts)

In [35]:
with open('../data/bot_trained/lr_vect_tfidf_imdb.pkl', 'wb') as fout:
    pickle.dump((vectorizer, tfidf_vect_imdb, lr_imdb), fout)

In [36]:
np.mean(tfidf_vect_imdb.idf_)

8.167952409947063

In [37]:
# idfs_imdb, midf_imdb

In [38]:
idfs_imdb = {v[0]: v[1] for v in zip(tfidf_vect_imdb.vocabulary_, tfidf_vect_imdb.idf_)}


In [39]:
print(list(idfs_imdb.keys())[10:21])
print(list(idfs_imdb.values())[10:21])

['saldana', 'sigourney', 'weaver', 'stephen', 'lang', 'michelle', 'rodriguez', 'ocean', 'drug', 'abuse', 'exotic']
[5.8662862705556496, 8.78405700263993, 8.78405700263993, 8.78405700263993, 8.378591894531764, 8.78405700263993, 8.78405700263993, 5.92185612171046, 7.867766270765774, 8.378591894531764, 8.78405700263993]


In [40]:
# creating indexes for anecdote texts

ft_index_imdb = annoy.AnnoyIndex(100 ,'angular')

midf_imdb = np.mean(tfidf_vect_imdb.idf_)

index_map_imdb = {}
counter = 0

for i in tqdm(range(len(movie_prepared['text'].values))):
    n_ft = 0
    index_map_imdb[counter] = (movie_link.loc[i, "movie_title"], movie_link.loc[i, "homepage"])
    vector_ft = np.zeros(100)
    for word in movie_prepared['text'].values[i]:
        if word in modelFT.wv:
            vector_ft += modelFT.wv[word] * idfs_imdb.get(word, midf_imdb)
            n_ft += idfs_imdb.get(word, midf_imdb)
    if n_ft > 0:
        vector_ft = vector_ft / n_ft
    ft_index_imdb.add_item(counter, vector_ft)
    counter += 1

ft_index_imdb.build(10)
ft_index_imdb.save('../data/bot_trained/imdb.ann')

  0%|          | 0/4803 [00:00<?, ?it/s]

True

In [41]:
with open("../data/bot_trained/index_imdb.pkl", "wb") as f:
    pickle.dump(index_map_imdb, f)

In [42]:
ft_index_imdb.get_nns_by_vector(np.zeros(100), 1, include_distances=True)

([90], [1.4142135381698608])

In [43]:
with open("../data/bot_trained/index_imdb.pkl", "rb") as db:
    index_imdb_loaded = pickle.load(db)

In [44]:
index_imdb_loaded[5]

('Spider-Man 3', 'http://www.sonypictures.com/movies/spider-man3/')