In [4]:
from google.colab import drive

# Attach Google Drive to the Colab VM; the dataset files are read from and
# written to paths underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [15]:
import pandas as pd
import numpy as np
from ast import literal_eval #parse the stringified features into their corresponding python objects
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import warnings; warnings.simplefilter('ignore')

In [46]:
%%time
# Read the raw tables from the Drive mount. Absolute paths (same root as the
# mount point and as the location the preprocessed file is saved to) keep the
# cell independent of the kernel's current working directory.
DATA_DIR = '/content/drive/MyDrive/the_movies_dataset/'
movies_metadata = pd.read_csv(DATA_DIR + 'movies_metadata.csv')

credits = pd.read_csv(DATA_DIR + 'credits.csv')
keywords = pd.read_csv(DATA_DIR + 'keywords.csv')

# .copy() so that the column assignments below act on an independent frame
# rather than on a slice of movies_metadata.
df = movies_metadata[['id', 'title', 'production_companies', 'genres', 'overview', 'tagline']].copy()

# Discard rows whose id is not a number. This replaces dropping three
# hard-coded row labels, which would silently target the wrong rows (or miss
# new malformed ones) if the CSV were ever reordered or refreshed.
df['id'] = pd.to_numeric(df['id'], errors='coerce')
df = df.dropna(subset=['id'])
df['id'] = df['id'].astype(int)
credits['id'] = credits['id'].astype(int)
keywords['id'] = keywords['id'].astype(int)

# An id repeated in any of the tables would multiply rows in the inner joins
# below and make the same movie appear several times; keep the first
# occurrence of every id.
# NOTE(review): confirm how many duplicate ids the raw files actually contain.
df = df.drop_duplicates(subset='id')
credits = credits.drop_duplicates(subset='id')
keywords = keywords.drop_duplicates(subset='id')

df = df.merge(credits, on='id').merge(keywords, on='id')

# Titles become a single lowercase token ("Toy Story" -> "toystory").
# Missing titles become '' instead of the literal string 'nan'.
df['title'] = df['title'].fillna('').astype(str) \
                         .str.replace(' ', '', regex=False).str.lower()

df['overview'] = df['overview'].fillna('')

df['tagline'] = df['tagline'].fillna('')


def _normalized_names(raw):
    """Turn a stringified list of ``{'name': ...}`` dicts into name tokens.

    raw is parsed with ``literal_eval``; anything that does not evaluate to a
    list yields an empty list. Each name is stripped of spaces and lowercased
    so that a multi-word name forms one token.
    """
    parsed = literal_eval(raw)
    if not isinstance(parsed, list):
        return []
    return [item['name'].replace(' ', '').lower() for item in parsed]


# Missing cells are treated as an empty list before parsing.
df['genres'] = df['genres'].fillna('[]').apply(_normalized_names)

df['production_companies'] = df['production_companies'].fillna('[]').apply(_normalized_names)

def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    x is an iterable of dicts that are indexed by the 'job' and 'name' keys.
    When no entry has the job 'Director' (including an empty x), ``np.nan``
    is returned.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

# Director name as a single lowercase token. get_director yields NaN when a
# movie has no director; that is mapped to '' here, because casting NaN to str
# would give every director-less movie the same literal token 'nan'.
df['crew'] = df['crew'].apply(literal_eval).apply(get_director) \
                       .fillna('') \
                       .apply(lambda name: name.replace(" ", "").lower())

# Keyword names are kept in their raw form (with spaces) at this point.
df['keywords'] = df['keywords'].apply(literal_eval) \
                               .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# Up to the first five cast names, each squashed into one lowercase token.
# A plain slice already copes with lists shorter than five.
df['cast'] = df['cast'].apply(literal_eval) \
                       .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else []) \
                       .apply(lambda x: x[:5]) \
                       .apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

# Frequency table of keywords over all movies (index: keyword, value: count).
# explode() flattens the per-movie lists in one vectorised pass; building a
# pd.Series for every row and stacking the result gives the same counts but is
# far slower. Movies with no keywords explode to NaN, which value_counts()
# ignores by default.
s = df['keywords'].explode().rename('keyword').value_counts()

# Keep only keywords that occur more than once.
s = s[s > 1]

def filter_keywords(x):
    """Return the keywords of x that occur in the global frequency table s.

    ``in`` on a pandas Series tests the index labels, so a keyword is kept
    when it is one of the labels of s. Order and repetitions of the kept
    keywords are preserved.
    """
    return [keyword for keyword in x if keyword in s]

# Filter first (the frequency table is keyed by the raw keyword text), then
# squash every surviving keyword into one lowercase token.
frequent_keywords = df['keywords'].apply(filter_keywords)
df['keywords'] = frequent_keywords.apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

# Preview the first five movies, one column per movie.
df.head().transpose()

CPU times: user 1min 17s, sys: 1.54 s, total: 1min 19s
Wall time: 1min 22s


Unnamed: 0,0,1,2,3,4
id,862,8844,15602,31357,11862
title,toystory,jumanji,grumpieroldmen,waitingtoexhale,fatherofthebridepartii
production_companies,[pixaranimationstudios],"[tristarpictures, teitlerfilm, interscopecommu...","[warnerbros., lancastergate]",[twentiethcenturyfoxfilmcorporation],"[sandollarproductions, touchstonepictures]"
genres,"[animation, comedy, family]","[adventure, fantasy, family]","[romance, comedy]","[comedy, drama, romance]",[comedy]
overview,"Led by Woody, Andy's toys live happily in his ...",When siblings Judy and Peter discover an encha...,A family wedding reignites the ancient feud be...,"Cheated on, mistreated and stepped on, the wom...",Just when George Banks has recovered from his ...
tagline,,Roll the dice and unleash the excitement!,Still Yelling. Still Fighting. Still Ready for...,Friends are the people who let you be yourself...,Just When His World Is Back To Normal... He's ...
cast,"[tomhanks, timallen, donrickles, jimvarney, wa...","[robinwilliams, jonathanhyde, kirstendunst, br...","[waltermatthau, jacklemmon, ann-margret, sophi...","[whitneyhouston, angelabassett, lorettadevine,...","[stevemartin, dianekeaton, martinshort, kimber..."
crew,johnlasseter,joejohnston,howarddeutch,forestwhitaker,charlesshyer
keywords,"[jealousy, toy, boy, friendship, friends, riva...","[boardgame, disappearance, basedonchildren'sbo...","[fishing, bestfriend, duringcreditsstinger, ol...","[basedonnovel, interracialrelationship, single...","[baby, midlifecrisis, confidence, aging, daugh..."


In [47]:
%%time
# Fetch the NLTK resources that the tokenizer, POS tagger, lemmatizer and
# stop-word list below rely on.
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))

# POS tags that mark a token as a verb.
VERB_CODES = {
    'VB',   # Verb, base form
    'VBD',  # Verb, past tense
    'VBG',  # Verb, gerund or present participle
    'VBN',  # Verb, past participle
    'VBP',  # Verb, non-3rd person singular present
    'VBZ',  # Verb, 3rd person singular present
}

def preprocess_sentences(text):
    """Normalise free text into a space-separated string of lemmas.

    The text is lowercased, tokenised with ``nltk.word_tokenize`` and tagged
    with ``nltk.pos_tag``. Tokens whose tag is in ``VERB_CODES`` are lemmatised
    as verbs; all other tokens use the lemmatiser's default part of speech.
    A lemma is kept only if it is not in ``stop_words`` and consists solely of
    alphabetic characters.

    Parameters
    ----------
    text : str
        Raw text; may be empty.

    Returns
    -------
    str
        The kept lemmas joined by single spaces ('' when nothing is kept).
    """
    # Blank input yields no tokens; skip the tokenizer and tagger entirely.
    if not text.strip():
        return ''

    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text.lower()))

    kept_lemmas = []
    for token, tag in tagged_tokens:
        if tag in VERB_CODES:
            lemma = lemmatizer.lemmatize(token, 'v')
        else:
            lemma = lemmatizer.lemmatize(token)
        if lemma not in stop_words and lemma.isalpha():
            kept_lemmas.append(lemma)

    # No contraction expansion ("n't" -> " not", "'s" -> " is", ...) is done
    # on the result: the isalpha() filter above has already removed every
    # token containing an apostrophe, so such replacements can never match.
    return ' '.join(kept_lemmas)

# Lemmatised, stop-word-free versions of both free-text columns.
for text_column in ('overview', 'tagline'):
    df['preprocessed_' + text_column] = df[text_column].apply(preprocess_sentences)

df[['preprocessed_overview', 'preprocessed_tagline']].head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


CPU times: user 3min 35s, sys: 2.16 s, total: 3min 37s
Wall time: 3min 46s


Unnamed: 0,preprocessed_overview,preprocessed_tagline
0,lead woody andy toy live happily room andy bir...,
1,sibling judy peter discover enchant board game...,roll dice unleash excitement
2,family wedding reignite ancient feud neighbor ...,still yell still fight still ready love
3,cheat mistreat step woman hold breath wait elu...,friend people let never let forget
4,george bank recover daughter wedding receive n...,world back normal surprise life


In [48]:
# First five movies shown with one column per movie, so all features are visible.
df.head().T

Unnamed: 0,0,1,2,3,4
id,862,8844,15602,31357,11862
title,toystory,jumanji,grumpieroldmen,waitingtoexhale,fatherofthebridepartii
production_companies,[pixaranimationstudios],"[tristarpictures, teitlerfilm, interscopecommu...","[warnerbros., lancastergate]",[twentiethcenturyfoxfilmcorporation],"[sandollarproductions, touchstonepictures]"
genres,"[animation, comedy, family]","[adventure, fantasy, family]","[romance, comedy]","[comedy, drama, romance]",[comedy]
overview,"Led by Woody, Andy's toys live happily in his ...",When siblings Judy and Peter discover an encha...,A family wedding reignites the ancient feud be...,"Cheated on, mistreated and stepped on, the wom...",Just when George Banks has recovered from his ...
tagline,,Roll the dice and unleash the excitement!,Still Yelling. Still Fighting. Still Ready for...,Friends are the people who let you be yourself...,Just When His World Is Back To Normal... He's ...
cast,"[tomhanks, timallen, donrickles, jimvarney, wa...","[robinwilliams, jonathanhyde, kirstendunst, br...","[waltermatthau, jacklemmon, ann-margret, sophi...","[whitneyhouston, angelabassett, lorettadevine,...","[stevemartin, dianekeaton, martinshort, kimber..."
crew,johnlasseter,joejohnston,howarddeutch,forestwhitaker,charlesshyer
keywords,"[jealousy, toy, boy, friendship, friends, riva...","[boardgame, disappearance, basedonchildren'sbo...","[fishing, bestfriend, duringcreditsstinger, ol...","[basedonnovel, interracialrelationship, single...","[baby, midlifecrisis, confidence, aging, daugh..."
preprocessed_overview,lead woody andy toy live happily room andy bir...,sibling judy peter discover enchant board game...,family wedding reignite ancient feud neighbor ...,cheat mistreat step woman hold breath wait elu...,george bank recover daughter wedding receive n...


In [51]:
# Feature columns that make up the "soup", in the order they are concatenated.
idxs = [
    'keywords',
    'cast',
    'genres',
    'production_companies',
    'preprocessed_tagline',
    'preprocessed_overview',
    'crew',
    'title',
]


def _flatten_to_text(value):
    """Join the items of a list with single spaces; stringify any other value."""
    if isinstance(value, list):
        return ' '.join(str(item) for item in value)
    return str(value)


for idx in idxs:
    df[idx] = df[idx].apply(_flatten_to_text)

def concat_columns(row):
    """Concatenate the string values of row into one text separated by single spaces."""
    separator = ' '
    return separator.join(row)

# One text per movie containing all of its feature tokens.
soup_per_movie = df[idxs].apply(concat_columns, axis=1)
df['soup'] = soup_per_movie

# Inspect the soup of the row labelled 0.
df.loc[0, 'soup']

'jealousy toy boy friendship friends rivalry boynextdoor newtoy toycomestolife tomhanks timallen donrickles jimvarney wallaceshawn animation comedy family pixaranimationstudios  lead woody andy toy live happily room andy birthday brings buzz lightyear onto scene afraid lose place andy heart woody plot buzz circumstance separate buzz woody owner duo eventually learn put aside difference johnlasseter toystory'

Так как во время использования системы информация о прошлых оценках пользователя будет недоступна (недостаток информации, или cold start), было решено построить рекомендательную систему на основе сходства фильмов и их описаний между собой, т. е. применить content-based подход.

In [52]:
# Persist the preprocessed table (without the index column) next to the raw dataset.
preprocessed_csv_path = '/content/drive/MyDrive/the_movies_dataset/preprocessed_movies_metadata.csv'
df.to_csv(preprocessed_csv_path, index=False)

**TODO:**

- дать пользователю возможность оценить 10 фильмов перед получением рекомендаций, чтобы учесть персональный аспект;
- сделать возможным выбор сразу нескольких фильмов для рекомендаций;
- не рекомендовать другие фильмы из той же серии.