# Movies Dataset Engineering

This notebook loads CSV files from The Movies Dataset on Kaggle and creates a number of pickle objects that are used to build the natural-language movie recommender app. These pickle objects allow the recommender app to minimize its use of the pandas library and operate quickly using NumPy. This notebook uses movies_metadata.csv, credits.csv, and keywords.csv. The pickle objects produced include:
1. actor_list.pkl: List of actors which appear in at least 10 movies. Used to search user input for actor names.
2. genre_list.pkl: List of movie genres. Used to search user input for mentions of a particular genre.
3. movie_list.pkl: List of movie titles used to search user input. Release year has been removed from movie titles. Movie titles which are also stopwords or genres have been removed.
4. movie_array.pkl: Numpy array of movie properties used to generate recommendations. First 20 columns represent topic distribution of the movie description and keywords. All other columns are a sparse representation of actors and genres.
5. norms.pkl: Pre-computed norms of topic distribution from each movie
6. col_names.pkl: Names of columns in movie_array
7. movie_names.pkl: Names of rows in movie_array
8. movie_metadata.pkl: Pandas dataframe containing metadata information for every movie in movie_array

In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
import pickle
from collections import Counter
import re
import string
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

from gensim.parsing.preprocessing import STOPWORDS

In [2]:
# Load the movie metadata CSV into a dataframe.
# NOTE(review): the truncated output under this cell looks like the tail of a pandas warning — confirm which columns it concerns.
movies = pd.read_csv('the-movies-dataset/movies_metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [4]:
# Record the Python type of every title in a helper column; the next cell
# filters on it (rows whose title is a float are dropped there).
movies['title_type'] = movies['title'].apply(type)

In [5]:
# Drop rows that have no title. Missing titles are read in as float NaN, so
# testing for missing values directly keeps the same rows as the old
# "type is not float" check without depending on the 'title_type' helper column.
movies = movies[movies['title'].notna()]

In [6]:
# Load the credits CSV (columns shown below: cast, crew, id).
credits = pd.read_csv('the-movies-dataset/credits.csv')

In [7]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [8]:
# Load the keywords CSV (columns shown below: id, keywords).
keywords = pd.read_csv('the-movies-dataset/keywords.csv')

In [9]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


### Create lists of actors, movies, and genres
Pickle these lists so they can be used for named-entity recognition (NER) on user input.

In [10]:
# Create a list of all actors that appear in at least 10 movies

def update_actor_counts(credits):
    """Add the cast of one movie to the global ``actor_counts`` tally.

    ``credits`` is the string form of a list of cast dicts (one cell of the
    ``cast`` column). The ``'name'`` of every entry is counted, once per entry.
    Returns None; the result lives in ``actor_counts``.
    """
    cast_members = literal_eval(credits)
    actor_counts.update([member['name'] for member in cast_members])

# Start from an empty tally so re-running the cell does not double count.
actor_counts = Counter()

# Plain loop instead of Series.apply: the function is called only for its
# side effect on actor_counts, so there is no result worth keeping.
for cast_json in credits['cast']:
    update_actor_counts(cast_json)

# Keep actors with at least 10 cast entries, in first-seen order.
pop_actors = [actor for actor, n_credits in actor_counts.items() if n_credits >= 10]

# The context manager closes the file handle once the pickle is written.
with open('pickles/actor_list.pkl', 'wb') as fh:
    pickle.dump(pop_actors, fh)

In [11]:
len(pop_actors)

10531

In [12]:
# Keep only released movies with more than 25 votes and an average rating above 4.
# The three filters are applied one after another, then the index is rebuilt.
movies = (
    movies
    .loc[lambda df: df['vote_count'] > 25]
    .loc[lambda df: df['vote_average'] > 4]
    .loc[lambda df: df['status'] == 'Released']
    .reset_index(drop=True)
)

# The filtered dataframe is much smaller than the raw one.
movies.shape

(13270, 25)

In [None]:
# NOTE(review): bare expression that is not the last line of the cell, so its value is not displayed.
movies.loc[0, 'genres']

def list_genres(genres):
    """Return the genre names of one movie as a list of strings.

    ``genres`` is normally the string form of a list of dicts, each with a
    ``'name'`` key (one cell of the raw ``genres`` column). A value that is
    already a list is returned as a copy, so running the conversion a second
    time (e.g. re-executing the cell) leaves the data unchanged instead of
    raising inside ``literal_eval``.
    """
    if isinstance(genres, list):
        return list(genres)
    return [genre['name'] for genre in literal_eval(genres)]

# Overwrite the 'genres' column with the output of list_genres for every row.
movies['genres'] = movies['genres'].apply(list_genres)

In [79]:
# Create a list of all movie genres

genre_list = []

def get_genres(movie):
    """Append the lowercased genre names of one movie to the global ``genre_list``.

    ``movie`` is an iterable of genre-name strings, i.e. an entry of
    ``movies['genres']`` after it has been converted to lists.
    """
    genre_list.extend(genre.lower() for genre in movie)

# Plain loop: get_genres is called only for its side effect on genre_list.
for movie_genres in movies['genres']:
    get_genres(movie_genres)

# De-duplicate, and sort so the list has the same order on every run
# (iteration order of a set of strings can change between kernel sessions).
genre_list = sorted(set(genre_list))



In [81]:
# Write the genre list; the context manager closes the file handle afterwards.
with open('pickles/genre_list.pkl', 'wb') as fh:
    pickle.dump(genre_list, fh)

In [14]:
# Create a list of all movies
#
# A title is left out when it is exactly one character long, or when its
# lowercased form is a stopword or a genre name. Such titles would match
# ordinary words in user input.
#
# Two fixes compared with the previous version of this cell:
#   * titles are filtered in a single pass; calling pop() while enumerating
#     the same list skipped the element that followed each removed one;
#   * the genre check compares lowercased titles, because genre_list holds
#     lowercased names and would not match a title such as 'Horror'.
genre_names = set(genre_list)

movie_list = sorted({
    title
    for title in movies['title']
    if len(title) != 1
    and title.lower() not in STOPWORDS
    and title.lower() not in genre_names
})

# The context manager closes the file handle once the pickle is written.
with open('pickles/movie_list.pkl', 'wb') as fh:
    pickle.dump(movie_list, fh)

### Eliminate unnecessary columns of movie dataframe

In [15]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'title_type'],
      dtype='object')

In [16]:
# Columns needed downstream; everything else is dropped.
keep_cols = ['genres', 'id', 'overview', 'release_date', 'runtime', 'tagline', 'title', 'vote_average', 'vote_count']

# .copy() makes the selection an independent dataframe, so the column
# assignment below is not a write to a slice of the old frame
# (which pandas flags with SettingWithCopyWarning).
movies = movies[keep_cols].copy()

# Convert 'id' to a numeric dtype with pd.to_numeric.
# NOTE(review): presumably needed so the later merges on 'id' line up with the ids in credits/keywords — confirm.
movies['id'] = pd.to_numeric(movies['id'])
movies.head()

Unnamed: 0,genres,id,overview,release_date,runtime,tagline,title,vote_average,vote_count
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,81.0,,Toy Story,7.7,5415.0
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,When siblings Judy and Peter discover an encha...,1995-12-15,104.0,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,A family wedding reignites the ancient feud be...,1995-12-22,101.0,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,127.0,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Just when George Banks has recovered from his ...,1995-02-10,106.0,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0


### Merge keywords and actors into movie dataframe

In [17]:
literal_eval(keywords.loc[0,'keywords'])

[{'id': 931, 'name': 'jealousy'},
 {'id': 4290, 'name': 'toy'},
 {'id': 5202, 'name': 'boy'},
 {'id': 6054, 'name': 'friendship'},
 {'id': 9713, 'name': 'friends'},
 {'id': 9823, 'name': 'rivalry'},
 {'id': 165503, 'name': 'boy next door'},
 {'id': 170722, 'name': 'new toy'},
 {'id': 187065, 'name': 'toy comes to life'}]

In [18]:
# Merge keywords with movie dataframe

def extract_keywords(row):
    """Turn one cell of the ``keywords`` column into a space-separated string.

    ``row`` is the string form of a list of dicts that each carry a ``'name'``
    key. The names are joined with single spaces and surrounding whitespace
    is stripped; an empty list gives ``''``.
    """
    names = [entry['name'] for entry in literal_eval(row)]
    return ' '.join(names).strip()

        
# Store the flattened keyword text in a new 'string' column (the raw 'keywords' column is kept).
keywords['string'] = keywords['keywords'].apply(extract_keywords)

In [19]:
# Left-join the keyword text onto the movies by 'id'; movies without keywords are kept.
movies = movies.merge(keywords[['string', 'id']], how='left', on='id')

In [20]:
# Looks like merging created some duplicate or garbage rows, need to remove these
movies.shape

(13443, 10)

In [21]:
# Here we can see which rows were duplicated
movies['title'].value_counts().head()

Confessions of a Dangerous Mind                4
Pokémon 4Ever: Celebi - Voice of the Forest    4
Le Samouraï                                    4
Beauty and the Beast                           4
Wuthering Heights                              4
Name: title, dtype: int64

In [22]:
# Keep the first row for every title, then rebuild a gap-free 0..n-1 index.
movies = movies.drop_duplicates(subset='title').reset_index(drop=True)

In [23]:
# Dataframe will now not contain any duplicate titles
movies.shape

(12757, 10)

In [24]:
# The merged keyword text arrived as 'string'; call it 'keywords' from here on.
movies = movies.rename(columns={'string': 'keywords'})

### Merge actors into movies dataframe

In [25]:
def cast_list(cast):
    """Return the distinct cast member names of one movie.

    ``cast`` is the string form of a list of dicts that each carry a
    ``'name'`` key. Duplicate names are collapsed; the order of the returned
    list is whatever the intermediate set yields and should not be relied on.
    """
    unique_names = {member['name'] for member in literal_eval(cast)}
    return list(unique_names)

# Parse each movie's cast into a list of distinct actor names.
credits['cast_list'] = credits['cast'].apply(cast_list)

# Left-join the cast lists onto the movies by 'id'.
movies = movies.merge(credits[['id', 'cast_list']], how='left', on='id')

movies.shape

(12772, 11)

In [26]:
# Keep the first row for every title again, then rebuild a gap-free 0..n-1 index.
movies = movies.drop_duplicates(subset='title').reset_index(drop=True)

### Merge movie overview, tagline, and keywords into one string

In [27]:
# One combined text per movie: overview, tagline and keywords, in that order.
# Fields whose value is not a str (e.g. missing values) are left out.
text_fields = ('overview', 'tagline', 'keywords')

full_overviews = []
for _, record in movies.iterrows():
    pieces = [record[field] for field in text_fields if type(record[field]) == str]
    full_overviews.append(' '.join(pieces).strip())
        

In [28]:
# Attach the combined texts as a new column; full_overviews was built in row order from this same dataframe.
movies['full_overview'] = full_overviews

### Clean text of full overviews

In [29]:
def clean_text(text):
    """Normalise one free-text string before stopword removal and stemming.

    Steps, in order:
      1. delete link-like runs (optional http/https/ftp scheme, then
         characters with a dot in the middle);
      2. delete every token that contains a digit;
      3. lowercase and delete ASCII punctuation;
      4. turn newlines into spaces.

    Deleted text is not replaced by a space, so the result can contain runs
    of spaces. Returns the cleaned string.

    The patterns are raw strings: as plain literals, ``\\w``, ``\\d``, ``\\/``,
    ``\\-`` and ``\\.`` are invalid string escapes and trigger a warning. The
    compiled patterns are unchanged.
    """
    # Remove links. The scheme is optional, so any dotted run such as
    # "example.com" is removed too.
    text = re.sub(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+', '', text)
    # Remove tokens that contain at least one digit
    text = re.sub(r'\w*\d\w*', '', text)
    # Remove punctuation and lowercase
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text.lower())
    # Replace newline characters with spaces
    text = text.replace('\n', ' ')

    return text

In [30]:
# Clean every combined text in place (the column is overwritten with the cleaned version).
movies['full_overview'] = movies['full_overview'].apply(clean_text)

In [31]:
# Remove stopwords before stemming
def remove_stopwords(text):
    """Drop stopwords and very short tokens from a space-separated string.

    The text is split on single spaces; a token is kept only when it is
    longer than two characters and is not in ``STOPWORDS``. The kept tokens
    are joined back with single spaces.
    """
    kept_words = [
        token
        for token in text.split(' ')
        if len(token) > 2 and token not in STOPWORDS
    ]
    return ' '.join(kept_words)

In [32]:
# Strip stopwords and short tokens from every cleaned text (column overwritten).
movies['full_overview'] = movies['full_overview'].apply(remove_stopwords)

In [33]:
movies.loc[0, 'full_overview']

'led woody andys toys live happily room andys birthday brings buzz lightyear scene afraid losing place andys heart woody plots buzz circumstances separate buzz woody owner duo eventually learns aside differences jealousy toy boy friendship friends rivalry boy door new toy toy comes life'

In [34]:
# One shared stemmer instance, reused for every document.
stemmer = PorterStemmer()

def stem_text(text):
    """Stem every token of a space-separated string with the shared Porter stemmer.

    The text is split on single spaces and the stemmed tokens are joined back
    with single spaces.
    """
    return ' '.join(stemmer.stem(token) for token in text.split(' '))

In [35]:
# Stem every text (column overwritten); this is the input to the topic model below.
movies['full_overview'] = movies['full_overview'].apply(stem_text)

In [36]:
movies['full_overview'].head()

0    led woodi andi toy live happili room andi birt...
1    sibl judi peter discov enchant board game open...
2    famili wed reignit ancient feud nextdoor neigh...
3    cheat mistreat step women hold breath wait elu...
4    georg bank recov daughter wed receiv news she ...
Name: full_overview, dtype: object

# Topic modeling of full movie overviews

In [41]:
def display_topics(model, feature_names, no_top_words, no_top_topics, topic_names=None):
    """Print the highest-weighted terms of the leading topics of a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute (one weight row per topic).
    feature_names : sequence mapping a column index to its term.
    no_top_words : number of terms printed per topic, heaviest first.
    no_top_topics : printing stops once this many topics have been shown.
    topic_names : optional sequence of labels; a topic with a falsy label
        (or no labels at all) is shown by its 1-based number instead.
    """
    for topic_idx, weights in enumerate(model.components_):
        if topic_idx == no_top_topics:
            break

        if topic_names and topic_names[topic_idx]:
            print("\nTopic: '", topic_names[topic_idx], "'")
        else:
            print("\nTopic ", (topic_idx + 1))

        # argsort is ascending, so walk it backwards to get the heaviest terms first.
        top_term_ids = weights.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[term_id] for term_id in top_term_ids]))

### NMF with TFIDF

In [42]:
# TF-IDF over unigrams and bigrams of the stemmed overviews.
# stop_words is passed as a list: recent scikit-learn releases validate the
# argument and reject other collection types such as gensim's frozenset.
# The set of stop words is unchanged.
vectorizer = TfidfVectorizer(stop_words = list(STOPWORDS), ngram_range = (1,2))
doc_word = vectorizer.fit_transform(movies['full_overview'])

# 20 topics. A fixed random_state makes the factorisation, and every pickle
# derived from it, reproducible from run to run.
nmf = NMF(n_components = 20, random_state = 0)
doc_topic = nmf.fit_transform(doc_word)

# get_feature_names() was removed from recent scikit-learn in favour of
# get_feature_names_out(); use whichever this installation provides.
get_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
display_topics(nmf, get_feature_names(), 15, 20)


Topic  1
famili, home, hous, vacat, dysfunct, life, live, dysfunct famili, parent, famili secret, secret, children, kid, famili relationship, famili vacat

Topic  2
school, high, high school, student, teacher, teenag, girl, school student, teen, popular, graduat, school teacher, footbal, class, board school

Topic  3
polic, murder, cop, detect, investig, angel, lo, lo angel, agent, crime, offic, car, bank, corrupt, crimin

Topic  4
war, world war, world, soldier, nazi, armi, american, german, civil, vietnam, civil war, japanes, battl, germani, forc

Topic  5
york, new york, new, citi, york citi, street, manhattan, subway, apart, brooklyn, manhattan new, mafia, coupl, writer, work

Topic  6
love, fall, fall love, life, love life, marriag, man, meet, marri, lover, affair, relationship, love stori, young, romant

Topic  7
town, man, young, small, death, live, life, hous, mysteri, small town, evil, vampir, dead, die, girl

Topic  8
woman, director, woman director, young woman, pari, young

In [43]:
# Column labels 'topic1' .. 'topic20', one per NMF component.
column_names = ['topic%d' % topic_no for topic_no in range(1, 21)]
doc_topic_df = pd.DataFrame(doc_topic, columns=column_names)


In [44]:
# Append the topic columns to the right of the movie columns.
# NOTE(review): axis=1 concat aligns rows by index label; assumes both frames carry the same default 0..n-1 index — confirm.
movies = pd.concat([movies, doc_topic_df], axis = 1)

In [45]:
# No duplicates added this time
movies.shape

(12757, 32)

# Create sparse vector representations of actors and genres

### Genres

In [46]:
# Distinct genre names across all movies. Sorting fixes the column order:
# list(set(...)) can come out in a different order on every kernel start,
# which made the layout of the feature matrix vary between runs.
all_genres = sorted({genre for genres in movies['genres'] for genre in genres})

# Genre name -> column position (replaces repeated list.index() scans).
genre_col = {genre: j for j, genre in enumerate(all_genres)}

# One row per movie, one indicator column per genre (1 if the movie has it).
sparse_genres = np.zeros((movies.shape[0], len(all_genres)))

for i, movie in enumerate(movies['genres']):
    for genre in movie:
        sparse_genres[i, genre_col[genre]] = 1

sparse_genres = pd.DataFrame(data = sparse_genres, columns = all_genres)

In [47]:
# Append the genre indicator columns to the right of the existing columns.
# NOTE(review): axis=1 concat aligns rows by index label; assumes both frames carry the same default 0..n-1 index — confirm.
movies = pd.concat([movies, sparse_genres], axis = 1)

### Actors

In [48]:
# Reload the popular-actor list from the file this notebook writes earlier
# ('pickles/actor_list.pkl'). The previous path, 'models/pop_actors.pkl', is
# never produced here, so the cell failed on a clean checkout.
# The context manager closes the file handle after loading.
with open('pickles/actor_list.pkl', 'rb') as fh:
    pop_actors = pickle.load(fh)

# Actor name -> column position. setdefault keeps the first position if a
# name were ever repeated, which is what list.index() returned before.
actor_col = {}
for j, actor in enumerate(pop_actors):
    actor_col.setdefault(actor, j)

# One row per movie, one indicator column per popular actor.
sparse_actors = np.zeros((movies.shape[0], len(pop_actors)))

for i, movie in enumerate(movies['cast_list']):
    # A movie with no match in credits has a missing value here rather than a list.
    if not isinstance(movie, list):
        continue
    for actor in movie:
        j = actor_col.get(actor)
        if j is not None:
            sparse_actors[i, j] = 1

sparse_actors = pd.DataFrame(data = sparse_actors, columns = pop_actors)

In [49]:
# Append the actor indicator columns to the right of the existing columns.
# NOTE(review): axis=1 concat aligns rows by index label; assumes both frames carry the same default 0..n-1 index — confirm.
movies = pd.concat([movies, sparse_actors], axis = 1)

### Lower all column names

In [50]:
# Map every column name to its lowercase form, with hyphens turned into spaces.
col_dict = {col: col.lower().replace('-', ' ') for col in movies.columns}

In [51]:
# Apply the lowercase column names.
movies = movies.rename(columns=col_dict)

In [69]:
# Save movies dataframe
# pickle.dump(movies, open('models/movies.pkl', 'wb'))

### Create array for movie properties

In [52]:
# Column layout of `movies` used below: positions 0-11 are metadata, the
# next 20 (12-31) are the topic weights, and everything after that is the
# genre/actor indicators.

# Find movies whose topic vector is all zeros. Only the 20 topic columns are
# converted to an array here; building the full dense feature matrix just
# for this check (and then rebuilding it after the drop) is not needed.
topic_norms = np.linalg.norm(np.array(movies.iloc[:, 12:32]), axis = 1)

# Drop rows with zero norms (index labels equal row positions at this point)
zero_norms = list(np.where(topic_norms == 0)[0])
movies = movies.drop(index = zero_norms).reset_index(drop = True)

# Build the feature matrix and topic norms from the filtered dataframe
movie_array = np.array(movies.iloc[:,12:])
norms = np.linalg.norm(movie_array[:,:20], axis = 1)

# Set col_names, movie_names, and movie_metadata variables
col_names = list(movies.columns[12:])
movie_names = list(movies['title'].apply(lambda x: x.lower()))
movie_metadata = movies.iloc[:, :12]

In [56]:
# Center each movie's 20 topic weights around 0 by subtracting the row's own mean.
# movie_temp is a view of movie_array (basic slicing), so the in-place
# subtraction also updates the first 20 columns of movie_array, exactly as
# the row-by-row loop it replaces did.
movie_temp = movie_array[:,:20]
movie_temp -= movie_temp.sum(axis = 1, keepdims = True) / 20


In [75]:
# Concatenate centered representation with actor and genre info.
# np.concatenate builds a new array: the 20 topic columns (movie_temp) followed by the remaining columns.
movie_array = np.concatenate((movie_temp, movie_array[:,20:]), axis = 1)
# Display the final (rows, columns) shape.
movie_array.shape

(12689, 10571)

In [77]:
# Recompute the per-row norm over the first 20 (topic) columns, now that they have been centered.
norms = np.linalg.norm(movie_array[:,:20], axis = 1)

### Create pickles of movie_array, norms, col_names, movie_names, and movie_metadata

In [78]:
# Write every artifact to pickles/<name>.pkl. Each file is opened in a
# context manager so its handle is closed as soon as the dump completes.
artifacts = [
    ('movie_array', movie_array),
    ('norms', norms),
    ('col_names', col_names),
    ('movie_names', movie_names),
    ('movie_metadata', movie_metadata),
]

for name, obj in artifacts:
    with open('pickles/%s.pkl' % name, 'wb') as fh:
        pickle.dump(obj, fh)