In [None]:
# change title to original_title
# data source updated (~8000 movies)

In [1]:
import pandas as pd 
import numpy as np 

# Load the movie metadata table used throughout this notebook.
# NOTE(review): the CSV is presumably produced by the hybrid-model data-cleaning notebook — confirm the path/provenance.
df1=pd.read_csv('movies_matched.csv') # see hybrid model data cleaning

In [2]:
# Names of the columns that hold comma-separated strings; they are parsed
# into Python lists with get_list in a later cell.
features = ['actors', 'director', 'writer', 'genre']

In [3]:
def get_list(x, max_items=3):
    """Split a comma-separated string into a list of at most ``max_items`` names.

    Parameters
    ----------
    x : object
        Usually a string such as ``"Drama, History, War"``. Any non-string
        value (for example a NaN coming from a missing CSV cell) is treated
        as "no data".
    max_items : int, default 3
        Maximum number of leading entries to keep.

    Returns
    -------
    list of str
        The first ``max_items`` non-empty entries, each stripped of
        surrounding whitespace. An empty list when ``x`` is not a string or
        contains no usable entries.
    """
    if not isinstance(x, str):
        return []
    # Splitting on ',' leaves the space that follows each comma attached to
    # the next entry, so strip every piece and drop pieces that end up empty
    # (empty input, trailing commas, doubled commas).
    names = [part.strip() for part in x.split(',')]
    names = [name for name in names if name]
    return names[:max_items]

In [4]:
# Replace each comma-separated string column with the list produced by get_list.
# NOTE: the columns are overwritten in place, so re-running this cell on the
# already-parsed frame turns every value into [] (get_list returns [] for
# anything that is not a string). Reload df1 before re-running.
features = ['actors', 'director', 'writer', 'genre']
for feature in features:
    df1[feature] = df1[feature].apply(get_list)

In [5]:
# Show the parsed list-valued columns for the first 3 films as a sanity check.
df1[['original_title', 'actors', 'director', 'writer', 'genre']].head(3)

Unnamed: 0,original_title,actors,director,writer,genre
0,The Birth of a Nation,"[Henry B. Walthall, Lillian Gish, Mae Marsh]",[D.W. Griffith],"[Thomas Dixon Jr., Thomas Dixon Jr.]","[Drama, History, War]"
1,"20,000 Leagues Under the Sea","[Dan Hanlon, Edna Pendleton, Curtis Benton]",[Stuart Paton],[Jules Verne],"[Action, Adventure, Sci-Fi]"
2,Intolerance: Love's Struggle Throughout the Ages,"[Lillian Gish, Mae Marsh, Robert Harron]",[D.W. Griffith],"[D.W. Griffith, Anita Loos]","[Drama, History]"


In [6]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    """Normalise names by lower-casing them and removing every space.

    A list is processed element by element and a new list is returned; a
    plain string is normalised directly; any other value (e.g. a missing
    entry) yields an empty string.
    """
    if isinstance(x, list):
        normalised = []
        for item in x:
            normalised.append(item.replace(" ", "").lower())
        return normalised
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither a list nor a string: nothing to normalise.
    return ''

In [7]:
# Normalise every entry of the list-valued feature columns with clean_data
# (lower-case, spaces removed), e.g. "Lillian Gish" -> "lilliangish", so that
# each name no longer contains spaces.
features = ['actors', 'director', 'writer', 'genre']

for feature in features:
    df1[feature] = df1[feature].apply(clean_data)

In [8]:
# Show the same columns again after clean_data has been applied.
df1[['original_title', 'actors', 'director', 'writer', 'genre']].head(3)

Unnamed: 0,original_title,actors,director,writer,genre
0,The Birth of a Nation,"[henryb.walthall, lilliangish, maemarsh]",[d.w.griffith],"[thomasdixonjr., thomasdixonjr.]","[drama, history, war]"
1,"20,000 Leagues Under the Sea","[danhanlon, ednapendleton, curtisbenton]",[stuartpaton],[julesverne],"[action, adventure, sci-fi]"
2,Intolerance: Love's Struggle Throughout the Ages,"[lilliangish, maemarsh, robertharron]",[d.w.griffith],"[d.w.griffith, anitaloos]","[drama, history]"


In [9]:
def create_soup(x):
    """Build the metadata "soup" string for one row.

    The entries of the 'writer', 'actors', 'director' and 'genre' lists are
    each joined with single spaces, and the four resulting strings are then
    joined with single spaces in that order.
    """
    soup_columns = ('writer', 'actors', 'director', 'genre')
    return ' '.join(' '.join(x[column]) for column in soup_columns)
# Build one soup string per row (axis=1 passes each row to create_soup).
df1['soup'] = df1.apply(create_soup, axis=1)

In [10]:
# Inspect the soup strings of the first 3 films.
df1['soup'].head(3)

0    thomasdixonjr. thomasdixonjr. henryb.walthall ...
1    julesverne danhanlon ednapendleton curtisbento...
2    d.w.griffith anitaloos lilliangish maemarsh ro...
Name: soup, dtype: object

In [11]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Fit a bag-of-words vocabulary on the soup strings and build the
# movies x tokens count matrix.
# NOTE(review): soup tokens contain punctuation (e.g. "d.w.griffith", "sci-fi").
# CountVectorizer's default token pattern presumably splits on such characters
# and drops one-character pieces — confirm against the scikit-learn docs, and
# consider token_pattern=r"\S+" if each name is meant to stay a single token.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df1['soup'])

In [12]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Similarity of every row of count_matrix with every other row.
# NOTE(review): the result is presumably a dense n_movies x n_movies array —
# check the memory footprint before scaling the dataset up.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [13]:
# Give df1 a clean 0..n-1 index and build a reverse mapping title -> row position.
# drop=True discards the old index instead of inserting it as an extra column;
# without it, every re-run of this cell added another column to df1.
df1 = df1.reset_index(drop=True)
indices = pd.Series(df1.index, index=df1['original_title'])
# Keep only the first row for each title so that indices[title] is always a
# single integer (duplicate titles would otherwise return a Series).
indices = indices[~indices.index.duplicated(keep='first')]

In [16]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim2, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Value of ``original_title`` to look up in the module-level ``indices``
        mapping.
    cosine_sim : 2-D array-like, default ``cosine_sim2``
        Pairwise similarity matrix; row ``i`` holds the similarities of the
        movie at row position ``i`` of ``df1`` to every movie.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``original_title`` values of the ``top_n`` most similar movies,
        most similar first, indexed by their row position in ``df1``.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if title not in indices.index:
        raise KeyError(f"Title not found in the dataset: {title!r}")

    # Get the row position of the movie that matches the title.
    idx = indices[title]
    # When several rows share the same title the lookup returns a Series;
    # use the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pairwise similarity scores of all other movies with that movie. The
    # query movie is excluded explicitly rather than by assuming it sorts
    # first, which does not hold when another movie ties with it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Most similar first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the top_n most similar movies.
    movie_indices = [position for position, _ in sim_scores[:top_n]]

    return df1['original_title'].iloc[movie_indices]

We can now reuse our **get_recommendations()** function by passing in the new **cosine_sim2** matrix as the second argument.

In [18]:
# Titles most similar to 'Inception' according to the cosine_sim2 matrix.
get_recommendations('Inception', cosine_sim2)

6609                    Interstellar
7252           The Dark Knight Rises
6438                    The Prestige
8497                         Dunkirk
6356                 The Dark Knight
7170                          Looper
1       20,000 Leagues Under the Sea
1396                       Moonraker
5759                   Batman Begins
5834                         Stealth
Name: original_title, dtype: object

In [20]:
# Titles most similar to 'Iron Man 2' according to the cosine_sim2 matrix.
get_recommendations('Iron Man 2', cosine_sim2)

7196                  Iron Man Three
5754                        Iron Man
8020         Avengers: Age of Ultron
4348                      Spider-Man
5403                    Spider-Man 2
7931          Spider-Man: Homecoming
8303      Captain America: Civil War
8418          Avengers: Infinity War
6649                    The Avengers
1       20,000 Leagues Under the Sea
Name: original_title, dtype: object