In [2]:
import pandas as pd
import numpy as np

In [3]:
movies = pd.read_csv("movie.csv")

In [4]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [5]:
movies["genres"]=movies["genres"].replace('(no genres listed)','None')

In [6]:
movies.drop(movies[movies['genres'] == "None"].index, inplace = True)

In [7]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27272,131252,Forklift Driver Klaus: The First Day on the Jo...,Comedy|Horror
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure


In [8]:
import re
def clean_title(title):
    return re.sub("[^a-zA-Z ]", "", title)
movies["clean_title"] = movies["title"].apply(clean_title)

In [9]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II
...,...,...,...,...
27272,131252,Forklift Driver Klaus: The First Day on the Jo...,Comedy|Horror,Forklift Driver Klaus The First Day on the Job
27273,131254,Kein Bund für's Leben (2007),Comedy,Kein Bund frs Leben
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,Feuer Eis Dosenbier
27275,131258,The Pirates (2014),Adventure,The Pirates


In [10]:
def clean_genres(genres):
    return re.sub("[^a-zA-Z ]", " ", genres)
movies["clean_genres"] = movies["genres"].apply(clean_genres)

In [11]:
movies

Unnamed: 0,movieId,title,genres,clean_title,clean_genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,Comedy
...,...,...,...,...,...
27272,131252,Forklift Driver Klaus: The First Day on the Jo...,Comedy|Horror,Forklift Driver Klaus The First Day on the Job,Comedy Horror
27273,131254,Kein Bund für's Leben (2007),Comedy,Kein Bund frs Leben,Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,Feuer Eis Dosenbier,Comedy
27275,131258,The Pirates (2014),Adventure,The Pirates,Adventure


In [12]:
movies = movies[["movieId", "clean_title", "clean_genres"]]

In [13]:
movies

Unnamed: 0,movieId,clean_title,clean_genres
0,1,Toy Story,Adventure Animation Children Comedy Fantasy
1,2,Jumanji,Adventure Children Fantasy
2,3,Grumpier Old Men,Comedy Romance
3,4,Waiting to Exhale,Comedy Drama Romance
4,5,Father of the Bride Part II,Comedy
...,...,...,...
27272,131252,Forklift Driver Klaus The First Day on the Job,Comedy Horror
27273,131254,Kein Bund frs Leben,Comedy
27274,131256,Feuer Eis Dosenbier,Comedy
27275,131258,The Pirates,Adventure


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,10))
tfidf = vectorizer.fit_transform(movies["clean_genres"])

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
def search(genre):
    genre = clean_genres(genre)         
    query_vec = vectorizer.transform([genre])           
    similarity = cosine_similarity(query_vec,tfidf).flatten()           
    indices = np.argpartition(similarity, -20)[-20:]   
    results = movies.iloc[indices][::-1]          
    return results

In [17]:
search("adventure")

Unnamed: 0,movieId,clean_title,clean_genres
2452,2537,Beyond the Poseidon Adventure,Adventure
7999,8682,Last Valley The,Adventure
23012,109833,Blackbeard the Pirate,Adventure
23097,110134,Belle and Sebastien Belle et Sbastien,Adventure
13536,67098,Billy Budd,Adventure
18591,92516,She Gods of Shark Reef,Adventure
25942,123499,The Hunters,Adventure
15337,78184,The Black Rose,Adventure
2392,2477,Firewalker,Adventure
14430,72276,Indian Tomb The Das indische Grabmal,Adventure


In [18]:
import pickle
pickle.dump(movies, open("movies.pkl", "wb"))

In [19]:
pickle.dump(movies.to_dict(),open("movies_dict.pkl", "wb"))