In [100]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [101]:
origin_db = pd.read_csv('tmdb_5000_movies.csv')

In [102]:
# Initialize my db
selected_feature = ['genres']
selected_index = ['original_title']
db = pd.DataFrame()
db[selected_feature ] = origin_db[selected_feature]
db['index'] = origin_db[selected_index]
db.set_index('index', inplace=True)

In [103]:
# Parsing json format 'genres' into space-splited list
db['genres'] = db['genres'].apply(lambda genres: ' '.join(map(lambda item: item['name'], json.loads(genres))))
db

Unnamed: 0_level_0,genres
index,Unnamed: 1_level_1
Avatar,Action Adventure Fantasy Science Fiction
Pirates of the Caribbean: At World's End,Adventure Fantasy Action
Spectre,Action Adventure Crime
The Dark Knight Rises,Action Crime Drama Thriller
John Carter,Action Adventure Science Fiction
...,...
El Mariachi,Action Crime Thriller
Newlyweds,Comedy Romance
"Signed, Sealed, Delivered",Comedy Drama Romance TV Movie
Shanghai Calling,


In [104]:
from itertools import combinations
tf = TfidfVectorizer(analyzer=lambda x: (item for i in range(1, 6) for item in combinations(x.split(), i)))
result_matrix = tf.fit_transform(db['genres'])
result_matrix.shape

(4803, 4137)

In [105]:
from sklearn.metrics.pairwise import cosine_similarity
similarities = cosine_similarity(result_matrix)
similarities


array([[1.        , 0.1414941 , 0.09074694, ..., 0.        , 0.        ,
        0.        ],
       [0.1414941 , 1.        , 0.11144   , ..., 0.        , 0.        ,
        0.        ],
       [0.09074694, 0.11144   , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [106]:
sim2 = pd.DataFrame(similarities)
sim2.columns = db.index
sim2.index = db.index
# sim2

index,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter,Spider-Man 3,Tangled,Avengers: Age of Ultron,Harry Potter and the Half-Blood Prince,Batman v Superman: Dawn of Justice,...,On The Downlow,Sanctuary: Quite a Conundrum,Bang,Primer,Cavite,El Mariachi,Newlyweds,"Signed, Sealed, Delivered",Shanghai Calling,My Date with Drew
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,1.000000,0.141494,0.090747,0.013583,0.568604,0.113150,0.0,0.568604,0.118594,0.388215,...,0.000000,0.000000,0.000000,0.054539,0.000000,0.022730,0.0000,0.000000,0.0,0.0
Pirates of the Caribbean: At World's End,0.141494,1.000000,0.111440,0.033516,0.076252,0.376915,0.0,0.076252,0.292637,0.364474,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.056087,0.0000,0.000000,0.0,0.0
Spectre,0.090747,0.111440,1.000000,0.178933,0.159596,0.204119,0.0,0.159596,0.061493,0.233754,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.299436,0.0000,0.000000,0.0,0.0
The Dark Knight Rises,0.013583,0.033516,0.178933,1.000000,0.023888,0.030552,0.0,0.023888,0.000000,0.034987,...,0.112201,0.024031,0.112201,0.063023,0.032227,0.597567,0.0000,0.004703,0.0,0.0
John Carter,0.568604,0.076252,0.159596,0.023888,1.000000,0.139667,0.0,1.000000,0.042076,0.159945,...,0.000000,0.000000,0.000000,0.095918,0.000000,0.039975,0.0000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
El Mariachi,0.022730,0.056087,0.299436,0.597567,0.039975,0.051127,0.0,0.039975,0.000000,0.058549,...,0.000000,0.040215,0.000000,0.029171,0.053931,1.000000,0.0000,0.000000,0.0,0.0
Newlyweds,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.058803,0.000000,0.000000,0.000000,0.000000,1.0000,0.115700,0.0,0.0
"Signed, Sealed, Delivered",0.000000,0.000000,0.000000,0.004703,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.041913,0.006803,0.041913,0.003632,0.000000,0.000000,0.1157,1.000000,0.0,0.0
Shanghai Calling,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.0,0.0


In [107]:
pd.DataFrame(result_matrix.todense(), columns=tf.get_feature_names(), index=db.index).sample(10, axis=0).sample(10, axis=1)

Unnamed: 0_level_0,"(Science, Fiction, Comedy, Adventure)","(Family, Fantasy, Science)","(Drama, Mystery, Romance, Science, Thriller)","(Crime, Thriller, Horror)","(Thriller, Action, Romance, Science, Adventure)","(Romance, Comedy, Adventure)","(Comedy, Family, Western)","(Adventure, Family, Music)","(Documentary,)","(Comedy, TV, Movie)"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Latter Days,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Against the Ropes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
High Fidelity,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Most Violent Year,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Let's Go to Prison,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Casablanca,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dreaming of Joseph Lees,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Black Swan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Great Debaters,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Assassination of Jesse James by the Coward Robert Ford,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
target_movie = 'John Carter'
sim2.loc[target_movie].sort_values(ascending=False)

index
Damnation Alley       1.0
Total Recall          1.0
Six-String Samurai    1.0
2012                  1.0
U.F.O.                1.0
                     ... 
Survivor              0.0
The Frozen Ground     0.0
The Painted Veil      0.0
Bad Teacher           0.0
My Date with Drew     0.0
Name: John Carter, Length: 4803, dtype: float64