In [2]:
import numpy as np
import pandas as pd

In [3]:
credits= pd.read_csv('datasets/tmdb_5000_credits.csv')
movies = pd.read_csv('datasets/tmdb_5000_movies.csv')

In [4]:
movies = movies.merge(credits, on = 'title')

In [5]:
movies = movies[['movie_id', 'title', 'genres', 'keywords', 'overview','cast', 'crew']]

In [6]:
movies.isnull().sum()
# checks columns if there are any null values and returns the total number of null values

movie_id    0
title       0
genres      0
keywords    0
overview    3
cast        0
crew        0
dtype: int64

In [7]:
# drops all the null values in the columns
movies.dropna(inplace = True)

In [8]:
movies.isnull().sum()

movie_id    0
title       0
genres      0
keywords    0
overview    0
cast        0
crew        0
dtype: int64

In [9]:
# checks if there are any duplicates in the dataframe and returns the total number of duplicates
movies.duplicated().sum()

0

<h1>To Note</h1>
we need to pre - process the data properly so that the model can learn well. 
We do this by merging the key features and selecting only the relevent data from it
1. We need to convert genres, keywords, cast, crew and overview columns to list in order to concatenate them
2. For 'cast' feature we select only the first 3 as they are the most important
3. In 'crew' we only need the director's name as only he will be recognizable by large audiences
4. For 'overview' we need to convert the string to a list and since it is not in dictionary format we use (lambda x:x.split)
5. Then we merge the features 'genres', 'keywords', 'cast', 'crew' and 'overview' to one column named 'tags'

In [10]:
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [11]:
movies.iloc[0].genres
# gives us a string

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [12]:
import ast
# we need to convert to a list as we get a string when appending and we need to iterate through a list in the loop of the function
# ast.literal_eval() converts inputed item to a list

ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [13]:
# .apply() gives us ability to call dataframe row - wise or column - wise
movies['genres'] = movies['genres'].apply(convert)

In [14]:
movies['keywords'] = movies['keywords'].apply(convert)

In [15]:
# creating a function to reduce the data so that we keep only first 3 cast names
def convert_cast(obj):
    L = []
    c = 0 # counter
    for i in ast.literal_eval(obj):
        if c != 3:
            L.append(i['name'])
            c += 1
    return L

In [16]:
movies['cast'] = movies['cast'].apply(convert_cast)

In [17]:
# creating a function to reduce the data so that we keep only first 3 cast names
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L

In [18]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [19]:
movies.head()

Unnamed: 0,movie_id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [20]:
# lamda is used to split overview from a string into a list of words
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [21]:
movies['overview']

0       [In, the, 22nd, century,, a, paraplegic, Marin...
1       [Captain, Barbossa,, long, believed, to, be, d...
2       [A, cryptic, message, from, Bond’s, past, send...
3       [Following, the, death, of, District, Attorney...
4       [John, Carter, is, a, war-weary,, former, mili...
                              ...                        
4804    [El, Mariachi, just, wants, to, play, his, gui...
4805    [A, newlywed, couple's, honeymoon, is, upended...
4806    ["Signed,, Sealed,, Delivered", introduces, a,...
4807    [When, ambitious, New, York, attorney, Sam, is...
4808    [Ever, since, the, second, grade, when, he, fi...
Name: overview, Length: 4806, dtype: object

<h1>TO NOTE</h1>

We need to remove the whitespace between all elements in the list as when converting back to a list if 2 elements have same first name 
then the model will consider them to be same even if they have different surnames

For eg: Sam Worthington and Sam Mendes, even though they are 2 different people the model will take both Sams as one person and Worthington and Mendes as 
2 separate other people

In [22]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [23]:
movies.head()

Unnamed: 0,movie_id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


Now we need to concatenate genre, crew, cast, keywords and overview as a new column 'tags'

In [24]:
movies['tags'] = movies['overview'] + movies['keywords'] + movies['cast'] + movies['crew'] + movies['genres']

In [25]:
movies.head()

Unnamed: 0,movie_id,title,genres,keywords,overview,cast,crew,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


We create a new dataframe for the model to use less data as well as pre - processed data as the training set

In [26]:
new_df = movies[['movie_id','title','tags']]

In [27]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [28]:
# converting the list to a string datatype
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [29]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [30]:
# making all the data into lowecase 
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


In [31]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


<h1>
***Come back after countvectorizer***
</h1>

<h1>Note</h1>
There may be the occurrence of 2 similar words as 2 different features (eg: 'action' and 'actions' or 'activity' and 'activities') but they may mean the same thing, just a slight change (it may be plural of the same word or its past tense usage)

To fix this we use 'stemming'

- We do this on the normal 'tags' column (before countvectorizer function)

using a module from nltk library called PorterStemmer we can find the stem word of each word the function is applied to

In [33]:
# using nltk to perform word stemming
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

# create a function to apply stemming to each word in the tags column

def stem(text):
    l =[]
    for i in text.split(): # we need to convert the string to a list of words in order to perform operations on it
        l.append(ps.stem(i)) # returns the stem word of the word in the bracket of 'ps.stem()' and appends the stem word to list of words 'l'
    return " ".join(l) # " ".join(l) will return all the words in l to make a string with all words separated by a whitespace


'''Example:
   ps.stem('loved') --> 'love'
   ps.stem('loves') --> 'love'
'''

"Example:\n   ps.stem('loved') --> 'love'\n   ps.stem('loves') --> 'love'\n"

In [34]:
# calling the stem function to perform stemming on 'tags' column and assigning the new value to it
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [35]:
new_df['tags']

0       in the 22nd century, a parapleg marin is dispa...
1       captain barbossa, long believ to be dead, ha c...
2       a cryptic messag from bond’ past send him on a...
3       follow the death of district attorney harvey d...
4       john carter is a war-weary, former militari ca...
                              ...                        
4804    el mariachi just want to play hi guitar and ca...
4805    a newlyw couple' honeymoon is upend by the arr...
4806    "signed, sealed, delivered" introduc a dedic q...
4807    when ambiti new york attorney sam is sent to s...
4808    ever sinc the second grade when he first saw h...
Name: tags, Length: 4806, dtype: object

<h1>Count Vectorizer</h1>
It is an sklearn module that can convert a collection of texts into a matrix of word counts

- max_features : will tell us the number of words we count (the 5000 most common words in the text)

- stop_words : will remove all the words that do not describe the movie and help only in sentence formation (eg: a, the, I, and)

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, stop_words = 'english')

In [37]:
# converting tags to numeric form and storing this matrix in object 'vectors'
vectors = cv.fit_transform(new_df['tags']).toarray()

In [38]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [39]:
# I want to display all 5000 feature words
feature_names = cv.get_feature_names_out()
for feature in feature_names:
    print(feature)

000
007
10
100
11
12
13
14
15
16
17
17th
18
18th
18thcenturi
19
1910
1920
1930
1940
1944
1950
1950s
1960
1960s
1970
1970s
1971
1974
1976
1980
1985
1990
1999
19th
19thcenturi
20
200
2003
2009
20th
21st
23
24
25
30
300
3d
40
50
500
60
70
80
aaron
aaroneckhart
abandon
abduct
abigailbreslin
abil
abl
aboard
abov
abus
academ
academi
accept
access
accid
accident
acclaim
accompani
accomplish
account
accus
ace
achiev
acquaint
act
action
actionhero
activ
activist
activities
actor
actress
actual
ad
adam
adamsandl
adamshankman
adapt
add
addict
adjust
admir
admit
adolesc
adopt
ador
adrienbrodi
adult
adultanim
adulteri
adulthood
advanc
adventur
adventure
adventures
advertis
advic
advis
affair
affect
afghanistan
africa
african
africanamerican
aftercreditssting
afterlif
aftermath
ag
age
agediffer
agenc
agency
agenda
agent
agents
aggress
ago
agre
ahead
aid
aidanquinn
ail
aim
air
airplan
airplanecrash
airport
aka
al
alabama
alan
alaska
albert
alcatraz
alcohol
alecbaldwin
alex
alexkendrick
alfredhitchcoc

In [40]:
cv.get_feature_names_out().shape

(5000,)

- now go to stemming part

<h1>Main function</h1>

In order to find similar movies using the matrix table we need to find radial difference between 2 vectors and the ones with the shortest distances will be similar movies

We find angle difference instead of point to point (eucladian) distance as that is more accurate
- Eucladian distance is not reliable as there are many instances to compare with, hence the usage of angle difference

For example:
- movies with 0 degree differece are the same movie
- movies with 5 degree difference are extremely similar
- movies with 90 degree difference are very dissimilar
- movies with 180 degree difference are like opposite movies

We use **Cosine Distance**

In [41]:
# we can calculate how close the vectors are to each other using an sklearn function called cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

In [42]:
cosine_similarity(vectors).shape
# this returns similarity between all movies, 1 for same value and 0 for opposite value
# Eg:
# dist m1 and m1; m1 and m2; m1 and m3...m1 and m4806
# then
# dist m2 and m1; m2 and m2; m2 and m3... m2 and m4806
# like that it compares each distance till m4806 and m4806'''

(4806, 4806)

In [43]:
# this will store all similarities in a matrix called similarity
similarity = cosine_similarity(vectors)

In [44]:
# checks similarity of m1 and all movies
# 1st column is 1 as m1 is compared to itself first
similarity[0]

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

In [45]:
# 2nd column is 1 as m2 is compared with m2
similarity[1]

array([0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
       0.02615329])

**note**: we cannot use sorted function to return dissimilar values as the index position will change according to similarity and wrong movies will be returned as index is different from original

hence we use enumerate() as this will return the similarity as well as the index position it is located at

In [46]:
sorted(list(similarity[0])) # will return ascending order from dissimilar to similar movies hence we need to reverse it

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [47]:
sorted(list(similarity[0]), reverse = True) 
# reversed to get similar movies first (we need to recommend movies with similarity close to 1)

[1.0000000000000002,
 0.28676966733820225,
 0.26901379342448517,
 0.2605130246476754,
 0.255608593705383,
 0.25038669783359574,
 0.24511108480187255,
 0.24455799402225922,
 0.23179316248638276,
 0.23174488732966075,
 0.2278389747471728,
 0.2252817784447915,
 0.22269966704152225,
 0.21853668936906193,
 0.21239769762143662,
 0.2108663315950723,
 0.2105263157894737,
 0.20443988269091456,
 0.20437977982832192,
 0.20395079136182276,
 0.2029530274475215,
 0.2029530274475215,
 0.20277677641345318,
 0.2024645717996314,
 0.2020475485519274,
 0.1979082783981174,
 0.19767387315371682,
 0.1976738731537168,
 0.19672236884115843,
 0.19252140716412977,
 0.19134594929397597,
 0.19088542889273336,
 0.19088542889273336,
 0.19007487139298027,
 0.1892994097121204,
 0.18731716231633883,
 0.1873171623163388,
 0.18693292157876878,
 0.1860807318911967,
 0.18394180184548975,
 0.18394180184548975,
 0.1813690625275029,
 0.18074256993863339,
 0.17996850826633903,
 0.17954621161490197,
 0.1777046633277277,
 0.1769

- we get values similar to dissimilar but the index has changed so we will not be able to call the original movie with similarites 
- For eg: original movie with 0.2867 similarity used to be at 60th position but now it will return the 2nd movie in the database as the movie with 0.2867 similarity
- hence we use enumerate() as that will return both original dissimilarites and original index positions

In [48]:
sorted(list(enumerate(similarity[0])), reverse=True, key = lambda x : x[1])[1:6]
# lambda function shows we are sorting movies that are most similar from 2nd movie onwards
# [1:6] means we select only the first 5 most similar to the movies

[(1214, 0.28676966733820225),
 (2405, 0.26901379342448517),
 (3728, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574)]

In [49]:
# creating a function to find the index of movie entered by user

def recommend(movie):
    if movie in new_df['title'].values:
        movie_index = new_df[new_df['title'] == movie].index[0]
        distances = similarity[movie_index]
        movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
        
        for i in movies_list:
            print(new_df.iloc[i[0]].title)
    else:
        print(f"Movie '{movie}' not found in the dataset.")


In [50]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


we need movie title not index hence we use iloc

In [51]:
new_df.iloc[1214].title

'Aliens vs Predator: Requiem'

In [52]:
recommend('Batman Begins')

The Dark Knight
Batman
Batman
The Dark Knight Rises
10th & Wolf


In [53]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [54]:
import pickle

In [56]:
new_df['title'].values

array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Signed, Sealed, Delivered', 'Shanghai Calling',
       'My Date with Drew'], dtype=object)

In [57]:
pickle.dump(new_df.to_dict(), open('attributes/movies_dict.pkl', 'wb'))

In [58]:
pickle.dump(similarity,open('attributes/similarity.pkl', 'wb'))