In [38]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Importing library

In [39]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Loading Datasets from gdrive

In [40]:
df = pd.read_csv("/content/gdrive/MyDrive/college_project/data_p1.csv")

In [41]:
df.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy science friction,Avatar
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,Pirates of the Caribbean: At World's End
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,Spectre
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon Levitt,Action Thriller,The Dark Knight Rises
4,Doug Walker,Doug Walker,Rob Walker,,Documentary,Star Wars: Episode VII - The Force Awakens ...


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   director_name  4939 non-null   object
 1   actor_1_name   5036 non-null   object
 2   actor_2_name   5030 non-null   object
 3   actor_3_name   5020 non-null   object
 4   genres         5043 non-null   object
 5   movie_title    5043 non-null   object
dtypes: object(6)
memory usage: 236.5+ KB


## Checking NAN value

In [43]:
df.isna().any()

director_name     True
actor_1_name      True
actor_2_name      True
actor_3_name      True
genres           False
movie_title      False
dtype: bool

## Drop NAN value

In [44]:
df = df.dropna()

In [45]:
df.isna().any()

director_name    False
actor_1_name     False
actor_2_name     False
actor_3_name     False
genres           False
movie_title      False
dtype: bool

In [46]:
df.shape

(4919, 6)

In [47]:
df.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy science friction,Avatar
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,Pirates of the Caribbean: At World's End
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,Spectre
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon Levitt,Action Thriller,The Dark Knight Rises
5,Andrew Stanton,Daryl Sabara,Samantha Morton,Polly Walker,Action Adventure science friction,John Carter


In [48]:
df['director_name'] = df['director_name'].str.lower()
df['actor_1_name'] = df['actor_1_name'].str.lower()
df['actor_2_name'] = df['actor_2_name'].str.lower()
df['actor_3_name'] = df['actor_3_name'].str.lower()
df['genres'] = df['genres'].str.lower()
df['movie_title'] = df['movie_title'].str.lower()

In [49]:
df.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,james cameron,cch pounder,joel david moore,wes studi,action adventure fantasy science friction,avatar
1,gore verbinski,johnny depp,orlando bloom,jack davenport,action adventure fantasy,pirates of the caribbean: at world's end
2,sam mendes,christoph waltz,rory kinnear,stephanie sigman,action adventure thriller,spectre
3,christopher nolan,tom hardy,christian bale,joseph gordon levitt,action thriller,the dark knight rises
5,andrew stanton,daryl sabara,samantha morton,polly walker,action adventure science friction,john carter


In [50]:
index = []
for i in range(0 , df.shape[0]):
  index.append(i)

In [51]:
df["index"] = index

In [52]:
df.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,index
0,james cameron,cch pounder,joel david moore,wes studi,action adventure fantasy science friction,avatar,0
1,gore verbinski,johnny depp,orlando bloom,jack davenport,action adventure fantasy,pirates of the caribbean: at world's end,1
2,sam mendes,christoph waltz,rory kinnear,stephanie sigman,action adventure thriller,spectre,2
3,christopher nolan,tom hardy,christian bale,joseph gordon levitt,action thriller,the dark knight rises,3
5,andrew stanton,daryl sabara,samantha morton,polly walker,action adventure science friction,john carter,4


In [53]:
df['director_name'] = df['director_name'].str.replace('\d+', '')
df['actor_1_name'] = df['actor_1_name'].str.replace('\d+', '')
df['actor_2_name'] = df['actor_2_name'].str.replace('\d+', '')
df['actor_3_name'] = df['actor_3_name'].str.replace('\d+', '')
df['genres'] = df['genres'].str.replace('\d+', '')
df['movie_title'] = df['movie_title'].str.replace('\d+', '')

In [54]:
def combined_features(row):
    return row['director_name']+" "+row['actor_1_name']+" "+row['actor_2_name']+" "+row['actor_3_name']+" "+row["genres"]+" "+row["movie_title"]
df["combined_features"] = df.apply(combined_features, axis =1)

In [55]:
df.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,index,combined_features
0,james cameron,cch pounder,joel david moore,wes studi,action adventure fantasy science friction,avatar,0,james cameron cch pounder joel david moore wes...
1,gore verbinski,johnny depp,orlando bloom,jack davenport,action adventure fantasy,pirates of the caribbean: at world's end,1,gore verbinski johnny depp orlando bloom jack ...
2,sam mendes,christoph waltz,rory kinnear,stephanie sigman,action adventure thriller,spectre,2,sam mendes christoph waltz rory kinnear stepha...
3,christopher nolan,tom hardy,christian bale,joseph gordon levitt,action thriller,the dark knight rises,3,christopher nolan tom hardy christian bale jos...
5,andrew stanton,daryl sabara,samantha morton,polly walker,action adventure science friction,john carter,4,andrew stanton daryl sabara samantha morton po...


## Apply countVectorizer for canverting text into vector

In [56]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Find cosine Similarity

In [57]:
cosine_sim = cosine_similarity(count_matrix)

In [58]:
df.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,index,combined_features
0,james cameron,cch pounder,joel david moore,wes studi,action adventure fantasy science friction,avatar,0,james cameron cch pounder joel david moore wes...
1,gore verbinski,johnny depp,orlando bloom,jack davenport,action adventure fantasy,pirates of the caribbean: at world's end,1,gore verbinski johnny depp orlando bloom jack ...
2,sam mendes,christoph waltz,rory kinnear,stephanie sigman,action adventure thriller,spectre,2,sam mendes christoph waltz rory kinnear stepha...
3,christopher nolan,tom hardy,christian bale,joseph gordon levitt,action thriller,the dark knight rises,3,christopher nolan tom hardy christian bale jos...
5,andrew stanton,daryl sabara,samantha morton,polly walker,action adventure science friction,john carter,4,andrew stanton daryl sabara samantha morton po...


In [59]:
df['movie_title'] = df['movie_title'].str.strip()
df.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,index,combined_features
0,james cameron,cch pounder,joel david moore,wes studi,action adventure fantasy science friction,avatar,0,james cameron cch pounder joel david moore wes...
1,gore verbinski,johnny depp,orlando bloom,jack davenport,action adventure fantasy,pirates of the caribbean: at world's end,1,gore verbinski johnny depp orlando bloom jack ...
2,sam mendes,christoph waltz,rory kinnear,stephanie sigman,action adventure thriller,spectre,2,sam mendes christoph waltz rory kinnear stepha...
3,christopher nolan,tom hardy,christian bale,joseph gordon levitt,action thriller,the dark knight rises,3,christopher nolan tom hardy christian bale jos...
5,andrew stanton,daryl sabara,samantha morton,polly walker,action adventure science friction,john carter,4,andrew stanton daryl sabara samantha morton po...


Finding the index of the movie

In [60]:
movie_user_likes = "spectre"

def get_index_from_title(title):
    return df[df['movie_title'] == title]["index"].values[0]

movie_index = get_index_from_title(movie_user_likes)

In [61]:
movie_index

2

In [62]:
similar_movies = list(enumerate(cosine_sim[movie_index]))

In [63]:
sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)

## Similar Movies

In [64]:
def get_title_from_index(index):
    return df[df.index == index]["movie_title"].values[0]
i=0
for movie in sorted_similar_movies:
    print(get_title_from_index(movie[0]))
    i=i+1
    if i>15:
        break

spectre
jurassic world
aloft
superman returns
gods of egypt
jupiter ascending
the wolfman
the aviator
the majestic
the matrix
john carter
the manchurian candidate
snake eyes
cars
captain phillips
the ant bully
