## Importing Libraries

In [49]:
import numpy as np
import pandas as pd

# to dump model
import pickle

# to convert text into vector form
from sklearn.feature_extraction.text import CountVectorizer

# To calculate cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

## Importing Dataset

In [50]:
path = "Dataset/MoviesOnStreamingPlatforms_updated.csv"
df = pd.read_csv(path)

df.shape

(16744, 16)

## Exploratory Data Analysis

In [51]:
df.head()

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime
0,1,Inception,2010,13+,8.8,87%,1.0,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,2,The Matrix,1999,18+,8.7,87%,1.0,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,3,Avengers: Infinity War,2018,13+,8.5,84%,1.0,0,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,4,Back to the Future,1985,7+,8.5,96%,1.0,0,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1.0,0,1,0,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


In [52]:
# Columns Description 
print(df.columns)

Index(['ID', 'Title', 'Year', 'Age', 'IMDb', 'Rotten Tomatoes', 'Netflix',
       'Hulu', 'Prime Video', 'Disney+', 'Type', 'Directors', 'Genres',
       'Country', 'Language', 'Runtime'],
      dtype='object')


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16744 entries, 0 to 16743
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               16744 non-null  int64  
 1   Title            16744 non-null  object 
 2   Year             16744 non-null  int64  
 3   Age              7354 non-null   object 
 4   IMDb             16173 non-null  float64
 5   Rotten Tomatoes  5158 non-null   object 
 6   Netflix          16744 non-null  float64
 7   Hulu             16744 non-null  int64  
 8   Prime Video      16744 non-null  int64  
 9   Disney+          16744 non-null  int64  
 10  Type             16744 non-null  int64  
 11  Directors        16018 non-null  object 
 12  Genres           16469 non-null  object 
 13  Country          16309 non-null  object 
 14  Language         16145 non-null  object 
 15  Runtime          16152 non-null  float64
dtypes: float64(3), int64(6), object(7)
memory usage: 2.0+ MB


In [54]:
# Important features
# ['id','Title','Generes','Director','Country','Language']

In [55]:
df2 = df[['ID','Title','Genres','Directors','Country','Language']]

In [56]:
df2.head()

Unnamed: 0,ID,Title,Genres,Directors,Country,Language
0,1,Inception,"Action,Adventure,Sci-Fi,Thriller",Christopher Nolan,"United States,United Kingdom","English,Japanese,French"
1,2,The Matrix,"Action,Sci-Fi","Lana Wachowski,Lilly Wachowski",United States,English
2,3,Avengers: Infinity War,"Action,Adventure,Sci-Fi","Anthony Russo,Joe Russo",United States,English
3,4,Back to the Future,"Adventure,Comedy,Sci-Fi",Robert Zemeckis,United States,English
4,5,"The Good, the Bad and the Ugly",Western,Sergio Leone,"Italy,Spain,West Germany",Italian


In [57]:
df2.describe()

Unnamed: 0,ID
count,16744.0
mean,8372.5
std,4833.720789
min,1.0
25%,4186.75
50%,8372.5
75%,12558.25
max,16744.0


In [58]:
# Remove Null values
print(df2.shape)
df2.isnull().sum()

(16744, 6)


ID             0
Title          0
Genres       275
Directors    726
Country      435
Language     599
dtype: int64

In [59]:
df2 = df2.dropna()
print(df2.shape)
df2.isnull().sum()

(15677, 6)


ID           0
Title        0
Genres       0
Directors    0
Country      0
Language     0
dtype: int64

In [60]:
df2.duplicated().sum()

0

In [61]:
df2.head()

Unnamed: 0,ID,Title,Genres,Directors,Country,Language
0,1,Inception,"Action,Adventure,Sci-Fi,Thriller",Christopher Nolan,"United States,United Kingdom","English,Japanese,French"
1,2,The Matrix,"Action,Sci-Fi","Lana Wachowski,Lilly Wachowski",United States,English
2,3,Avengers: Infinity War,"Action,Adventure,Sci-Fi","Anthony Russo,Joe Russo",United States,English
3,4,Back to the Future,"Adventure,Comedy,Sci-Fi",Robert Zemeckis,United States,English
4,5,"The Good, the Bad and the Ugly",Western,Sergio Leone,"Italy,Spain,West Germany",Italian


In [62]:
# Split at comma and space
def split_at_comma(string):
    return string.split(",")
def split_at_space(string):
    return string.split(" ")

In [63]:
split_at_comma('Action,Adventure,Sci-Fi,Thriller')
split_at_space("The Matrix")

['The', 'Matrix']

In [64]:
df2['Genres'] = df2['Genres'].apply(split_at_comma)
df2["Directors"] = df2['Directors'].apply(split_at_comma)
df2["Country"] = df2['Country'].apply(split_at_comma)
df2["Language"] = df2['Language'].apply(split_at_comma)

In [65]:
df2['Title_feature']=df2['Title']
df2["Title_feature"] = df2['Title_feature'].apply(split_at_space)

In [66]:
df2

Unnamed: 0,ID,Title,Genres,Directors,Country,Language,Title_feature
0,1,Inception,"[Action, Adventure, Sci-Fi, Thriller]",[Christopher Nolan],"[United States, United Kingdom]","[English, Japanese, French]",[Inception]
1,2,The Matrix,"[Action, Sci-Fi]","[Lana Wachowski, Lilly Wachowski]",[United States],[English],"[The, Matrix]"
2,3,Avengers: Infinity War,"[Action, Adventure, Sci-Fi]","[Anthony Russo, Joe Russo]",[United States],[English],"[Avengers:, Infinity, War]"
3,4,Back to the Future,"[Adventure, Comedy, Sci-Fi]",[Robert Zemeckis],[United States],[English],"[Back, to, the, Future]"
4,5,"The Good, the Bad and the Ugly",[Western],[Sergio Leone],"[Italy, Spain, West Germany]",[Italian],"[The, Good,, the, Bad, and, the, Ugly]"
...,...,...,...,...,...,...,...
16739,16740,The Ghosts of Buxley Hall,"[Comedy, Family, Fantasy, Horror]",[Bruce Bilson],[United States],[English],"[The, Ghosts, of, Buxley, Hall]"
16740,16741,The Poof Point,"[Comedy, Family, Sci-Fi]",[Neal Israel],[United States],[English],"[The, Poof, Point]"
16741,16742,Sharks of Lost Island,[Documentary],[Neil Gelinas],[United States],[English],"[Sharks, of, Lost, Island]"
16742,16743,Man Among Cheetahs,[Documentary],[Richard Slater-Jones],[United States],[English],"[Man, Among, Cheetahs]"


In [67]:
def remove_space(string_array):
    lst = []
    for i in string_array:
        lst.append(i.replace(" ", ""))
    return lst  

In [68]:
remove_space(["gghghgg hjhh","wejlfnwe kjfjwek"])

['gghghgghjhh', 'wejlfnwekjfjwek']

In [69]:
df2["Genres"] = df2['Genres'].apply(remove_space)
df2["Directors"] = df2['Directors'].apply(remove_space)
df2["Country"] = df2['Country'].apply(remove_space)
df2["Language"] = df2['Language'].apply(remove_space)

In [70]:
df2

Unnamed: 0,ID,Title,Genres,Directors,Country,Language,Title_feature
0,1,Inception,"[Action, Adventure, Sci-Fi, Thriller]",[ChristopherNolan],"[UnitedStates, UnitedKingdom]","[English, Japanese, French]",[Inception]
1,2,The Matrix,"[Action, Sci-Fi]","[LanaWachowski, LillyWachowski]",[UnitedStates],[English],"[The, Matrix]"
2,3,Avengers: Infinity War,"[Action, Adventure, Sci-Fi]","[AnthonyRusso, JoeRusso]",[UnitedStates],[English],"[Avengers:, Infinity, War]"
3,4,Back to the Future,"[Adventure, Comedy, Sci-Fi]",[RobertZemeckis],[UnitedStates],[English],"[Back, to, the, Future]"
4,5,"The Good, the Bad and the Ugly",[Western],[SergioLeone],"[Italy, Spain, WestGermany]",[Italian],"[The, Good,, the, Bad, and, the, Ugly]"
...,...,...,...,...,...,...,...
16739,16740,The Ghosts of Buxley Hall,"[Comedy, Family, Fantasy, Horror]",[BruceBilson],[UnitedStates],[English],"[The, Ghosts, of, Buxley, Hall]"
16740,16741,The Poof Point,"[Comedy, Family, Sci-Fi]",[NealIsrael],[UnitedStates],[English],"[The, Poof, Point]"
16741,16742,Sharks of Lost Island,[Documentary],[NeilGelinas],[UnitedStates],[English],"[Sharks, of, Lost, Island]"
16742,16743,Man Among Cheetahs,[Documentary],[RichardSlater-Jones],[UnitedStates],[English],"[Man, Among, Cheetahs]"


In [71]:
df2["tags"] = df2['Genres']+df2['Directors']+df2['Country'] + df2 ['Language']

In [72]:
# To add title as features of movies
#df2["tags"] = df2 ['Title_feature']+df2['Genres']+df2['Directors']+df2['Country'] + df2 ['Language']

In [73]:
df3 = df2[['ID','Title','tags']]
df3

Unnamed: 0,ID,Title,tags
0,1,Inception,"[Action, Adventure, Sci-Fi, Thriller, Christop..."
1,2,The Matrix,"[Action, Sci-Fi, LanaWachowski, LillyWachowski..."
2,3,Avengers: Infinity War,"[Action, Adventure, Sci-Fi, AnthonyRusso, JoeR..."
3,4,Back to the Future,"[Adventure, Comedy, Sci-Fi, RobertZemeckis, Un..."
4,5,"The Good, the Bad and the Ugly","[Western, SergioLeone, Italy, Spain, WestGerma..."
...,...,...,...
16739,16740,The Ghosts of Buxley Hall,"[Comedy, Family, Fantasy, Horror, BruceBilson,..."
16740,16741,The Poof Point,"[Comedy, Family, Sci-Fi, NealIsrael, UnitedSta..."
16741,16742,Sharks of Lost Island,"[Documentary, NeilGelinas, UnitedStates, English]"
16742,16743,Man Among Cheetahs,"[Documentary, RichardSlater-Jones, UnitedState..."


In [74]:
def join_list(x):
    return " ".join(x)

In [75]:
x = ['Action', 'Adventure', 'Sci-Fi', 'Thriller'] 
print(join_list(x))

Action Adventure Sci-Fi Thriller


In [76]:
df3.shape

(15677, 3)

In [77]:
df3['tags']=df3['tags'].apply(join_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['tags']=df3['tags'].apply(join_list)


In [78]:
df3['tags'][0]

'Action Adventure Sci-Fi Thriller ChristopherNolan UnitedStates UnitedKingdom English Japanese French'

## Text to Vector

In [79]:
cv = CountVectorizer(max_features=6000,stop_words='english')
vector = cv.fit_transform(df3['tags']).toarray()
vector.shape

(15677, 6000)

In [80]:
print(vector)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Cosine Similarity

In [81]:
array_vec_1 = np.array([[40,11,14,11,11]])
array_vec_2 = np.array([[40,10,14,11,11]])
cosine_similarity(array_vec_1,array_vec_2)

array([[0.99977922]])

In [82]:
similarity = cosine_similarity(vector)
similarity

array([[1.        , 0.56980288, 0.63960215, ..., 0.30151134, 0.30151134,
        0.24618298],
       [0.56980288, 1.        , 0.6681531 , ..., 0.37796447, 0.37796447,
        0.3086067 ],
       [0.63960215, 0.6681531 , 1.        , ..., 0.35355339, 0.35355339,
        0.28867513],
       ...,
       [0.30151134, 0.37796447, 0.35355339, ..., 1.        , 0.75      ,
        0.61237244],
       [0.30151134, 0.37796447, 0.35355339, ..., 0.75      , 1.        ,
        0.61237244],
       [0.24618298, 0.3086067 , 0.28867513, ..., 0.61237244, 0.61237244,
        1.        ]])

In [83]:
index = df3[df3['Title'] == "Inception"].index[0]
index

0

## Recommend Function

In [84]:
def recommend(movie):
    index = df3[df3['Title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:11]:
        print(df3.iloc[i[0]].Title)
        

In [86]:
recommend("Inception")

Tomorrow
Star Trek: Insurrection
Arcadia
Independents' Day
Super Cyclone
Godzilla
Doom: Annihilation
Captain America: The Winter Soldier
Zombie Apocalypse
10.0 Earthquake


## Dump Model

In [88]:
pickle.dump(df,open('Model/movies.pkl','wb'))
pickle.dump(similarity,open('Model/similarity.pkl','wb'))