In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

pd.set_option('display.max_columns',None)

In [2]:
movies=pd.read_csv('dataset/tmdb_5000_movies.csv')
credits=pd.read_csv('dataset/tmdb_5000_credits.csv')

In [3]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


### we can marge credit and movies dataframes either based on movie_id or title

In [5]:
df=pd.merge(movies,credits,how='inner',on='title')

In [6]:
df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [7]:
df.shape

(4809, 23)

#### Budget: Not important when recommending movies, as low-budget movies can also be good.
#### Original Language: Not keeping because 93.78% of movies are in English.
#### Original Title: Not keeping because it can be in other languages. Keeping the title instead of the original title.

# Import features
##### genres 
##### movie_id
##### keywords
##### title
##### overview
##### cast
##### crew

In [10]:
movie_df=df[['movie_id','genres','keywords','title','overview','cast','crew']]
movie_df.head(1)

Unnamed: 0,movie_id,genres,keywords,title,overview,cast,crew
0,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [11]:
movie_df.isnull().sum()

movie_id    0
genres      0
keywords    0
title       0
overview    3
cast        0
crew        0
dtype: int64

In [12]:
movie_df.dropna(axis=0,inplace=True)
movie_df.reset_index(drop=True,inplace=True)

In [13]:
movie_df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
4801    False
4802    False
4803    False
4804    False
4805    False
Length: 4806, dtype: bool

In [14]:
def feature_conversion(feature,df,key):
    for i in range(len(df[feature])):
        try:
            genres=json.loads(df[feature][i])
        except(TypeError, json.JSONDecodeError):
             genres = [] 
        genre_list=[]
        counter=0
        if genres:
            for data in genres:
                if feature=='crew' and data['job']=='Director':          # from crew taking only director name
                    genre_list.append(data[key])
                    
                elif feature == 'cast' and counter < 3:                    # only top 3 names we are taking.
                    counter +=1
                    genre_list.append(data.get(key, 'Unknown'))
                    
                elif key.lower()=='name' and feature not in ('crew','cast'):
                    genre_list.append(data[key])
                
            df[feature][i]=genre_list
            # df.at[i, feature] = genre_list

In [15]:
feature_conversion('genres',movie_df,'name')
feature_conversion('keywords',movie_df,'name')
feature_conversion('crew',movie_df,'name')
feature_conversion('cast',movie_df,'name')

In [16]:
movie_df.head()

Unnamed: 0,movie_id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",Spectre,A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",John Carter,"John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [17]:
movie_df['overview']=movie_df['overview'].apply(lambda x:str(x).split())
movie_df['cast'].apply(lambda x:[i.replace(" ","") for i in x])

0        [SamWorthington, ZoeSaldana, SigourneyWeaver]
1           [JohnnyDepp, OrlandoBloom, KeiraKnightley]
2            [DanielCraig, ChristophWaltz, LéaSeydoux]
3            [ChristianBale, MichaelCaine, GaryOldman]
4          [TaylorKitsch, LynnCollins, SamanthaMorton]
                             ...                      
4801    [CarlosGallardo, JaimedeHoyos, PeterMarquardt]
4802         [EdwardBurns, KerryBishé, MarshaDietlein]
4803           [EricMabius, KristinBooth, CrystalLowe]
4804            [DanielHenney, ElizaCoupe, BillPaxton]
4805    [DrewBarrymore, BrianHerzlinger, CoreyFeldman]
Name: cast, Length: 4806, dtype: object

In [18]:
for feature in ['cast','genres','crew','keywords']:
    movie_df[feature]=movie_df[feature].apply(lambda x:[i.replace(" ","") for i in x])

In [19]:
movie_df['tags']=movie_df['overview']+movie_df['keywords']+movie_df['genres']+movie_df['cast']+movie_df['crew']

In [20]:
df=pd.DataFrame(movie_df,columns=['movie_id','title','tags'])

In [21]:
df['tags']=df['tags'].apply(lambda x:" ".join(x))

In [22]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\40000433\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
WordNetLemmatizer().lemmatize('going')

'going'

In [24]:
stop_words=stopwords.words('english')

In [25]:
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.stem.porter import PorterStemmer

In [26]:
lemmatizer=WordNetLemmatizer()
ps=PorterStemmer()

In [27]:
for index,word in enumerate(df['tags']):
    y=[]
    words=word_tokenize(word)
    lemmatized_words= [ps.stem(word.lower()) for word in words if word not in stop_words]
    sentences=' '.join(lemmatized_words)
    df['tags'][index]=sentences
    # print(sentences)
    

In [28]:
# df['tags']=df['tags'].apply(lambda x: x.lower())

In [29]:
cv=CountVectorizer(max_features=5000,lowercase=True,stop_words='english')

In [30]:
vectors=cv.fit_transform(df['tags']).toarray()

In [31]:
feature_names =cv.get_feature_names_out()
print(feature_names)

# with open('feature.txt','w') as file:
#     for feature in feature_names:
#         file.write(feature +'\n')

['000' '007' '10' ... 'zone' 'zoo' 'zooeydeschanel']


In [32]:
# with open("feature.txt", "r") as file:
#     features = file.readlines() 


In [33]:
lst=[]
for feature in feature_names:
    lst.append(feature)

In [34]:
lst

['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '150',
 '16',
 '17',
 '17th',
 '18',
 '1890',
 '18th',
 '18thcenturi',
 '19',
 '1910',
 '1920',
 '1930',
 '1940',
 '1944',
 '1950',
 '1960',
 '1970',
 '1971',
 '1974',
 '1976',
 '1980',
 '1985',
 '1990',
 '1999',
 '19th',
 '19thcenturi',
 '20',
 '200',
 '2003',
 '2009',
 '20th',
 '21st',
 '23',
 '24',
 '25',
 '30',
 '300',
 '3d',
 '40',
 '50',
 '500',
 '60',
 '70',
 '80',
 'aaron',
 'aaroneckhart',
 'abandon',
 'abbi',
 'abduct',
 'abigailbreslin',
 'abil',
 'abl',
 'aboard',
 'aborigin',
 'absenc',
 'abus',
 'academ',
 'academi',
 'accept',
 'access',
 'accid',
 'accident',
 'acclaim',
 'accompani',
 'accomplish',
 'account',
 'accus',
 'ace',
 'achiev',
 'acquaint',
 'acquir',
 'act',
 'action',
 'actionhero',
 'activ',
 'activist',
 'actor',
 'actress',
 'actual',
 'ad',
 'adam',
 'adamsandl',
 'adamshankman',
 'adapt',
 'add',
 'addict',
 'addit',
 'adjust',
 'admir',
 'admit',
 'adolesc',
 'adopt',
 'ador',
 'adrie

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
similarity=cosine_similarity(vectors)
similarity

array([[1.        , 0.0836242 , 0.08492078, ..., 0.06362848, 0.02360961,
        0.        ],
       [0.0836242 , 1.        , 0.06154575, ..., 0.02305715, 0.        ,
        0.        ],
       [0.08492078, 0.06154575, 1.        , ..., 0.04682929, 0.        ,
        0.        ],
       ...,
       [0.06362848, 0.02305715, 0.04682929, ..., 1.        , 0.05858749,
        0.04087596],
       [0.02360961, 0.        , 0.        , ..., 0.05858749, 1.        ,
        0.04550158],
       [0.        , 0.        , 0.        , ..., 0.04087596, 0.04550158,
        1.        ]])

#### The logic is that if someone provides us with a movie name, I first check the index of that movie in our data. Then, I calculate the similarity rate of that movie with others and return the top 5 most similar movies.

In [125]:
def recommendation(movie):
    index=df[df['title'].str.strip().str.lower()==movie.strip().lower()].index[0];
    for movie_index,similarity_rate in sorted(list(enumerate(similarity[index])),reverse=True,key=lambda x: x[1])[1:6]:
        print(movie_index)
        recommend=df.iloc[movie_index]['title']
        print(recommend)
    

In [127]:
recommendation('Avatar ')

2405
Aliens
3728
Falcon Rising
1214
Aliens vs Predator: Requiem
539
Titan A.E.
507
Independence Day


In [41]:
df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in 22nd centuri , parapleg marin dispatch moon..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa , long believ dead , come bac..."
2,206647,Spectre,a cryptic messag bond ’ past send trail uncov ...
3,49026,The Dark Knight Rises,"follow death district attorney harvey dent , b..."
4,49529,John Carter,"john carter war-weari , former militari captai..."
...,...,...,...
4801,9367,El Mariachi,el mariachi want play guitar carri famili trad...
4802,72766,Newlyweds,a newlyw coupl 's honeymoon upend arriv respec...
4803,231617,"Signed, Sealed, Delivered","`` sign , seal , deliv '' introduc dedic quart..."
4804,126186,Shanghai Calling,when ambiti new york attorney sam sent shangha...


In [42]:
df[1:2]['title']

1    Pirates of the Caribbean: At World's End
Name: title, dtype: object

In [43]:
for movie_index,similarity_rate in sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x: x[1])[1:6]:
    print(df.iloc[movie_index]['title'])

Aliens
Falcon Rising
Aliens vs Predator: Requiem
Titan A.E.
Independence Day


In [44]:
list(enumerate(similarity[0]))

[(0, 1.0000000000000002),
 (1, 0.08362420100070908),
 (2, 0.08492077756084468),
 (3, 0.07252377242938948),
 (4, 0.18685673434682065),
 (5, 0.10699012312772824),
 (6, 0.037483165639491674),
 (7, 0.16349964255346808),
 (8, 0.05661385170722978),
 (9, 0.0915018021743355),
 (10, 0.10826639239215334),
 (11, 0.09443843292997713),
 (12, 0.09078412990032037),
 (13, 0.042051713353118),
 (14, 0.1316245316231641),
 (15, 0.0625407298837625),
 (16, 0.07692307692307693),
 (17, 0.14351616441832768),
 (18, 0.13903017140231755),
 (19, 0.0782510703046739),
 (20, 0.07073278556897407),
 (21, 0.11503946170861017),
 (22, 0.06280742930213278),
 (23, 0.08627959628145762),
 (24, 0.05264981264926564),
 (25, 0.049813548138671795),
 (26, 0.1465158871516513),
 (27, 0.18762218965128746),
 (28, 0.11503946170861017),
 (29, 0.063077570029677),
 (30, 0.06793662204867575),
 (31, 0.1513068831672006),
 (32, 0.08200923681047297),
 (33, 0.09245003270420485),
 (34, 0.0),
 (35, 0.08920515501750789),
 (36, 0.14846644469434814),

In [45]:
list(enumerate(similarity[0]))

[(0, 1.0000000000000002),
 (1, 0.08362420100070908),
 (2, 0.08492077756084468),
 (3, 0.07252377242938948),
 (4, 0.18685673434682065),
 (5, 0.10699012312772824),
 (6, 0.037483165639491674),
 (7, 0.16349964255346808),
 (8, 0.05661385170722978),
 (9, 0.0915018021743355),
 (10, 0.10826639239215334),
 (11, 0.09443843292997713),
 (12, 0.09078412990032037),
 (13, 0.042051713353118),
 (14, 0.1316245316231641),
 (15, 0.0625407298837625),
 (16, 0.07692307692307693),
 (17, 0.14351616441832768),
 (18, 0.13903017140231755),
 (19, 0.0782510703046739),
 (20, 0.07073278556897407),
 (21, 0.11503946170861017),
 (22, 0.06280742930213278),
 (23, 0.08627959628145762),
 (24, 0.05264981264926564),
 (25, 0.049813548138671795),
 (26, 0.1465158871516513),
 (27, 0.18762218965128746),
 (28, 0.11503946170861017),
 (29, 0.063077570029677),
 (30, 0.06793662204867575),
 (31, 0.1513068831672006),
 (32, 0.08200923681047297),
 (33, 0.09245003270420485),
 (34, 0.0),
 (35, 0.08920515501750789),
 (36, 0.14846644469434814),

In [46]:
import pickle

In [47]:
# with open('movie_recommendation.pkl','wb') as f:
#     pickle.dump(df,f)

In [48]:
df['title'].values.tolist()

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter',
 'Spider-Man 3',
 'Tangled',
 'Avengers: Age of Ultron',
 'Harry Potter and the Half-Blood Prince',
 'Batman v Superman: Dawn of Justice',
 'Superman Returns',
 'Quantum of Solace',
 "Pirates of the Caribbean: Dead Man's Chest",
 'The Lone Ranger',
 'Man of Steel',
 'The Chronicles of Narnia: Prince Caspian',
 'The Avengers',
 'Pirates of the Caribbean: On Stranger Tides',
 'Men in Black 3',
 'The Hobbit: The Battle of the Five Armies',
 'The Amazing Spider-Man',
 'Robin Hood',
 'The Hobbit: The Desolation of Smaug',
 'The Golden Compass',
 'King Kong',
 'Titanic',
 'Captain America: Civil War',
 'Battleship',
 'Jurassic World',
 'Skyfall',
 'Spider-Man 2',
 'Iron Man 3',
 'Alice in Wonderland',
 'X-Men: The Last Stand',
 'Monsters University',
 'Transformers: Revenge of the Fallen',
 'Transformers: Age of Extinction',
 'Oz: The Great and Powerful',
 'The Amazing Spider-Man 2',

In [49]:
df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in 22nd centuri , parapleg marin dispatch moon..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa , long believ dead , come bac..."
2,206647,Spectre,a cryptic messag bond ’ past send trail uncov ...
3,49026,The Dark Knight Rises,"follow death district attorney harvey dent , b..."
4,49529,John Carter,"john carter war-weari , former militari captai..."


In [103]:
import os
os.makedirs('cleaned_data', exist_ok=True)
df.to_csv('cleaned_data/movies_data.csv')

In [109]:
os.makedirs('model',exist_ok=True)

with open('model/vectors_cosine_similarity.pkl','wb') as file:
    pickle.dump(similarity,file)