# INDIAN MOVIES RECOMMENDER SYSTEM

## Packages

In [2]:
import os
import pandas as pd
import numpy as np
import re
import ast

from nltk import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [3]:
import warnings
warnings.filterwarnings('ignore')

## Data

In [4]:
movie = pd.read_csv(r'indian_movies\bollywood_full_1950-2019.csv')
crew = pd.read_csv(r'indian_movies\bollywood_crew_1950-2019.csv')
crew_data = pd.read_csv(r'indian_movies\bollywood_crew_data_1950-2019.csv')
writer_data = pd.read_csv(r'indian_movies\bollywood_writers_data_1950-2019.csv')
ids = pd.read_csv(r'indian_movies\ids.csv')

In [3]:
movie.head(1)

Unnamed: 0,title_x,imdb_id,poster_path,wiki_link,title_y,original_title,is_adult,year_of_release,runtime,genres,imdb_rating,imdb_votes,story,summary,tagline,actors,wins_nominations,release_date
0,Uri: The Surgical Strike,tt8291224,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Uri:_The_Surgica...,Uri: The Surgical Strike,Uri: The Surgical Strike,0,2019,138,Action|Drama|War,8.4,35112.0,Divided over five chapters the film chronicle...,Indian army special forces execute a covert op...,,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,4 wins,11 January 2019 (USA)


In [5]:
writer_data.head(1)

Unnamed: 0,crew_id,name,born_year,death_year,profession,known_for
0,nm0000636,William Shakespeare,1564,1616,writer|soundtrack|miscellaneous,tt3894536|tt5377528|tt5932378|tt8632012


In [6]:
crew_data.head(1)

Unnamed: 0,crew_id,name,born_year,death_year,profession,known_for
0,nm0001408,Shekhar Kapur,1945,\N,actor|director|producer,tt0240510|tt0414055|tt0109206|tt0127536


In [7]:
crew.head(1)

Unnamed: 0,imdb_id,directors,writers
0,tt0042184,nm0025608,nm0025608|nm0324690


## Pre-Process

### Movies Data

In [4]:
movie.columns

Index(['title_x', 'imdb_id', 'poster_path', 'wiki_link', 'title_y',
       'original_title', 'is_adult', 'year_of_release', 'runtime', 'genres',
       'imdb_rating', 'imdb_votes', 'story', 'summary', 'tagline', 'actors',
       'wins_nominations', 'release_date'],
      dtype='object')

In [5]:
movie.isnull().sum()

title_x                0
imdb_id                0
poster_path          750
wiki_link              0
title_y                0
original_title         0
is_adult               0
year_of_release        0
runtime                0
genres                 0
imdb_rating           13
imdb_votes            13
story                265
summary                1
tagline             3645
actors                10
wins_nominations    2986
release_date        1281
dtype: int64

In [6]:
movie['imdb_rating']

0       8.4
1       4.1
2       6.1
3       6.0
4       7.3
       ... 
4325    6.1
4326    6.2
4327    6.8
4328    7.0
4329    7.0
Name: imdb_rating, Length: 4330, dtype: float64

In [7]:
['imdb_id', 'original_title', 'genres','summary', 'tagline','story', 'actors', 'imdb_rating', 'year_of_release']

['imdb_id',
 'original_title',
 'genres',
 'summary',
 'tagline',
 'story',
 'actors',
 'imdb_rating',
 'year_of_release']

In [5]:
movie = movie[['imdb_id', 'original_title', 'genres','summary', 'story', 'actors', 'imdb_rating', 'year_of_release']]

In [100]:
movie.head(1)

Unnamed: 0,imdb_id,original_title,genres,summary,tagline,story,actors,imdb_rating,year_of_release
0,tt8291224,Uri: The Surgical Strike,Action|Drama|War,Indian army special forces execute a covert op...,,Divided over five chapters the film chronicle...,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,8.4,2019


In [9]:
movie['summary'][0]

'Indian army special forces execute a covert operation  avenging the killing of fellow army men at their base by a terrorist group.'

In [14]:
movie['story'][0]

'Divided over five chapters  the film chronicles the events of the surgical strike conducted by the Indian military against suspected militants in Pakistan occupied Kashmir. It tells the story of the 11 tumultuous events over which the operation was carried out. Indian army special forces carry out a covert operation to avenge the killing of fellow army men at their base by a terrorist group.'

In [107]:
movie.isnull().sum()

imdb_id              0
original_title       0
genres               0
summary              1
story              265
actors              10
imdb_rating         13
year_of_release      0
dtype: int64

In [10]:
len(movie)

4330

In [6]:
movie[movie['imdb_rating'].isnull()]

Unnamed: 0,imdb_id,original_title,genres,summary,story,actors,imdb_rating,year_of_release
67,tt7743400,Jhalki,Drama,A life-altering disappearance of her 7-year ol...,A life-altering disappearance of her 7-year ol...,Boman Irani|Tannishtha Chatterjee|Divya Dutta|...,,2019
68,tt9248934,Marjaavaan,Action|Romance,Marjaavaan is an upcoming Indian action romanc...,Marjaavaan is an upcoming Indian action romanc...,Riteish Deshmukh|Sidharth Malhotra|Tara Sutari...,,2019
69,tt9172840,Motichoor Chaknachoor,Comedy|Romance,A hilarious story of a 36-year-old jobless man...,A hilarious story of a 36-year-old jobless man...,Nawazuddin Siddiqui|Athiya Shetty|Vibha Chhibb...,,2019
70,tt8876008,Keep Safe Distance,Action|Thriller,Keep Safe Distance is an upcoming Indian thril...,Keep Safe Distance is an upcoming Indian thril...,Shahbaaz Khan|Adi Irani|Vikas Anand|Kiran Kuma...,,2019
71,tt9495690,Pagalpanti,Action|Comedy,A tourist group from India sets out on a vacat...,A tourist group from India sets out on a vacat...,Kriti Kharbanda|John Abraham|Ileana D'Cruz|Ani...,,2019
72,tt10196464,Ramprasad Ki Tehrvi,Comedy|Drama,Add a Plot »,,Naseeruddin Shah|Vikrant Massey|Konkona Sen Sh...,,\N
73,tt10443846,Yeh Saali Aashiqui,Thriller,Add a Plot »,,Vardhan Puri|Jessey Lever|,,2019
74,tt8110330,Dil Bechara,Drama|Romance,A love story about two cancer patients.,A love story about two cancer patients.,Sushant Singh Rajput|Sanjana Sanghi|Sahil Vaid|,,2019
75,tt9680136,Pati Patni Aur Woh,Comedy|Romance,Chintu Tyagi is an ordinary middle class man ...,Chintu Tyagi is an ordinary middle class man ...,Kartik Aaryan|Bhumi Pednekar|Ananya Panday|Apa...,,2019
76,tt8983168,Commando 3,Action|Thriller,Commando 3 is a Hindi movie starring Vidyut Ja...,A mysterious man is on an impending mission to...,Vidyut Jammwal|Adah Sharma|Angira Dhar|Sumeet ...,,2019


In [7]:
movie['imdb_rating'] = movie['imdb_rating'].apply(lambda x: np.random.uniform(6, 7) if pd.isnull(x) else x)

In [13]:
movie.isnull().sum()

imdb_id              0
original_title       0
genres               0
summary              1
story              265
actors              10
imdb_rating          0
year_of_release      0
dtype: int64

In [8]:
movie = movie[~movie['actors'].isnull()]

In [15]:
len(movie)

4320

In [16]:
movie['year_of_release']

0       2019
1       2019
2       2019
3       2019
4       2018
        ... 
4324    1950
4325    1950
4326    1950
4327    1950
4328    1950
Name: year_of_release, Length: 4320, dtype: object

In [9]:
movie['year_of_release'].replace('\\N', np.nan, inplace=True)

In [10]:
movie.isnull().sum()

imdb_id              0
original_title       0
genres               0
summary              0
story              259
actors               0
imdb_rating          0
year_of_release      1
dtype: int64

In [11]:
movie = movie[~movie['year_of_release'].isnull()]

In [12]:
len(movie)

4319

In [21]:
movie.isnull().sum()

imdb_id              0
original_title       0
genres               0
summary              0
story              258
actors               0
imdb_rating          0
year_of_release      0
dtype: int64

In [13]:
movie['year_of_release'] = pd.to_numeric(movie['year_of_release'])

In [14]:
movie = movie[movie['year_of_release'] >= 1980]

In [15]:
len(movie)

3091

In [25]:
movie.head(1)

Unnamed: 0,imdb_id,original_title,genres,summary,story,actors,imdb_rating,year_of_release
0,tt8291224,Uri: The Surgical Strike,Action|Drama|War,Indian army special forces execute a covert op...,Divided over five chapters the film chronicle...,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,8.4,2019


In [19]:
### Clean Genres and Actors

In [16]:
def separate(text):
    new_txt = re.sub(r'\|', ',', text)
    return new_txt.split(',')

In [17]:
movie['genres'] = movie['genres'].apply(separate)

In [18]:
movie['actors'] = movie['actors'].apply(separate)

In [19]:
movie['actors'][0]

['Vicky Kaushal',
 'Paresh Rawal',
 'Mohit Raina',
 'Yami Gautam',
 'Kirti Kulhari',
 'Rajit Kapoor',
 'Ivan Rodrigues',
 'Manasi Parekh',
 'Swaroop Sampat',
 'Riva Arora',
 'Yogesh Soman',
 'Fareed Ahmed',
 'Akashdeep Arora',
 'Kallol Banerjee',
 '']

In [20]:
def cast_name(items):
    # items = ast.literal_eval(items)
    cast_list = []
    count = 0
    for i in items:
        if count != 4:
            cast_list.append(i)
            count += 1
        else:
            break
    return cast_list

In [21]:
cast_name(movie['actors'][0])

['Vicky Kaushal', 'Paresh Rawal', 'Mohit Raina', 'Yami Gautam']

In [22]:
movie['actors'] = movie['actors'].apply(cast_name)

In [33]:
movie.head(1)

Unnamed: 0,imdb_id,original_title,genres,summary,story,actors,imdb_rating,year_of_release
0,tt8291224,Uri: The Surgical Strike,"[Action, Drama, War]",Indian army special forces execute a covert op...,Divided over five chapters the film chronicle...,"[Vicky Kaushal, Paresh Rawal, Mohit Raina, Yam...",8.4,2019


In [23]:
movie['genres'][0]+movie['actors'][0]

['Action',
 'Drama',
 'War',
 'Vicky Kaushal',
 'Paresh Rawal',
 'Mohit Raina',
 'Yami Gautam']

In [24]:
movie['summary'] = movie['summary'].apply(lambda x : x.strip('.').split('.'))

In [25]:
dr = movie['summary'][206]
dr

['Add a Plot\xa0»']

In [26]:
movie = movie[~movie['story'].isnull()]

In [27]:
movie['story'] = movie['story'].apply(lambda x : x.strip('.').split('.'))

In [28]:
len(movie)

3003

In [29]:
movie.isnull().sum()

imdb_id            0
original_title     0
genres             0
summary            0
story              0
actors             0
imdb_rating        0
year_of_release    0
dtype: int64

In [41]:
movie.head(1)

Unnamed: 0,imdb_id,original_title,genres,summary,story,actors,imdb_rating,year_of_release
0,tt8291224,Uri: The Surgical Strike,"[Action, Drama, War]",[Indian army special forces execute a covert o...,[Divided over five chapters the film chronicl...,"[Vicky Kaushal, Paresh Rawal, Mohit Raina, Yam...",8.4,2019


### Crew

In [42]:
crew.head(1)

Unnamed: 0,imdb_id,directors,writers
0,tt0042184,nm0025608,nm0025608|nm0324690


In [43]:
crew_data.head(1)

Unnamed: 0,crew_id,name,born_year,death_year,profession,known_for
0,nm0001408,Shekhar Kapur,1945,\N,actor|director|producer,tt0240510|tt0414055|tt0109206|tt0127536


In [30]:
crew_data = crew_data[['crew_id', 'name']]

In [31]:
crew = pd.merge(crew, crew_data, left_on='directors', right_on='crew_id')

In [46]:
crew.head(1)

Unnamed: 0,imdb_id,directors,writers,crew_id,name
0,tt0042184,nm0025608,nm0025608|nm0324690,nm0025608,Chetan Anand


In [32]:
crew.drop('directors', axis=1, inplace=True)

In [33]:
crew.drop('crew_id', axis=1, inplace=True)

In [34]:
crew.drop('writers', axis=1, inplace=True)

In [35]:
crew.head(1)

Unnamed: 0,imdb_id,name
0,tt0042184,Chetan Anand


In [36]:
len(crew)

4423

In [37]:
crew['name'] = crew['name'].apply(lambda x : x.split(','))

In [38]:
crew['name'].tail(10)

4413                 [Jagbir Dahiya]
4414              [Debamitra Biswal]
4415            [Naman Nitin Mukesh]
4416              [Raaj Shaandilyaa]
4417                  [Jagan Shakti]
4418                [Abhishek Dixit]
4419    [Brijesh Batuknath Tripathi]
4420                [Mitali Ghoshal]
4421                  [Ravi Sadasiv]
4422                   [Aarun Nagar]
Name: name, dtype: object

In [39]:
data = pd.merge(movie, crew, on='imdb_id')

In [40]:
data.head(1)

Unnamed: 0,imdb_id,original_title,genres,summary,story,actors,imdb_rating,year_of_release,name
0,tt8291224,Uri: The Surgical Strike,"[Action, Drama, War]",[Indian army special forces execute a covert o...,[Divided over five chapters the film chronicl...,"[Vicky Kaushal, Paresh Rawal, Mohit Raina, Yam...",8.4,2019,[Aditya Dhar]


In [41]:
len(movie)

3003

In [59]:
len(crew)

4423

In [42]:
len(data)

2888

In [43]:
import requests

def get_movie_id(api_key, movie_name):
    base_url = "https://api.themoviedb.org/3/search/movie"
    params = {
        'api_key': api_key,
        'query': movie_name
    }

    try:
        response = requests.get(base_url, params=params)
        data = response.json()
        
        # Assuming the first result is the most relevant one
        movie_id = data['results'][0]['id'] if data['results'] else None
        return movie_id
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None

# Replace 'YOUR_API_KEY' with your actual TMDB API key
api_key = 'a2beae1401c2d48af273258951fe0f1c'
movie_name = 'Uri: The Surgical Strike'  # Replace with the movie name you're searching for

movie_id = get_movie_id(api_key, movie_name)

if movie_id:
    print(f"The TMDB ID for '{movie_name}' is {movie_id}")
else:
    print(f"Unable to find the TMDB ID for '{movie_name}'")


The TMDB ID for 'Uri: The Surgical Strike' is 554600


In [63]:
data['id'] = 0

In [65]:
data.head(1)

Unnamed: 0,imdb_id,original_title,genres,summary,story,actors,imdb_rating,year_of_release,name,id
0,tt8291224,Uri: The Surgical Strike,"[Action, Drama, War]",[Indian army special forces execute a covert o...,[Divided over five chapters the film chronicl...,"[Vicky Kaushal, Paresh Rawal, Mohit Raina, Yam...",8.4,2019,[Aditya Dhar],0


In [73]:
# api_key = 'a2beae1401c2d48af273258951fe0f1c'

# for i in range(len(data)):

#     movie_name = data['original_title'][i]

#     movie_id = get_movie_id(api_key, movie_name)
#     data['id'][i] = movie_id

In [45]:
ids.head()

Unnamed: 0,imdb_id,id
0,tt8291224,554600
1,tt9472208,576152
2,tt6986710,460868
3,tt8108208,569584
4,tt6028796,512188


In [47]:
data = data.merge(ids , on='imdb_id')

In [48]:
data.head()

Unnamed: 0,imdb_id,original_title,genres,summary,story,actors,imdb_rating,year_of_release,name,id
0,tt8291224,Uri: The Surgical Strike,"[Action, Drama, War]",[Indian army special forces execute a covert o...,[Divided over five chapters the film chronicl...,"[Vicky Kaushal, Paresh Rawal, Mohit Raina, Yam...",8.4,2019,[Aditya Dhar],554600
1,tt9472208,Battalion 609,[War],[The story of Battalion 609 revolves around a ...,[The story revolves around a cricket match bet...,"[Vicky Ahuja, Shoaib Ibrahim, Shrikant Kamat, ...",4.1,2019,[Brijesh Batuknath Tripathi],576152
2,tt6986710,The Accidental Prime Minister,"[Biography, Drama]",[Explores Manmohan Singh's tenure as the Prime...,[Based on the memoir by Indian policy analyst ...,"[Anupam Kher, Akshaye Khanna, Aahana Kumra, At...",6.1,2019,[Vijay Ratnakar Gutte],460868
3,tt8108208,Why Cheat India,"[Crime, Drama]",[The movie focuses on existing malpractices in...,[The movie focuses on existing malpractices in...,"[Emraan Hashmi, Shreya Dhanwanthary, Snighdade...",6.0,2019,[Soumik Sen],569584
4,tt6028796,Evening Shadows,[Drama],[Under the 'Evening Shadows' truth often play...,[While gay rights and marriage equality has be...,"[Mona Ambegaonkar, Ananth Narayan Mahadevan, D...",7.3,2018,[Sridhar Rangayan],512188


In [49]:
data = data[~data['id'].isnull()]

In [50]:
len(data)

2922

In [51]:
data['id'] = data['id'].apply(lambda x : int(x))

In [84]:
id = data[['imdb_id', 'id']]

In [86]:
id.to_csv('./indian_movies/ids.csv' , index=False)

In [52]:
data.columns

Index(['imdb_id', 'original_title', 'genres', 'summary', 'story', 'actors',
       'imdb_rating', 'year_of_release', 'name', 'id'],
      dtype='object')

In [53]:
data.columns = ['imdbid', 'title', 'genres', 'overview', 'story', 'cast', 'vote_average', 'year_of_release', 'crew', 'id']

In [54]:
data.head(1)

Unnamed: 0,imdbid,title,genres,overview,story,cast,vote_average,year_of_release,crew,id
0,tt8291224,Uri: The Surgical Strike,"[Action, Drama, War]",[Indian army special forces execute a covert o...,[Divided over five chapters the film chronicl...,"[Vicky Kaushal, Paresh Rawal, Mohit Raina, Yam...",8.4,2019,[Aditya Dhar],554600


In [55]:
data['imdbid'] = data['imdbid'].apply(lambda x: x[2:])

In [56]:
data.head(2)

Unnamed: 0,imdbid,title,genres,overview,story,cast,vote_average,year_of_release,crew,id
0,8291224,Uri: The Surgical Strike,"[Action, Drama, War]",[Indian army special forces execute a covert o...,[Divided over five chapters the film chronicl...,"[Vicky Kaushal, Paresh Rawal, Mohit Raina, Yam...",8.4,2019,[Aditya Dhar],554600
1,9472208,Battalion 609,[War],[The story of Battalion 609 revolves around a ...,[The story revolves around a cricket match bet...,"[Vicky Ahuja, Shoaib Ibrahim, Shrikant Kamat, ...",4.1,2019,[Brijesh Batuknath Tripathi],576152


In [57]:
appdata = data.copy()

In [95]:
data['genres'] = data['genres'].apply(lambda x : [y.replace(' ','') for y in x])

In [96]:
data['cast'] = data['cast'].apply(lambda x : [y.replace(' ','') for y in x])

In [97]:
data['crew'] = data['crew'].apply(lambda x : [y.replace(' ','') for y in x])

In [98]:
data.head(1)

Unnamed: 0,imdbid,title,genres,overview,story,cast,vote_average,year_of_release,crew,id
0,8291224,Uri: The Surgical Strike,"[Action, Drama, War]",[Indian army special forces execute a covert o...,[Divided over five chapters the film chronicl...,"[VickyKaushal, PareshRawal, MohitRaina, YamiGa...",8.4,2019,[AdityaDhar],554600


In [100]:
data['tokens'] = data['cast'] + data['crew'] + data['genres'] + data['overview']

In [101]:
data.head(1)

Unnamed: 0,imdbid,title,genres,overview,story,cast,vote_average,year_of_release,crew,id,tokens
0,8291224,Uri: The Surgical Strike,"[Action, Drama, War]",[Indian army special forces execute a covert o...,[Divided over five chapters the film chronicl...,"[VickyKaushal, PareshRawal, MohitRaina, YamiGa...",8.4,2019,[AdityaDhar],554600,"[VickyKaushal, PareshRawal, MohitRaina, YamiGa..."


In [102]:
data['tokens'][155]

['SunilGrover',
 'ZakirHussain',
 'DipannitaSharma',
 'RajeshSharma',
 'VishalMishra',
 'Comedy',
 'A celebrated journalist interviews one of the world\'s most dreaded terrorists  "D"']

In [103]:
data['title'][155]

'Coffee with D'

In [104]:
data['tokens'] = data['tokens'].apply(lambda x : ' '.join(x))

In [105]:
data['tokens'] = data['tokens'].apply(lambda x : x.lower())

In [106]:
data['tokens'][155]

'sunilgrover zakirhussain dipannitasharma rajeshsharma vishalmishra comedy a celebrated journalist interviews one of the world\'s most dreaded terrorists  "d"'

In [107]:
X = data['tokens']

In [108]:
X.head()

0    vickykaushal pareshrawal mohitraina yamigautam...
1    vickyahuja shoaibibrahim shrikantkamat elenaka...
2    anupamkher akshayekhanna aahanakumra atulsharm...
3    emraanhashmi shreyadhanwanthary snighdadeepcha...
4    monaambegaonkar ananthnarayanmahadevan devansh...
Name: tokens, dtype: object

In [109]:
def clean_text(token):
    
    token = re.sub(r'\[[0-9]*\]', ' ',token)
    token = re.sub(r'\s+', ' ', token)
    token = re.sub('[^a-zA-Z]', ' ', token )
    token = re.sub(r'\s+', ' ', token)
    
    return token

In [110]:
X = X.apply(clean_text)

In [111]:
word = word_tokenize
stemmer = PorterStemmer()

def stem(token):
    token = word(token)
    token = [stemmer.stem(x) for x in token]
    return ' '.join(token)

In [112]:
X = X.apply(stem)

In [113]:
tfidf = TfidfVectorizer(max_features=10000 , stop_words='english')

In [114]:
vectors = tfidf.fit_transform(X).toarray()

In [115]:
vectors.shape

(2776, 10000)

In [116]:
similarity = cosine_similarity(vectors)

In [117]:
similarity.shape

(2776, 2776)

In [118]:
def recommender(name):

    index = data[data['title'] == name].index[0]
    top10 = sorted(enumerate(similarity[index]) , reverse=True , key= lambda x : x[1])[1:11]
    
    for i in top10:
        movie = data.iloc[i[0]].title
        print(movie)

In [119]:
def your_movie(name):
    
    name = name.lower()
    new_title = [re.sub(r'[^\w\s]', '', title.lower()) for title in data['title']]

    if name in new_title:
        index = new_title.index(name)
        recommender(data.iloc[index].title)
    else:
        print(f'No movie available for "{name}"')

In [121]:
your_movie('Judwaa')

Wanted: Dead or Alive
Oonche Log
Mohabbat Ki Kasam
Aulad Ke Dushman
Dharamyudh
Pyaar Tune Kya Kiya...
Waqt Ki Deewar
Aulad
Megha
Take It Easy


In [58]:
data_dict = appdata.to_dict()

In [59]:
pickle.dump(data_dict , open('deployment/movie_dict.pkl', 'wb'))

In [124]:
pickle.dump(similarity , open('deployment/similarity.pkl', 'wb'))

# Now go to Deployment