In [1]:
from pathlib import Path

import pandas as pd
# Do not truncate cell text in displayed frames (None = unlimited column width).
pd.options.display.max_colwidth = None

In [2]:
# Load the Bollywood movie datasets covering 1950-2019.
# Each table is stored as three CSV files split by release period; the parts
# are read in chronological order and stacked into one frame per table.
bollywood = pd.concat([
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/bollywood_1950-1989.csv',
        'datasets/bollywood_1990-2009.csv',
        'datasets/bollywood_2010-2019.csv',
    )
])

bollywood_meta = pd.concat([
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/bollywood_meta_1950-1989.csv',
        'datasets/bollywood_meta_1990-2009.csv',
        'datasets/bollywood_meta_2010-2019.csv',
    )
])

bollywood_ratings = pd.concat([
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/bollywood_ratings_1950-1989.csv',
        'datasets/bollywood_ratings_1990-2009.csv',
        'datasets/bollywood_ratings_2010-2019.csv',
    )
])

bollywood_text = pd.concat([
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/bollywood_text_1950-1989.csv',
        'datasets/bollywood_text_1990-2009.csv',
        'datasets/bollywood_text_2010-2019.csv',
    )
])

In [3]:
# Combine the four tables into a single frame keyed on imdb_id.
# Inner joins: only movies whose imdb_id appears in every table are kept.
movies_data = (
    bollywood
    .merge(bollywood_meta, on='imdb_id', how='inner')
    .merge(bollywood_ratings, on='imdb_id', how='inner')
    .merge(bollywood_text, on='imdb_id', how='inner')
)

In [4]:
# Keep only the first record for each title; later rows with the same
# title_x value are discarded.
# NOTE(review): distinct films that share a title (e.g. remakes) collapse to
# a single row here - confirm that is intended.
movies_data = movies_data.drop_duplicates(subset=["title_x"], keep='first')

In [5]:
# Select the columns needed to present information about a movie.
# title_x is exposed under the plain name 'title'; all other names are kept.
movie_details = movies_data[[
    'title_x',
    'genres',
    'actors',
    'release_date',
    'imdb_rating',
    'summary',
    'poster_path',
    'wiki_link',
]].rename(columns={'title_x': 'title'})

In [6]:
# Discard every row that has a missing value in any column
# (dropna defaults: row-wise, how='any').
movie_details = movie_details.dropna()

In [7]:
# Tidy the pipe-delimited list columns for display ('a|b|' -> 'a | b').
#
# Actor strings end with a trailing '|' delimiter. Strip only that character
# rather than blindly slicing off the last one: a value without the trailing
# pipe keeps its final letter, and re-running this cell does not eat into
# the names.
movie_details['actors'] = movie_details['actors'].str.rstrip('|')

# regex=False is required: '|' is the regex alternation operator. Since
# pandas 2.0 a single-character pattern is no longer treated as a literal
# when regex=True, so the pattern would match the empty string at every
# position and insert ' | ' between all characters.
movie_details['genres'] = movie_details['genres'].str.replace('|', ' | ', regex=False)
movie_details['actors'] = movie_details['actors'].str.replace('|', ' | ', regex=False)

In [8]:
# Sanity check: (rows, columns) of the cleaned movie details frame.
movie_details.shape

(2669, 8)

In [9]:
# Preview the first five cleaned rows (head() defaults to n=5).
movie_details.head()

Unnamed: 0,title,genres,actors,release_date,imdb_rating,summary,poster_path,wiki_link
16,Ram Lakhan,Action | Drama | Musical,Rakhee Gulzar | Jackie Shroff | Anil Kapoor | Dimple Kapadia | Madhuri Dixit | Gulshan Grover | Amrish Puri | Paresh Rawal | Anupam Kher | Saeed Jaffrey | Raza Murad | Dalip Tahil | Annu Kapoor | Satish Kaushik,27 January 1989 (India),6.8,Sharda (Raakhee) vows vengeance when her husband is murdered by his two evil cousins Bhishamber (Amrish Puri) and Bhanu (Paresh Rawal) and she and her two young sons are thrown out on the...,https://upload.wikimedia.org/wikipedia/en/thumb/d/d4/Ram_Lakhan_poster.jpg/220px-Ram_Lakhan_poster.jpg,https://en.wikipedia.org/wiki/Ram_Lakhan
18,Tridev,Action | Crime | Drama,Naseeruddin Shah | Sunny Deol | Jackie Shroff | Madhuri Dixit | Sonam | Sangeeta Bijlani | Anupam Kher | Amrish Puri | Raza Murad | Dalip Tahil | Sharat Saxena | Tej Sapru | Dan Dhanoa | Rajesh Vivek,7 July 1989 (India),6.3,A honest but disgraced Police Inspector's attempts to clear his name pits him against terrorists and the brother of his sweetheart.,https://upload.wikimedia.org/wikipedia/en/thumb/4/49/Tridevfilm.jpg/220px-Tridevfilm.jpg,https://en.wikipedia.org/wiki/Tridev
19,ChaalBaaz,Action | Comedy | Drama,Sridevi | Sunny Deol | Rajinikanth | Anupam Kher | Shakti Kapoor | Annu Kapoor | Saeed Jaffrey | Aruna Irani | Rohini Hattangadi | Aftab Shivdasani,8 December 1989 (India),6.7,Twins separated at infancy are brought up differently. One weak one strong. One day they end up at each others house. Their life is not the same anymore.,https://upload.wikimedia.org/wikipedia/en/thumb/e/e2/ChaalBaaz.jpg/220px-ChaalBaaz.jpg,https://en.wikipedia.org/wiki/ChaalBaaz
20,Batwara,Action | Drama,Dharmendra | Vinod Khanna | Dimple Kapadia | Poonam Dhillon | Amrita Singh | Amrish Puri | Shammi Kapoor | Vijayendra Ghatge | Neena Gupta | Asha Parekh | Kulbhushan Kharbanda,14 July 1989 (India),6.5,In India circa after the British Rule there is a princely family consisting of Bade Thakur (Shammi Kapoor) his sons Devan (Vijayendra Ghatge) Vikram Singh (Vinod Khanna) and a third son...,https://upload.wikimedia.org/wikipedia/en/thumb/6/6e/Batwara_poster.jpg/220px-Batwara_poster.jpg,https://en.wikipedia.org/wiki/Batwara
23,Bhrashtachar,Action | Crime | Drama,Rekha | Mithun Chakraborty | Anupam Kher | Raza Murad | Anjana Mumtaz | Abhinav Chaturvedi | Shilpa Shirodkar | Rajinikanth | Padma Khanna | Sudhir Pandey | Vinod Nagpal | Girja Shankar | Vikas Anand | Bharti Achrekar,22 November 1989 (India),4.7,Bhrastachar tells the story of a disparate group of characters whose lives overlap- Bhavani a journalist fighting injustice and corruption; Janki a widow who becomes the mistress of local ...,https://upload.wikimedia.org/wikipedia/en/thumb/4/49/Bhrashtachar.jpg/220px-Bhrashtachar.jpg,https://en.wikipedia.org/wiki/Bhrashtachar


In [10]:
# Saving prepared movie details.
# to_csv does not create missing directories, so make sure the output folder
# exists first; otherwise a run on a fresh checkout fails here with OSError.
Path('prepared_data').mkdir(parents=True, exist_ok=True)
movie_details.to_csv('prepared_data/movie_details.csv', index=False)

In [11]:
# Select the columns used as input for predicting similar movies.
# title_x is exposed under the plain name 'title'; all other names are kept.
movie_pre_info = movies_data[[
    'title_x',
    'genres',
    'actors',
    'imdb_rating',
    'year_of_release',
]].rename(columns={'title_x': 'title'})

In [12]:
# Discard every row that has a missing value in any column
# (dropna defaults: row-wise, how='any').
movie_pre_info = movie_pre_info.dropna()

In [13]:
# Normalise genre and actor text to lower case.
# NOTE(review): actor strings keep their trailing '|' in this frame (it is
# stripped only in movie_details) - confirm the downstream consumer expects it.
movie_pre_info = movie_pre_info.assign(
    genres=lambda frame: frame['genres'].str.lower(),
    actors=lambda frame: frame['actors'].str.lower(),
)

In [14]:
# Sanity check: (rows, columns) of the similarity-input frame after dropna.
movie_pre_info.shape

(4265, 5)

In [15]:
# Preview the first five prepared rows (head() defaults to n=5).
movie_pre_info.head()

Unnamed: 0,title,genres,actors,imdb_rating,year_of_release
0,Aag Ka Gola,action|drama,sunny deol|dimple kapadia|archana puran singh|shakti kapoor|prem chopra|om shivpuri|raza murad|anjana mumtaz|mahesh anand|sharat saxena|atlee brar|gurbachan singh|bob christo|jagdish raj|,5.4,1990
16,Ram Lakhan,action|drama|musical,rakhee gulzar|jackie shroff|anil kapoor|dimple kapadia|madhuri dixit|gulshan grover|amrish puri|paresh rawal|anupam kher|saeed jaffrey|raza murad|dalip tahil|annu kapoor|satish kaushik|,6.8,1989
17,Asmaan Se Ooncha,action|drama|family,jeetendra|raj babbar|anita raj|govinda|sonam|sujit kumar|sadashiv amrapurkar|ajitesh|dev kumar|mac mohan|jagdish raj|jaya mathur|coca cola|k.k. raj|,5.0,1989
18,Tridev,action|crime|drama,naseeruddin shah|sunny deol|jackie shroff|madhuri dixit|sonam|sangeeta bijlani|anupam kher|amrish puri|raza murad|dalip tahil|sharat saxena|tej sapru|dan dhanoa|rajesh vivek|,6.3,1989
19,ChaalBaaz,action|comedy|drama,sridevi|sunny deol|rajinikanth|anupam kher|shakti kapoor|annu kapoor|saeed jaffrey|aruna irani|rohini hattangadi|aftab shivdasani|,6.7,1989


In [16]:
# Saving prepared data.
# to_csv does not create missing directories, so make sure the output folder
# exists first; otherwise a run on a fresh checkout fails here with OSError.
Path('prepared_data').mkdir(parents=True, exist_ok=True)
movie_pre_info.to_csv('prepared_data/movie_pre_info.csv', index=False)