In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Load the Bollywood movie datasets. Every dataset is split across three CSV
# files (1950-1989, 1990-2009, 2010-2019) that are stacked into one frame.
def _stack_csvs(*paths):
    """Read each CSV in ``paths`` and stack the frames row-wise, in the given order.

    The row labels of the individual files are kept as they are (the index is
    not reset).
    """
    return pd.concat([pd.read_csv(path) for path in paths])


bollywood = _stack_csvs(
    'datasets/bollywood_1950-1989.csv',
    'datasets/bollywood_1990-2009.csv',
    'datasets/bollywood_2010-2019.csv',
)

bollywood_meta = _stack_csvs(
    'datasets/bollywood_meta_1950-1989.csv',
    'datasets/bollywood_meta_1990-2009.csv',
    'datasets/bollywood_meta_2010-2019.csv',
)

bollywood_ratings = _stack_csvs(
    'datasets/bollywood_ratings_1950-1989.csv',
    'datasets/bollywood_ratings_1990-2009.csv',
    'datasets/bollywood_ratings_2010-2019.csv',
)

bollywood_text = _stack_csvs(
    'datasets/bollywood_text_1950-1989.csv',
    'datasets/bollywood_text_1990-2009.csv',
    'datasets/bollywood_text_2010-2019.csv',
)

In [3]:
# Combine the four datasets into a single frame, matching rows on `imdb_id`.
# Inner joins keep only the ids that appear in every dataset. Column names that
# occur on both sides of a join get pandas' default `_x` / `_y` suffixes, which
# is why later cells read the title from `title_x`.
movies_data = (
    bollywood
    .merge(bollywood_meta, on='imdb_id', how='inner')
    .merge(bollywood_ratings, on='imdb_id', how='inner')
    .merge(bollywood_text, on='imdb_id', how='inner')
)

In [4]:
# Keep one row per title: the first occurrence of each `title_x` value is
# retained and every later row carrying the same title is discarded.
# NOTE(review): distinct films that share a title are collapsed into a single
# row by this step - confirm that is intended.
movies_data = movies_data.drop_duplicates(subset='title_x', keep='first')

In [5]:
# Build the movie-details table: pick the descriptive columns from the merged
# data and expose `title_x` under the plain name `title`.
_DETAIL_COLUMNS = [
    'title_x',
    'genres',
    'actors',
    'release_date',
    'imdb_rating',
    'summary',
    'poster_path',
    'wiki_link',
]
movie_details = movies_data[_DETAIL_COLUMNS].rename(columns={'title_x': 'title'})

In [6]:
# Discard every movie that is missing a value in any of the selected columns.
# `dropna()` with its defaults removes rows (axis 0) containing at least one null.
movie_details = movie_details.dropna()

In [7]:
# Display (rows, columns) of the details table after the null rows were dropped.
movie_details.shape

(2669, 8)

In [8]:
# Preview the first five rows of the details table (five is `head`'s default).
movie_details.head()

Unnamed: 0,title,genres,actors,release_date,imdb_rating,summary,poster_path,wiki_link
16,Ram Lakhan,Action|Drama|Musical,Rakhee Gulzar|Jackie Shroff|Anil Kapoor|Dimple...,27 January 1989 (India),6.8,Sharda (Raakhee) vows vengeance when her husba...,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Ram_Lakhan
18,Tridev,Action|Crime|Drama,Naseeruddin Shah|Sunny Deol|Jackie Shroff|Madh...,7 July 1989 (India),6.3,A honest but disgraced Police Inspector's atte...,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Tridev
19,ChaalBaaz,Action|Comedy|Drama,Sridevi|Sunny Deol|Rajinikanth|Anupam Kher|Sha...,8 December 1989 (India),6.7,Twins separated at infancy are brought up diff...,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/ChaalBaaz
20,Batwara,Action|Drama,Dharmendra|Vinod Khanna|Dimple Kapadia|Poonam ...,14 July 1989 (India),6.5,In India circa after the British Rule there i...,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Batwara
23,Bhrashtachar,Action|Crime|Drama,Rekha|Mithun Chakraborty|Anupam Kher|Raza Mura...,22 November 1989 (India),4.7,Bhrastachar tells the story of a disparate gro...,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Bhrashtachar


In [9]:
# Saving prepared movie details.
# Create the output folder first: `to_csv` does not create missing parent
# directories, so the export would fail on a checkout without `prepared_data/`.
movie_details_path = Path('prepared_data/movie_details.csv')
movie_details_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame row labels out of the file.
movie_details.to_csv(movie_details_path, index=False)

In [10]:
# Build the table used to predict similar movies: title, genres, actors and
# release year, with `title_x` exposed under the plain name `title`.
_PRE_INFO_COLUMNS = ['title_x', 'genres', 'actors', 'year_of_release']
movie_pre_info = movies_data[_PRE_INFO_COLUMNS].rename(columns={'title_x': 'title'})

In [11]:
# Discard every movie that is missing a value in any of the four columns.
# `dropna()` with its defaults removes rows (axis 0) containing at least one null.
# NOTE(review): this table is filtered on fewer columns than `movie_details`,
# so it can keep titles that `movie_details` dropped - confirm consumers cope
# with titles that have no entry in the details table.
movie_pre_info = movie_pre_info.dropna()

In [12]:
# Lower-case the genre and actor strings; `title` and `year_of_release` are
# left untouched.
movie_pre_info = movie_pre_info.assign(
    genres=movie_pre_info['genres'].str.lower(),
    actors=movie_pre_info['actors'].str.lower(),
)

In [13]:
# Display (rows, columns) of the prediction table after the null rows were dropped.
movie_pre_info.shape

(4278, 4)

In [14]:
# Preview the first five rows of the prediction table (five is `head`'s default).
movie_pre_info.head()

Unnamed: 0,title,genres,actors,year_of_release
0,Aag Ka Gola,action|drama,sunny deol|dimple kapadia|archana puran singh|...,1990
16,Ram Lakhan,action|drama|musical,rakhee gulzar|jackie shroff|anil kapoor|dimple...,1989
17,Asmaan Se Ooncha,action|drama|family,jeetendra|raj babbar|anita raj|govinda|sonam|s...,1989
18,Tridev,action|crime|drama,naseeruddin shah|sunny deol|jackie shroff|madh...,1989
19,ChaalBaaz,action|comedy|drama,sridevi|sunny deol|rajinikanth|anupam kher|sha...,1989


In [15]:
# Saving prepared data.
# Create the output folder first: `to_csv` does not create missing parent
# directories, so the export would fail on a checkout without `prepared_data/`.
movie_pre_info_path = Path('prepared_data/movie_pre_info.csv')
movie_pre_info_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame row labels out of the file.
movie_pre_info.to_csv(movie_pre_info_path, index=False)