### Dataset: The Movies Dataset

[Click here to access the dataset on Kaggle](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset)

#### Description:
This dataset contains metadata for thousands of movies, including genres, cast, crew, keywords, production companies, and user ratings.


In [9]:
import pandas as pd
import numpy as np
from ast import literal_eval

import warnings
warnings.filterwarnings('ignore')

In [39]:
# Load the three raw CSV tables from the local 'large_dataset' directory:
# movie metadata, cast/crew credits, and ratings.
movies_meta = pd.read_csv(filepath_or_buffer='large_dataset/movies_metadata.csv')
credits = pd.read_csv(filepath_or_buffer='large_dataset/credits.csv')
ratings = pd.read_csv(filepath_or_buffer='large_dataset/ratings.csv')

In [49]:
# Suffix every title with the first four characters of its release date
# (presumably the year -- verify the date format); a missing date
# contributes the literal 'NA'. A missing title stays NaN.
movies_meta['title'] = (
    movies_meta['title']
    + ' '
    + movies_meta['release_date'].fillna('NA').map(lambda date: date[:4])
)

# Genres arrive as a stringified list of dicts. Parse each value, keep the
# 'name' of every entry (anything that is not a list yields no names), and
# store the result as a single "genres : a b c" string.
movies_meta['genres'] = (
    movies_meta['genres']
    .fillna('[]')
    .map(literal_eval)
    .map(
        lambda parsed: 'genres : ' + ' '.join(
            [entry['name'] for entry in parsed] if isinstance(parsed, list) else []
        )
    )
)

In [51]:
def get_director(x):
    """Return the name of the first crew record whose job is 'Director'.

    Args:
        x: Iterable of crew records; each record is looked up with
            ``['job']`` and, on a match, ``['name']``.

    Returns:
        The ``'name'`` of the first record with ``job == 'Director'``, or
        ``np.nan`` when no record matches (including an empty iterable).
    """
    # Lazy generator: scanning stops at the first match, so records after
    # the first director are never inspected.
    director_names = (
        member['name'] for member in x if member['job'] == 'Director'
    )
    return next(director_names, np.nan)

# Parse the stringified crew records and keep only the director's name
# (NaN when get_director finds no director).
credits['director'] = credits['crew'].map(literal_eval).map(get_director)

# Parse the stringified cast records and keep the names of at most the
# first three entries; a value that does not parse to a list gets the
# placeholder list instead.
credits['cast'] = credits['cast'].map(literal_eval).map(
    lambda members: [member['name'] for member in members[:3]]
    if isinstance(members, list)
    else ['Not Available/']
)

In [53]:
# Keep only the metadata columns used from here on. The explicit .copy()
# makes this an independent frame, so the column assignment below writes to
# it directly rather than to a slice of the original frame (which pandas
# reports as SettingWithCopyWarning -- hidden here by the blanket
# warnings.filterwarnings('ignore') at the top of the notebook).
movies_meta = movies_meta[['id','title','genres','overview','vote_average','vote_count']].copy()

# Coerce both join keys to numeric; ids that cannot be parsed become NaN.
movies_meta['id'] = pd.to_numeric(movies_meta['id'], errors='coerce')
credits['id'] = pd.to_numeric(credits['id'], errors='coerce')
# Flatten list-valued cast entries into one comma-separated string;
# non-list values are passed through unchanged.
credits['cast'] = credits['cast'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# Left-join cast/director onto the metadata, then drop every row that has a
# missing value in ANY column and every exact duplicate row.
# NOTE(review): dropna() without `subset` also discards movies whose
# 'overview' or 'director' is missing -- confirm that is intended.
movies_meta = pd.merge(left = movies_meta, right = credits[['cast','director','id']], on='id', how='left').dropna().drop_duplicates()

In [18]:
# Build one descriptive text field per movie: genres, director and cast.
# 'cast' may hold either a list of names or an already-joined string (an
# earlier cell joins the list with ', '). Lists are joined with spaces,
# strings are used as they are, and anything else contributes ''. Handling
# only lists would silently blank out every string-valued cast.
movies_meta['movie_genra_crew'] = (
    movies_meta['genres']
    + '. Director: '
    + movies_meta['director']
    + '. Cast: '
    + movies_meta['cast'].apply(
        lambda x: ' '.join(x) if isinstance(x, list) else (x if isinstance(x, str) else '')
    )
)
# Any NaN operand makes the whole concatenation NaN; store '' in that case.
movies_meta['movie_genra_crew'] = movies_meta['movie_genra_crew'].fillna('').astype(str)

# Placeholder text for movies without an overview.
movies_meta['overview'] = movies_meta['overview'].fillna('No desscription').astype(str)

# Keep only ratings whose movieId is present in the metadata. The .copy()
# makes the filtered frame independent, so later column assignments on
# `ratings` do not write to a slice of the full ratings table.
ratings = ratings[ratings['movieId'].isin(movies_meta['id'])].copy()

In [None]:
# Re-number movie ids as consecutive integers starting at 0, in the order
# the ids first appear in movies_meta, and apply the same numbering to the
# ratings. The lookup raises KeyError for an id that is not in the mapping.
mid_to_idx = {
    movie_id: position
    for position, movie_id in enumerate(movies_meta['id'].unique())
}
movies_meta['id'] = movies_meta['id'].map(mid_to_idx.__getitem__)
ratings['movieId'] = ratings['movieId'].map(mid_to_idx.__getitem__)

# Same re-numbering for user ids, in order of first appearance in ratings.
userid_to_idx = {
    user_id: position
    for position, user_id in enumerate(ratings['userId'].unique())
}
ratings['userId'] = ratings['userId'].map(userid_to_idx.__getitem__)

In [74]:
# Write the processed tables to the working directory, omitting the
# DataFrame index column.
movies_meta.to_csv(path_or_buf='movies_data.csv', index=False)
ratings.to_csv(path_or_buf='ratings_data.csv', index=False)