## Importing libraries

In [877]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Reading in datasets

In [878]:
links_df = pd.read_csv("data/links.csv")
movies_df = pd.read_csv("data/movies.csv")
ratings_df = pd.read_csv("data/ratings.csv")
tags_df = pd.read_csv("data/tags.csv")

# External Datasets
cast_df = pd.read_csv("data/cast.csv")
movie_info_df = pd.read_csv("data/movie_info.csv")

# Cleaning Movies Dataframe

In [879]:
movies_df1 = movies_df.copy()
movies_df1

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


### Separate genres and encode them

In [880]:
def clean_movies(movies_df):
    cleaned_movies_df = movies_df.copy()
    genre_mapping = {}
    def split_and_populate_mapping(genres):
        genre_list = genres.split('|')
        for genre in genre_list:
            if genre not in genre_mapping and genre != '(no genres listed)':
                genre_mapping[genre] = len(genre_mapping)
    cleaned_movies_df['genres'].apply(split_and_populate_mapping)
    
    def vectorize_generes(genres):
        one_hot_genres_arr = np.zeros(len(genre_mapping))
        for genre in genres.split('|'):
            if genre in genre_mapping:
                one_hot_genres_arr[genre_mapping[genre]] = 1
        return one_hot_genres_arr
    cleaned_movies_df['genres_vectorized'] = cleaned_movies_df['genres'].apply(vectorize_generes)
    cleaned_movies_df['genres_list'] = cleaned_movies_df['genres'].str.split('|')
#     for genre in genre_mapping.keys():
#         cleaned_movies_df[genre] = cleaned_movies_df['genres'].apply(lambda genre_str: 1 if genre in genre_str else 0)
    
    return cleaned_movies_df

In [881]:
movies_df1 = clean_movies(movies_df1)
movies_df1

Unnamed: 0,movieId,title,genres,genres_vectorized,genres_list
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[Comedy, Romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...","[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy]
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation, Comedy, Fantasy]"
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Animation, Comedy, Fantasy]"
9739,193585,Flint (2017),Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",[Drama]
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation]"


### Extracts the year in the title to a new column

In [882]:
movies_df1['year'] = movies_df1.title.str.extract(r"\(([0-9]{4})\)")
movies_df1

Unnamed: 0,movieId,title,genres,genres_vectorized,genres_list,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men (1995),Comedy|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[Comedy, Romance]",1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...","[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II (1995),Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1995
...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation, Comedy, Fantasy]",2017
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Animation, Comedy, Fantasy]",2017
9739,193585,Flint (2017),Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",[Drama],2017
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation]",2018


### Removes year in the title

In [883]:
movies_df1['title'] = movies_df1['title'].str[:-7]
movies_df1

Unnamed: 0,movieId,title,genres,genres_vectorized,genres_list,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,Adventure|Children|Fantasy,"[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,Comedy|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[Comedy, Romance]",1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...","[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1995
...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation, Comedy, Fantasy]",2017
9738,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Animation, Comedy, Fantasy]",2017
9739,193585,Flint,Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",[Drama],2017
9740,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation]",2018


In [884]:
movies_df1['year'] = pd.to_numeric(movies_df1['year'])
movies_df1

Unnamed: 0,movieId,title,genres,genres_vectorized,genres_list,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0
1,2,Jumanji,Adventure|Children|Fantasy,"[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Children, Fantasy]",1995.0
2,3,Grumpier Old Men,Comedy|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[Comedy, Romance]",1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...","[Comedy, Drama, Romance]",1995.0
4,5,Father of the Bride Part II,Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1995.0
...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation, Comedy, Fantasy]",2017.0
9738,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Animation, Comedy, Fantasy]",2017.0
9739,193585,Flint,Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",[Drama],2017.0
9740,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation]",2018.0


In [885]:
cleaned_movies_df = movies_df1
cleaned_movies_df

Unnamed: 0,movieId,title,genres,genres_vectorized,genres_list,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0
1,2,Jumanji,Adventure|Children|Fantasy,"[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Children, Fantasy]",1995.0
2,3,Grumpier Old Men,Comedy|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[Comedy, Romance]",1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...","[Comedy, Drama, Romance]",1995.0
4,5,Father of the Bride Part II,Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1995.0
...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation, Comedy, Fantasy]",2017.0
9738,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Animation, Comedy, Fantasy]",2017.0
9739,193585,Flint,Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",[Drama],2017.0
9740,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation]",2018.0


### Combines 'The' with the title

In [886]:
# movies_df2 = movies_df1.copy()
# movies_df2['title'].iloc[28]

In [887]:
# print(movies_df2['title'].iloc[28].split(','))
# len(movies_df2['title'].iloc[28].split(','))

### Removes alternative names in the title

In [888]:
# movie_title = movies_df1['title'].str.split('(')
# print(movie_title.head(30))
# movie_title[29][0].rstrip()
# movies_df1['title'] = movie_title.apply(lambda x: ','.join(map(str, x)))
# movies_df1

In [889]:
# movies_df1['title'].iloc[29]

# Merging Datasets

In [890]:
movie_info_df1 = movie_info_df[['budget', 'runtime', 'tmdbId']]
movies_links_df = cleaned_movies_df.merge(links_df)
movie_data = movies_links_df.merge(movie_info_df1)
movie_data

Unnamed: 0,movieId,title,genres,genres_vectorized,genres_list,year,imdbId,tmdbId,budget,runtime
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,114709,862.0,30000000,81
1,2,Jumanji,Adventure|Children|Fantasy,"[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Children, Fantasy]",1995.0,113497,8844.0,65000000,104
2,3,Grumpier Old Men,Comedy|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[Comedy, Romance]",1995.0,113228,15602.0,0,101
3,4,Waiting to Exhale,Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...","[Comedy, Drama, Romance]",1995.0,114885,31357.0,16000000,127
4,5,Father of the Bride Part II,Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1995.0,113041,11862.0,0,106
...,...,...,...,...,...,...,...,...,...,...
9628,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation, Comedy, Fantasy]",2017.0,5476944,432131.0,0,100
9629,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Animation, Comedy, Fantasy]",2017.0,5914996,445030.0,0,106
9630,193585,Flint,Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",[Drama],2017.0,6397426,479308.0,0,96
9631,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation]",2018.0,8391976,483455.0,0,90


In [891]:
ratings_df1 = ratings_df[['movieId', 'rating']]
avg_ratings_df = ratings_df1.groupby(by='movieId').mean()
avg_ratings_df

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.920930
2,3.431818
3,3.259615
4,2.357143
5,3.071429
...,...
193581,4.000000
193583,3.500000
193585,3.500000
193587,3.500000


In [892]:
movie_data = movie_data.merge(avg_ratings_df, on='movieId')
movie_data

Unnamed: 0,movieId,title,genres,genres_vectorized,genres_list,year,imdbId,tmdbId,budget,runtime,rating
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,114709,862.0,30000000,81,3.920930
1,2,Jumanji,Adventure|Children|Fantasy,"[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Children, Fantasy]",1995.0,113497,8844.0,65000000,104,3.431818
2,3,Grumpier Old Men,Comedy|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[Comedy, Romance]",1995.0,113228,15602.0,0,101,3.259615
3,4,Waiting to Exhale,Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...","[Comedy, Drama, Romance]",1995.0,114885,31357.0,16000000,127,2.357143
4,5,Father of the Bride Part II,Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1995.0,113041,11862.0,0,106,3.071429
...,...,...,...,...,...,...,...,...,...,...,...
9610,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation, Comedy, Fantasy]",2017.0,5476944,432131.0,0,100,4.000000
9611,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Animation, Comedy, Fantasy]",2017.0,5914996,445030.0,0,106,3.500000
9612,193585,Flint,Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",[Drama],2017.0,6397426,479308.0,0,96,3.500000
9613,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Animation]",2018.0,8391976,483455.0,0,90,3.500000


In [893]:
cast_data = cast_df[['gender', 'name', 'movieId']]
cast_data

Unnamed: 0,gender,name,movieId
0,2.0,Tom Hanks,1.0
1,2.0,Tim Allen,1.0
2,2.0,Don Rickles,1.0
3,2.0,Jim Varney,1.0
4,2.0,Wallace Shawn,1.0
...,...,...,...
266363,0.0,Alex Thaler,193609.0
266364,1.0,Sharon Bialy,193609.0
266365,1.0,Debi Manwiller,193609.0
266366,1.0,Mary Margiotta,193609.0


In [894]:
data = cast_data.merge(movie_data)
data = data[['gender', 'name', 'movieId', 'title', 'genres', 'genres_vectorized', 'genres_list', 'year', 'budget', 'runtime', 'rating']]
data

Unnamed: 0,gender,name,movieId,title,genres,genres_vectorized,genres_list,year,budget,runtime,rating
0,2.0,Tom Hanks,1.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,30000000,81,3.92093
1,2.0,Tim Allen,1.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,30000000,81,3.92093
2,2.0,Don Rickles,1.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,30000000,81,3.92093
3,2.0,Jim Varney,1.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,30000000,81,3.92093
4,2.0,Wallace Shawn,1.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,30000000,81,3.92093
...,...,...,...,...,...,...,...,...,...,...,...
265922,0.0,Alex Thaler,193609.0,Andrew Dice Clay: Dice Rules,Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1991.0,0,85,4.00000
265923,1.0,Sharon Bialy,193609.0,Andrew Dice Clay: Dice Rules,Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1991.0,0,85,4.00000
265924,1.0,Debi Manwiller,193609.0,Andrew Dice Clay: Dice Rules,Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1991.0,0,85,4.00000
265925,1.0,Mary Margiotta,193609.0,Andrew Dice Clay: Dice Rules,Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1991.0,0,85,4.00000


### Check which values are NaN and drop them

In [895]:
data[data['year'].isna()]

Unnamed: 0,gender,name,movieId,title,genres,genres_vectorized,genres_list,year,budget,runtime,rating
247102,2.0,Tye Sheridan,140956.0,Ready Pla,Action|Sci-Fi|Thriller,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Sci-Fi, Thriller]",,175000000,140,3.5
247103,1.0,Olivia Cooke,140956.0,Ready Pla,Action|Sci-Fi|Thriller,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Sci-Fi, Thriller]",,175000000,140,3.5
247104,2.0,Ben Mendelsohn,140956.0,Ready Pla,Action|Sci-Fi|Thriller,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Sci-Fi, Thriller]",,175000000,140,3.5
247105,1.0,Lena Waithe,140956.0,Ready Pla,Action|Sci-Fi|Thriller,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Sci-Fi, Thriller]",,175000000,140,3.5
247106,2.0,T. J. Miller,140956.0,Ready Pla,Action|Sci-Fi|Thriller,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[Action, Sci-Fi, Thriller]",,175000000,140,3.5
...,...,...,...,...,...,...,...,...,...,...,...
260363,2.0,Martyn Ford,171891.0,Generation,(no genres listed),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[(no genres listed)],,0,106,3.5
260364,0.0,Ahmad Ashkanani,171891.0,Generation,(no genres listed),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[(no genres listed)],,0,106,3.5
260365,0.0,Hidetada Yamagishi,171891.0,Generation,(no genres listed),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[(no genres listed)],,0,106,3.5
260366,0.0,Ken 'flex' Wheeler,171891.0,Generation,(no genres listed),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[(no genres listed)],,0,106,3.5


In [896]:
data = data.dropna()
data

Unnamed: 0,gender,name,movieId,title,genres,genres_vectorized,genres_list,year,budget,runtime,rating
0,2.0,Tom Hanks,1.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,30000000,81,3.92093
1,2.0,Tim Allen,1.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,30000000,81,3.92093
2,2.0,Don Rickles,1.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,30000000,81,3.92093
3,2.0,Jim Varney,1.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,30000000,81,3.92093
4,2.0,Wallace Shawn,1.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,30000000,81,3.92093
...,...,...,...,...,...,...,...,...,...,...,...
265922,0.0,Alex Thaler,193609.0,Andrew Dice Clay: Dice Rules,Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1991.0,0,85,4.00000
265923,1.0,Sharon Bialy,193609.0,Andrew Dice Clay: Dice Rules,Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1991.0,0,85,4.00000
265924,1.0,Debi Manwiller,193609.0,Andrew Dice Clay: Dice Rules,Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1991.0,0,85,4.00000
265925,1.0,Mary Margiotta,193609.0,Andrew Dice Clay: Dice Rules,Comedy,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[Comedy],1991.0,0,85,4.00000


In [897]:
data[data['year'].isna()]

Unnamed: 0,gender,name,movieId,title,genres,genres_vectorized,genres_list,year,budget,runtime,rating


In [898]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 265723 entries, 0 to 265926
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   gender             265723 non-null  float64
 1   name               265723 non-null  object 
 2   movieId            265723 non-null  float64
 3   title              265723 non-null  object 
 4   genres             265723 non-null  object 
 5   genres_vectorized  265723 non-null  object 
 6   genres_list        265723 non-null  object 
 7   year               265723 non-null  float64
 8   budget             265723 non-null  int64  
 9   runtime            265723 non-null  int64  
 10  rating             265723 non-null  float64
dtypes: float64(4), int64(2), object(5)
memory usage: 24.3+ MB


In [899]:
data.describe()

Unnamed: 0,gender,movieId,year,budget,runtime,rating
count,265723.0,265723.0,265723.0,265723.0,265723.0,265723.0
mean,1.254626,42614.674085,1995.620014,29089210.0,110.431637,3.256164
std,0.836009,51981.024451,18.770753,46188720.0,25.947834,0.801884
min,0.0,1.0,1902.0,0.0,0.0,0.5
25%,0.0,3094.0,1989.0,0.0,95.0,2.833333
50%,2.0,7311.0,2000.0,10000000.0,106.0,3.388889
75%,2.0,78499.0,2009.0,38000000.0,121.0,3.833333
max,3.0,193609.0,2018.0,965313000.0,900.0,5.0


# Write to CSV file

In [None]:
# data.to_csv('data/cleaned_data.csv')