# Data Collection and Cleaning

In [1]:
import numpy as np
import pandas as pd
import duckdb
import matplotlib.pyplot as plt
import seaborn as sns

### Read Data from CSV Files

In [2]:
# Load the two raw tables; both paths are relative to the notebook's working directory.
movies_data, ratings_data = (
    pd.read_csv(csv_path)
    for csv_path in ("data/movies_metadata.csv", "data/ratings_small.csv")
)

  movies_data = pd.read_csv("data/movies_metadata.csv")


In [3]:
# Size of the raw movie table as (rows, columns), followed by a preview of its first rows.
print(movies_data.shape)
movies_data.head(n=5)

(45466, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
# Size of the raw ratings table as (rows, columns), followed by a preview of its first rows.
print(ratings_data.shape)
ratings_data.head(n=5)

(100004, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


### Clean Data

In [5]:
# Keep only the rows whose status is exactly 'Released'; all other rows are discarded.
released_mask = movies_data['status'] == 'Released'
movies_data = movies_data.loc[released_mask]

In [6]:
# Discard the metadata columns that the following cleaning cells do not use.
unused_columns = [
    'adult', 'homepage', 'imdb_id', 'status', 'tagline', 'overview',
    'poster_path', 'video', 'spoken_languages', 'production_companies',
    'production_countries',
]
movies_data = movies_data.drop(columns=unused_columns)

In [7]:
# Parse release_date into datetimes. errors='coerce' turns unparseable values into
# NaT instead of raising; the dtype is printed before and after to confirm the change.
print(movies_data['release_date'].dtype)
parsed_dates = pd.to_datetime(movies_data['release_date'], format='mixed', errors='coerce')
movies_data['release_date'] = parsed_dates
print(movies_data['release_date'].dtype)

object
datetime64[ns]


In [8]:
# Coerce the object-typed budget and popularity columns to numeric dtypes.
# errors='coerce' maps non-numeric entries to NaN; each column's dtype is printed
# before and after its conversion (budget first, then popularity).
for numeric_col in ('budget', 'popularity'):
    print(movies_data[numeric_col].dtype)
    movies_data[numeric_col] = pd.to_numeric(movies_data[numeric_col], errors='coerce')
    print(movies_data[numeric_col].dtype)

object
int64
object
float64


In [9]:
# List the columns that contain at least one missing value.
has_missing = movies_data.isna().any()
print(movies_data.columns[has_missing])

Index(['belongs_to_collection', 'original_language', 'release_date',
       'runtime'],
      dtype='object')


In [10]:
# Replace the collection metadata with a boolean flag: True when a value is
# present (the movie is part of a collection), False when the field is missing.
in_collection = movies_data['belongs_to_collection'].notnull()
movies_data['belongs_to_collection'] = in_collection
print(movies_data['belongs_to_collection'].head())

0     True
1    False
2     True
3    False
4     True
Name: belongs_to_collection, dtype: bool


In [11]:
# Discard every row that still has a missing value in any column, then report the new size.
movies_data = movies_data.dropna()
print(movies_data.shape)

(44686, 13)


In [12]:
# Turn original_language into a boolean flag (True when the language code is 'en'),
# then rename the column in place so its name matches its new meaning.
movies_data['original_language'] = movies_data['original_language'].eq('en')
movies_data = movies_data.rename(columns={'original_language': 'original_english'})
print(movies_data['original_english'].head())

0    True
1    True
2    True
3    True
4    True
Name: original_english, dtype: bool


In [13]:
# Flag movies that were released during a holiday season.

# Month number of each release date, stored as an integer column
release_month = movies_data['release_date'].dt.month.astype(int)
movies_data['month'] = release_month
print(movies_data['month'].head())

# Holiday months are December-February and June-August; all other months are non-holiday
holiday_months = [12, 1, 2, 6, 7, 8]
movies_data['holiday'] = movies_data['month'].isin(holiday_months)
print(movies_data['holiday'].head())

0    10
1    12
2    12
3    12
4     2
Name: month, dtype: int64
0    False
1     True
2     True
3     True
4     True
Name: holiday, dtype: bool


In [14]:
# Collapse the ratings table to one row per movieId, keeping the mean rating and
# the number of ratings. The query refers to the `ratings_data` DataFrame by name,
# and the aggregated result replaces it under the same name.
ratings_query = (
    "SELECT movieId, AVG(rating) AS rating_average, "
    "COUNT(rating) AS rating_count FROM ratings_data GROUP BY movieId"
)
ratings_data = duckdb.sql(ratings_query).df()
ratings_data.head()

Unnamed: 0,movieId,rating_average,rating_count
0,1953,4.021739,46
1,2968,3.569767,43
2,144,3.326923,26
3,153,2.782946,129
4,222,3.931818,22


### Create Joined Dataset

In [15]:
# Inner-join the cleaned movie table with the per-movie rating summary.
# NOTE(review): this matches movies_data.id against ratings_data.movieId. If these
# files come from the Kaggle "The Movies Dataset", movieId is a MovieLens id while
# id is a TMDB id, and the two only line up through that dataset's links file —
# TODO confirm the join key; the GoldenEye preview row (2 ratings, mean 1.5) looks
# suspicious for such a widely seen title.
merge_query = (
    "SELECT * FROM movies_data INNER JOIN ratings_data "
    "ON movies_data.id = ratings_data.movieId"
)
data = duckdb.sql(merge_query).df()
print(data.shape)
# movieId equals id on every joined row, so only id is kept
data = data.drop(columns=['movieId'])
data.head()

(2814, 18)


Unnamed: 0,belongs_to_collection,budget,genres,id,original_english,original_title,popularity,release_date,revenue,runtime,title,vote_average,vote_count,month,holiday,rating_average,rating_count
0,True,58000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",710,True,GoldenEye,14.686036,1995-11-16,352194034.0,130.0,GoldenEye,6.6,1194.0,11,False,1.5,2
1,False,98000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",1408,True,Cutthroat Island,7.284477,1995-12-22,10017322.0,119.0,Cutthroat Island,5.7,137.0,12,True,3.616279,43
2,False,52000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",524,True,Casino,10.137389,1995-11-22,116112375.0,178.0,Casino,7.8,1343.0,11,False,3.555556,36
3,False,16500000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",4584,True,Sense and Sensibility,10.673167,1995-12-13,135000000.0,136.0,Sense and Sensibility,7.2,364.0,12,True,5.0,1
4,False,4000000,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",5,True,Four Rooms,9.026586,1995-12-09,4300000.0,98.0,Four Rooms,6.5,539.0,12,True,3.267857,56


In [16]:
# Write the joined dataset to disk without the DataFrame index column.
data.to_csv(path_or_buf='data/data.csv', index=False)