### Import packages

In [1]:
import os

import numpy as np
import pandas as pd

### Define working directories

In [2]:
# Root folder that holds the project's raw_data/ and clean_data/ sub-folders.
# The default is the original author's local folder; set the W207_PROJECT_DIR
# environment variable to run the notebook on another machine without editing
# this cell. An unset or empty variable falls back to the default.
_project_dir = os.environ.get('W207_PROJECT_DIR') or 'C:/users/lbros/documents/mids/w207/final_project/'

# Normalise to exactly one trailing slash: later cells build file names by
# plain string concatenation (e.g. path_clean_data + 'ratings_final.csv').
_project_dir = _project_dir.rstrip('/\\') + '/'

# Folder for the untouched source files.
path_raw_data = _project_dir + 'raw_data/'
# Folder for the cleaned CSVs that this notebook loads.
path_clean_data = _project_dir + 'clean_data/'

### Load clean ratings data

In [3]:
# Read the cleaned ratings table from the clean-data folder into memory.
# NOTE(review): the columns listing shows an 'Unnamed: 0' column, which looks
# like a saved index from an earlier to_csv call; consider index_col=0 here if
# nothing downstream depends on that column (to be confirmed).
ratings_df = pd.read_csv(filepath_or_buffer=path_clean_data + 'ratings_final.csv')

In [4]:
# Show the ratings dataframe dimensions as (number of rows, number of columns)
ratings_df.shape

(24130205, 7)

In [5]:
# List the column labels of the ratings dataframe to see which ID columns are available
ratings_df.columns

Index(['Unnamed: 0', 'userId', 'movieId', 'rating', 'timestamp', 'imdbId',
       'tmdbId'],
      dtype='object')

In [6]:
# Preview the first five rows of the ratings dataframe (head() defaults to five rows)
ratings_df.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,59,4,223,1,1042668576,109445,2292.0
1,60,4,415,1,1042667925,106292,9409.0
2,61,4,648,1,1042674800,117060,954.0
3,62,4,1097,1,1042667925,83866,601.0
4,63,4,1197,1,1042667956,93779,2493.0


### Load clean movies data

In [7]:
# Read the cleaned movies table from the clean-data folder into memory.
# NOTE(review): the columns listing shows an 'Unnamed: 0' column, which looks
# like a saved index from an earlier to_csv call; consider index_col=0 here if
# nothing downstream depends on that column (to be confirmed).
movies_df = pd.read_csv(filepath_or_buffer=path_clean_data + 'movies_final.csv')

In [8]:
# Show the movies dataframe dimensions as (number of rows, number of columns)
movies_df.shape

(45464, 179)

In [9]:
# List the column labels of the movies dataframe (pandas abbreviates long listings with '...')
movies_df.columns

Index(['Unnamed: 0', 'id', 'adult', 'belongs_to_collection', 'budget',
       'originally_english', 'overview', 'popularity', 'production_companies',
       'production_countries',
       ...
       'zu', 'canceled', 'in-production', 'planned', 'post-production',
       'released', 'rumored', 'cast_names', 'crew_names', 'description'],
      dtype='object', length=179)

In [10]:
# Preview the first five rows of the movies dataframe (head() defaults to five rows)
movies_df.head()

Unnamed: 0.1,Unnamed: 0,id,adult,belongs_to_collection,budget,originally_english,overview,popularity,production_companies,production_countries,...,zu,canceled,in-production,planned,post-production,released,rumored,cast_names,crew_names,description
0,0,862,0,1,30000000,1,led woodi andi toy live happili room andi birt...,21.946943,pixar animation studios,us,...,0,0,0,0,0,1,0,johnratzenberger rleeermey donrickles erikvond...,annmrockwell jeffpratt ashbrannon mickiemcgowa...,boy next door boy friendship friend rivalri to...
1,1,8844,0,0,65000000,1,sibl judi peter discov enchant board game open...,17.015539,interscope communications teitler film tristar...,us,...,0,0,0,0,0,1,0,bonniehunt laurabellbundy jameshandy gillianba...,gregtaylor thomaseackerman williamteitler robe...,base children book board game giant insect new...
2,2,15602,0,1,0,1,famili wed reignit ancient feud nextdoor neigh...,11.7129,warner bros. lancaster gate,us,...,0,0,0,0,0,1,0,burgessmeredith annmargret darylhannah jacklem...,jackkeller howarddeutch markstevenjohnson,best friend old men duringcreditssting fish
3,3,31357,0,0,16000000,1,cheat mistreat step women hold breath wait elu...,3.859495,twentieth century fox film corporation,us,...,0,0,0,0,0,1,0,lamontjohnson angelabassett lelarochon whitney...,deborahschindler ezraswerdlow forestwhitaker c...,interraci relationship chick flick divorc sing...
4,4,11862,0,1,0,1,georg bank recov daughter wed receiv news she ...,8.387519,sandollar productions touchstone pictures,us,...,0,0,0,0,0,1,0,janeadams kieranculkin stevemartin lorialan ki...,adambernardi nancymeyers elliotdavis alansilve...,gynecologist daughter pregnanc midlif crisi ba...


### Evaluate ability to join dataframes based on movieId

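The **movieId** column in ratings_df appears to be a **different key** from the **id** column in movies_df. The cells below compare the unique values of the two columns to measure how much they overlap. Note that ratings_df also carries **imdbId** and **tmdbId** columns, which may be better candidates for the join (to be confirmed).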

In [15]:
# Distinct movieId values that occur in the ratings table, in order of first appearance.
rated_movies = ratings_df['movieId'].unique()
print('There are {} unique movies rated in the ratings_df.'.format(rated_movies.size))

There are 44975 unique movies rated in the ratings_df.


In [16]:
# Distinct id values that occur in the movies table, in order of first appearance.
repr_movies = movies_df['id'].unique()
print('There are {} unique movies represented in the movies_df.'.format(repr_movies.size))

There are 45430 unique movies represented in the movies_df.


In [27]:
# Rated IDs with no matching id in movies_df. invert=True makes np.isin return
# the "not a member" mask directly; element order of rated_movies is kept.
rated_not_in_repr = rated_movies[np.isin(rated_movies, repr_movies, invert=True)]
print('There are {} movies in the ratings_df not in the movies_df.'.format(rated_not_in_repr.size))
rated_not_in_repr

There are 37419 movies in the ratings_df not in the movies_df.


array([  1097,   1197,   1210, ..., 165649, 171051, 171221], dtype=int64)

In [28]:
# movies_df IDs with no matching movieId in ratings_df. invert=True makes np.isin
# return the "not a member" mask directly; element order of repr_movies is kept.
repr_not_in_rated = repr_movies[np.isin(repr_movies, rated_movies, invert=True)]
print('There are {} movies in the movies_df not in the ratings_df.'.format(repr_not_in_rated.size))
repr_not_in_rated

There are 37874 movies in the movies_df not in the ratings_df.


array([ 15602,  31357,  11862, ...,  67758, 227506, 461257], dtype=int64)