In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import pickle
%matplotlib inline

In [3]:
!ls the-movies-dataset

credits.csv
keywords.csv
links.csv
links_small.csv
movies_metadata.csv
ratings.csv
ratings_small.csv


# Credits

In [4]:
credits = pd.read_csv('the-movies-dataset/credits.csv')

In [5]:
credits.tail()

Unnamed: 0,cast,crew,id
45471,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...",439050
45472,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",111109
45473,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",67758
45474,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506
45475,[],"[{'credit_id': '593e676c92514105b702e68e', 'de...",461257


In [78]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
cast    45476 non-null object
crew    45476 non-null object
id      45476 non-null int64
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


# Data cleaning for credits.csv

**1) checking if we have null values in our data**

In [71]:
credits.notnull().all()

cast    True
crew    True
id      True
dtype: bool

**2) check to see if we have duplicate values**

In [126]:
credits.duplicated().any()

True

In [127]:
credits.drop_duplicates(inplace=True)

In [128]:
len(credits)

45439

**3) filtering out the rows with no cast or crew information**

In [58]:
mask = np.array([(len(eval(credits.iloc[i, 0]))>0 and len(eval(credits.iloc[i,1]))>0) for i in range(len(credits))])
credits_clean = credits[mask]

In [108]:
#y = lambda i, j:eval(credits.iloc[i,0])[j]['name'] if len(eval(credits.iloc[i,0]))>0 else ''
#actor_names = [y(i,j) for i in range(len(credits)) for j in range(len(eval(credits.iloc[i,0])))]
#file = open('actor_names.pkl', 'wb')
#pickle.dump(actor_names, file)
#file.close()

# Keywords

In [11]:
keywords = pd.read_csv('the-movies-dataset/keywords.csv')

In [12]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [77]:
keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
id          46419 non-null int64
keywords    46419 non-null object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


# Data cleaning for keywords.csv

**1) looking for null values in the data**

In [72]:
keywords.notnull().all()

id          True
keywords    True
dtype: bool

In [73]:
len(keywords)

46419

**2) check to see if we have duplicate values**

In [119]:
keywords.duplicated().any()

True

In [121]:
keywords.drop_duplicates(inplace=True)

In [122]:
len(keywords)

45432

**3) filtering out the rows with no keywords**

In [123]:
mask = np.array([len(eval(keywords.iloc[i, 1])) > 0 for i in range(len(keywords))])
clean_keywords = keywords[mask]

In [124]:
len(clean_keywords)

31092

# Links

In [103]:
links = pd.read_csv('the-movies-dataset/links.csv')

In [104]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [105]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45843 entries, 0 to 45842
Data columns (total 3 columns):
movieId    45843 non-null int64
imdbId     45843 non-null int64
tmdbId     45624 non-null float64
dtypes: float64(1), int64(2)
memory usage: 1.0 MB


# Data cleaning for links.csv

**1) checking to see if we have null values in our data**

In [106]:
links.notnull().all()

movieId     True
imdbId      True
tmdbId     False
dtype: bool

**2) droping the null values (since we have a small fraction of data to lose)**

In [107]:
links_clean = links.dropna()

**3) casting values of tmdbId from float64 to int64**

In [108]:
links_clean.tmdbId = links_clean.tmdbId.astype(np.int64)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


**4) checking to see if we have duplicate values**

In [116]:
links_clean.duplicated().any()

False

# Metadata

In [131]:
metadata = pd.read_csv('the-movies-dataset/movies_metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [132]:
metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


# Data cleaning for movies-metadata.csv

**1) check if we have null values**

In [135]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

**2) droping some columns with either high number of null values of having low relevance for this project**

In [138]:
metadata_clean = metadata.drop(['belongs_to_collection','homepage','imdb_id','original_title','poster_path','tagline', 'production_companies',
              'revenue', 'budget'],axis = 1)

**3) replacing the values of 'xx' in original_language with np.nan values**

In [139]:
metadata_clean[metadata_clean.original_language=='xx'] = np.nan

**4) casting id, popularity columns values from string to numeric values**

In [142]:
metadata_clean.id = pd.to_numeric(metadata_clean.id, errors = 'coerce')
metadata_clean.popularity = pd.to_numeric(metadata_clean.popularity, errors = 'coerce')

**5) droping null values**

In [144]:
metadata_clean.dropna(inplace=True)

**6) changing the adult, status, video columns to category data type**

In [145]:
metadata_clean.adult = metadata_clean.adult.astype('category')
metadata_clean.status = metadata_clean.status.astype('category')
metadata_clean.video = metadata_clean.video.astype('category')

**7) turning the release_data column to datatime format**

In [147]:
metadata_clean.release_date = pd.to_datetime(metadata_clean.release_date, format = '%Y-%m-%d')

**8) setting and sorting the release date column as the index of the dataframe**

In [150]:
metadata_clean.set_index('release_date', inplace=True)
metadata_clean.sort_index(inplace=True)

**9) checking for duplicate values in the dataframe**

In [158]:
metadata_clean.duplicated().any()

True

In [160]:
metadata_clean.drop_duplicates(inplace=True)

# Ratings

In [153]:
ratings = pd.read_csv('the-movies-dataset/ratings.csv')

In [154]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [155]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 794.2 MB


# Data cleaning for ratings.csv

**1) checking for duplicate values**

In [162]:
ratings.duplicated().any()

False

**2) checking for null values**

In [163]:
ratings.notnull().all()

userId       True
movieId      True
rating       True
timestamp    True
dtype: bool

**3) changing the rating columns of the dataframe to category type**

In [171]:
ratings.rating.unique()
ratings.rating = ratings.rating.astype('category')