In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval

In [2]:
# Load the raw credits table. The path is relative, so it is resolved against
# the working directory the notebook is run from.
# The file carries a leftover "Unnamed: 0" column, which is dropped later in
# this notebook together with the raw cast and crew columns.
data = pd.read_csv("uncleaned data/credits.csv")

In [3]:
# (number of rows, number of columns) of the table as loaded
data.shape

(1000, 5)

The data contains the cast and crew members of 1,000 movies. The cast and crew columns are in a JSON format that contains the credit_id, name and other information about the role each cast and crew member played in the creation of the movie.    
The data will be cleaned to allow easier and faster manipulation, which involves restructuring existing variables and creating new variables.

In [4]:
# Preview the first five rows; cast and crew are still raw strings at this point
data.head()

Unnamed: 0.1,Unnamed: 0,movie_id,title,cast,crew
0,0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [5]:
# The cast and crew columns are stored as strings; literal_eval parses each
# one into its corresponding Python object (a list of dictionaries).
# NOTE(review): literal_eval only understands Python literals, so a JSON
# null/true/false inside a value would raise. json.loads may be the safer
# parser -- confirm against the full dataset.

features = ["cast", "crew"]

for feature in features:
    # Only parse values that are still strings. Re-running this cell after the
    # columns have already been converted is then a no-op instead of raising
    # "malformed node or string" on the already-parsed lists.
    data[feature] = data[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

In [6]:
# The first five crew members of the first movie (Avatar) in the data.
data.loc[0, "crew"][:5]

[{'credit_id': '52fe48009251416c750aca23',
  'department': 'Editing',
  'gender': 0,
  'id': 1721,
  'job': 'Editor',
  'name': 'Stephen E. Rivkin'},
 {'credit_id': '539c47ecc3a36810e3001f87',
  'department': 'Art',
  'gender': 2,
  'id': 496,
  'job': 'Production Design',
  'name': 'Rick Carter'},
 {'credit_id': '54491c89c3a3680fb4001cf7',
  'department': 'Sound',
  'gender': 0,
  'id': 900,
  'job': 'Sound Designer',
  'name': 'Christopher Boyes'},
 {'credit_id': '54491cb70e0a267480001bd0',
  'department': 'Sound',
  'gender': 0,
  'id': 900,
  'job': 'Supervising Sound Editor',
  'name': 'Christopher Boyes'},
 {'credit_id': '539c4a4cc3a36810c9002101',
  'department': 'Production',
  'gender': 1,
  'id': 1262,
  'job': 'Casting',
  'name': 'Mali Finn'}]

In [7]:
# From the crew data of the first movie, all the members of the Sound department.
# A plain loop is used so the cell prints each member without also producing
# a throwaway list of None values as the cell result.

for member in data["crew"][0]:
    if member["department"] == "Sound":
        print(member)

{'credit_id': '54491c89c3a3680fb4001cf7', 'department': 'Sound', 'gender': 0, 'id': 900, 'job': 'Sound Designer', 'name': 'Christopher Boyes'}
{'credit_id': '54491cb70e0a267480001bd0', 'department': 'Sound', 'gender': 0, 'id': 900, 'job': 'Supervising Sound Editor', 'name': 'Christopher Boyes'}
{'credit_id': '5544ee3b925141499f0008fc', 'department': 'Sound', 'gender': 2, 'id': 1729, 'job': 'Original Music Composer', 'name': 'James Horner'}
{'credit_id': '5495a0fac3a3686ae9004468', 'department': 'Sound', 'gender': 0, 'id': 6883, 'job': 'Music Editor', 'name': 'Dick Bernstein'}
{'credit_id': '54959706c3a3686af3003e81', 'department': 'Sound', 'gender': 0, 'id': 8159, 'job': 'Sound Effects Editor', 'name': 'Shannon Mills'}
{'credit_id': '54491d58c3a3680fb1001ccb', 'department': 'Sound', 'gender': 0, 'id': 8160, 'job': 'Foley', 'name': 'Dennie Thorpe'}
{'credit_id': '54491d6cc3a3680fa5001b2c', 'department': 'Sound', 'gender': 0, 'id': 8163, 'job': 'Foley', 'name': 'Jana Vance'}
{'credit_id'

[None, None, None, None, None, None, None, None, None, None, None, None, None]

### Extracting crew members from the crew data

In [8]:
# testing a function to get the directors of each movie

def get_director(x):
    """Return the name of the first crew entry whose job is "Director".

    x is an iterable of crew dictionaries, each providing "job" and "name".
    np.nan is returned when no entry has the job "Director".
    """
    # Lazy generator: entries are only inspected up to the first match.
    director_names = (member["name"] for member in x if member["job"] == "Director")
    return next(director_names, np.nan)

# Try the helper on every movie's crew list; the result is only displayed here, not stored.
data["crew"].apply(get_director)

0            James Cameron
1           Gore Verbinski
2               Sam Mendes
3        Christopher Nolan
4           Andrew Stanton
              ...         
995    Richard LaGravenese
996           Danny DeVito
997            Ben Stiller
998        George Armitage
999           Mike Nichols
Name: crew, Length: 1000, dtype: object

In [9]:
# generalizing the get_director function.

def get_json_value(x, cond_key, cond_value, res_key):
    """Return entry[res_key] for the first entry of x where entry[cond_key] == cond_value.

    x is an iterable of dictionaries. np.nan is returned when no entry matches.
    """
    # Lazy generator: entries are only inspected up to the first match.
    matching_values = (entry[res_key] for entry in x if entry[cond_key] == cond_value)
    return next(matching_values, np.nan)


def asign_variable(var, cond_key, value, res_key):
    """Apply get_json_value to every row of the notebook-level `data[var]` column.

    Parameters
    ----------
    var : name of the column of `data` holding a list of dictionaries per row.
    cond_key, value : an entry matches when entry[cond_key] == value.
    res_key : key whose value is taken from the first matching entry.

    Returns
    -------
    list with one element per row of `data`, in row order: the extracted
    value, or whatever get_json_value returns when no entry matches.
    """
    # Iterate over the column values directly rather than looking rows up as
    # data[var][i]: that lookup is label-based and fails when the frame's
    # index is not the default 0..n-1 range.
    return [get_json_value(entries, cond_key, value, res_key) for entries in data[var]]

``` asign_variable("crew", "job", "Director", "name") ```

This call can be read as: in the `crew` column, find the first entry whose `job` is `Director` and extract its `name`.

In [10]:
# Extracting the director, producer, screenplay writer and music composer of each movie.
# Keys are the new column names; values are the crew "job" titles to look for.
crew_jobs = {
    "director": "Director",
    "producer": "Producer",
    "screenplay": "Screenplay",
    "music_composer": "Original Music Composer",
}

for column, job in crew_jobs.items():
    data[column] = asign_variable("crew", "job", job, "name")

data.head()

Unnamed: 0.1,Unnamed: 0,movie_id,title,cast,crew,director,producer,screenplay,music_composer
0,0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...",James Cameron,James Cameron,James Cameron,James Horner
1,1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...",Gore Verbinski,Jerry Bruckheimer,Ted Elliott,Hans Zimmer
2,2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de...",Sam Mendes,Barbara Broccoli,John Logan,Thomas Newman
3,3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...",Christopher Nolan,Charles Roven,Christopher Nolan,Hans Zimmer
4,4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...",Andrew Stanton,Colin Wilson,Andrew Stanton,


### Replacing the cast data with a variable containing a list of the first 5 cast members for each movie.

In [11]:
def get_list(x, limit=5):
    """Return the "name" of the first `limit` entries of x.

    Parameters
    ----------
    x : a list of dictionaries that each have a "name" key. Anything that is
        not a list (missing or malformed data) yields an empty list.
    limit : maximum number of names to return (default 5). None returns all.

    Returns
    -------
    list of names, shorter than `limit` when x has fewer entries.
    """
    if not isinstance(x, list):
        # Return empty list in case of missing/malformed data
        return []

    # Slicing already copes with lists shorter than `limit`, so no explicit
    # length check is needed.
    return [member["name"] for member in x[:limit]]

In [12]:
# Build the top_cast column in one pass: take the leading cast names of each
# movie and join them into a single comma-separated string.
data["top_cast"] = [", ".join(get_list(cast_members)) for cast_members in data["cast"]]

data.head()

Unnamed: 0.1,Unnamed: 0,movie_id,title,cast,crew,director,producer,screenplay,music_composer,top_cast
0,0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...",James Cameron,James Cameron,James Cameron,James Horner,"Sam Worthington, Zoe Saldana, Sigourney Weaver..."
1,1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...",Gore Verbinski,Jerry Bruckheimer,Ted Elliott,Hans Zimmer,"Johnny Depp, Orlando Bloom, Keira Knightley, S..."
2,2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de...",Sam Mendes,Barbara Broccoli,John Logan,Thomas Newman,"Daniel Craig, Christoph Waltz, Léa Seydoux, Ra..."
3,3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...",Christopher Nolan,Charles Roven,Christopher Nolan,Hans Zimmer,"Christian Bale, Michael Caine, Gary Oldman, An..."
4,4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...",Andrew Stanton,Colin Wilson,Andrew Stanton,,"Taylor Kitsch, Lynn Collins, Samantha Morton, ..."


In [13]:
# Dropping the leftover "Unnamed: 0" column together with the raw cast and crew columns

columns_to_drop = ["Unnamed: 0", "cast", "crew"]
data = data.drop(columns=columns_to_drop)

data.head(10)

Unnamed: 0,movie_id,title,director,producer,screenplay,music_composer,top_cast
0,19995,Avatar,James Cameron,James Cameron,James Cameron,James Horner,"Sam Worthington, Zoe Saldana, Sigourney Weaver..."
1,285,Pirates of the Caribbean: At World's End,Gore Verbinski,Jerry Bruckheimer,Ted Elliott,Hans Zimmer,"Johnny Depp, Orlando Bloom, Keira Knightley, S..."
2,206647,Spectre,Sam Mendes,Barbara Broccoli,John Logan,Thomas Newman,"Daniel Craig, Christoph Waltz, Léa Seydoux, Ra..."
3,49026,The Dark Knight Rises,Christopher Nolan,Charles Roven,Christopher Nolan,Hans Zimmer,"Christian Bale, Michael Caine, Gary Oldman, An..."
4,49529,John Carter,Andrew Stanton,Colin Wilson,Andrew Stanton,,"Taylor Kitsch, Lynn Collins, Samantha Morton, ..."
5,559,Spider-Man 3,Sam Raimi,Laura Ziskin,Sam Raimi,Christopher Young,"Tobey Maguire, Kirsten Dunst, James Franco, Th..."
6,38757,Tangled,Byron Howard,Roy Conli,Dan Fogelman,Alan Menken,"Zachary Levi, Mandy Moore, Donna Murphy, Ron P..."
7,99861,Avengers: Age of Ultron,Joss Whedon,Kevin Feige,,Danny Elfman,"Robert Downey Jr., Chris Hemsworth, Mark Ruffa..."
8,767,Harry Potter and the Half-Blood Prince,David Yates,David Heyman,Steve Kloves,Nicholas Hooper,"Daniel Radcliffe, Rupert Grint, Emma Watson, T..."
9,209112,Batman v Superman: Dawn of Justice,Zack Snyder,Charles Roven,David S. Goyer,Hans Zimmer,"Ben Affleck, Henry Cavill, Gal Gadot, Amy Adam..."


In [14]:
# Saving changes
# index=False keeps the row index out of the file. Writing it would add an
# unnamed index column that shows up as "Unnamed: 0" when the CSV is read
# back, like the one this notebook had to drop from its input.

data.to_csv("movie credits.csv", index=False)