### Step 1: Import packages

In [1]:
import pandas as pd
import ast
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

### Step 2: Load raw data

In [2]:
# Load the credits and keywords datasets
credits = pd.read_csv("C:/Users/lbros/Documents/MIDS/W207/final_project/raw_data/credits.csv")
keywords = pd.read_csv("C:/Users/lbros/Documents/MIDS/W207/final_project/raw_data/keywords.csv")

### Step 3: Explore and clean both datasets

#### Clean the credits dataset
We begin with the credits dataset, containing cast and crew names for each film, in addition to the film's movie id. Both the cast and crew fields contain lists of dictionaries, with each dictionary containing the name of a cast or crew member corresponding to a particular movie role or behind-the-scenes role. We extract these names, concatenating first and last names for each individual, and create a final text string for each column.

In [3]:
# View the 1st 5 rows of credits, to view data structure
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [4]:
# Looking at the value for the first row in the cast column reveals a name key with a cast name by character
# This is our field of interest for each row - we'll extract all name key values for each row
credits.cast[0]

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

In [5]:
def process_credits(credits):
    """For each movie, extract cast member and crew member names"""
    
    # Extract the cast and crew names
    credits['cast_names'] = [" ".join(list(set([y['name'].replace(" ", "") for y in x]))
                                       ) for x in credits['cast'].apply(ast.literal_eval)]
    
    credits['crew_names'] = [" ".join(list(set([y['name'].replace(" ", "") for y in x]))
                                       ) for x in credits['crew'].apply(ast.literal_eval)]
    
    # Change the column order so that id is first, for joins
    first_column = credits.pop('id')
    credits.insert(0, 'id', first_column)
    
    # Replace empty string values with 'unknown'
    credits['cast_names'].replace(r'^\s*$', 'unknown', regex=True, inplace=True)
    credits['crew_names'].replace(r'^\s*$', 'unknown', regex=True, inplace=True)
    
    # Strip leading and trailing whitespace for the cast and crew member names columns
    credits['cast_names'] = credits['cast_names'].str.strip()
    credits['crew_names'] = credits['crew_names'].str.strip()

    # Strip punctuation from the cast and crew names if present
    p = re.compile(r'[^\w\s]+')
    credits['cast_names'] = [p.sub('', x) for x in credits['cast_names'].tolist()]
    credits['crew_names'] = [p.sub('', x) for x in credits['crew_names'].tolist()]
    
    # Cast the cast and crew names to lowercase
    credits['cast_names'] = credits['cast_names'].str.lower()
    credits['crew_names'] = credits['crew_names'].str.lower()
    
    return credits

In [6]:
# Create a new dataframe, consisting of only movie ids and cleaned cast and crew names as text strings
credits_final = process_credits(credits)[['id', 'cast_names', 'crew_names']].copy()

In [7]:
# Verify the extraction of cast and crew names
credits_final.head()

Unnamed: 0,id,cast_names,crew_names
0,862,erikvondetten johnmorris jimvarney lauriemetca...,angieglocka mickiemcgowan alecsokolow joelcohe...
1,8844,bradleypierce kirstendunst jameshandy leonardz...,kylebalda gregtaylor robertwcort chrisvanallsb...
2,15602,sophialoren darylhannah annmargret jacklemmon ...,howarddeutch markstevenjohnson jackkeller
3,31357,angelabassett wesleysnipes lorettadevine grego...,deborahschindler ronaldbass ezraswerdlow kenne...
4,11862,kieranculkin katemcgregorstewart eugenelevy ma...,nancymeyers charlesshyer elliotdavis alberthac...


#### Clean the keywords dataset

The keywords dataset also contains lists of dictionaries, with a name field that will need to be extracted. We extract the name terms, similarly to how we did for the credits dataset. However, we perform additional processing to reduce the number of word features that we will eventually have in our movies' dataset.

In [8]:
def stem_stop_text(text):
    """Remove stopwords and stem the words in each movie description"""
    
    # Create an instance of a Porter Stemmer object
    ps = PorterStemmer()
    
    # Create a list of English stopwords from the nltk corpus
    my_stop = stopwords.words('english')
    
    # Tokenize the text and remove stopwords
    tokens = [w for w in word_tokenize(text) if w not in my_stop]
    
    # Stem the tokens and rejoin
    final_text = " ".join([ps.stem(token) for token in tokens])
    
    return final_text

In [9]:
def process_keywords(keywords):
    """For the keywords dataframe, extract a list of keywords for each row"""
    
    # Extract the value belonging to each key phrase's name key for each row
    # For each row, create a single string of text to represent the name(s)
    keywords['description'] = [" ".join(list(set([y['name'] for y in x]))
                                       ) for x in keywords['keywords'].apply(ast.literal_eval)]
    
    # Replace empty string values with 'unknown'
    keywords['description'].replace(r'^\s*$', 'unknown', regex=True, inplace=True)
    
    # Strip punctuation from the descriptions
    p = re.compile(r'[^\w\s]+')
    keywords['description'] = [p.sub('', x) for x in keywords['description'].tolist()]
    
    # Set all descriptions to lowercase
    keywords['description'] = keywords['description'].str.lower()
    
    # Apply the function to stem and remove stopwords
    keywords['description'] = keywords.apply(lambda row: stem_stop_text(row['description']), axis=1)
    
    return keywords

In [10]:
# Create a new dataframe, consisting of only movie ids and cleaned descriptions as text strings
descriptions = process_keywords(keywords)[['id', 'description']].copy()

In [11]:
# View the 1st 5 rows of  this new dataframe
descriptions.head()

Unnamed: 0,id,description
0,862,toy come life new toy toy jealousi friend boy ...
1,8844,board game base children book disappear reclus...
2,15602,fish duringcreditssting best friend old men
3,31357,chick flick base novel interraci relationship ...
4,11862,midlif crisi mother daughter relationship age ...


### Step 4: Join the cleaned credits and descriptions dataframes, so that each movie has a value for description, cast_names, and crew_names

In [12]:
# Join the dataframes and view the first 5 rows
merged_credits_keywords = credits_final.merge(descriptions,on='id', how='outer')
merged_credits_keywords.head()

Unnamed: 0,id,cast_names,crew_names,description
0,862,erikvondetten johnmorris jimvarney lauriemetca...,angieglocka mickiemcgowan alecsokolow joelcohe...,toy come life new toy toy jealousi friend boy ...
1,8844,bradleypierce kirstendunst jameshandy leonardz...,kylebalda gregtaylor robertwcort chrisvanallsb...,board game base children book disappear reclus...
2,15602,sophialoren darylhannah annmargret jacklemmon ...,howarddeutch markstevenjohnson jackkeller,fish duringcreditssting best friend old men
3,31357,angelabassett wesleysnipes lorettadevine grego...,deborahschindler ronaldbass ezraswerdlow kenne...,chick flick base novel interraci relationship ...
4,11862,kieranculkin katemcgregorstewart eugenelevy ma...,nancymeyers charlesshyer elliotdavis alberthac...,midlif crisi mother daughter relationship age ...


In [13]:
# Check for missing values within the columns - there are no missing values
merged_credits_keywords.isna().sum()

id             0
cast_names     0
crew_names     0
description    0
dtype: int64

#### However, the dataframe is longer than either of the individual components, and the movie id field contains duplicates.

In [14]:
print(len(credits_final), len(descriptions), len(merged_credits_keywords))

45476 46419 46496


In [15]:
merged_credits_keywords['id'].duplicated().any()

True

In [16]:
# Both the original credits csv and the keywords csv contained duplicates
print(credits['id'].duplicated().any())
print(keywords['id'].duplicated().any())

True
True


In [17]:
# Drop the duplicated rows, across all columns
merged_credits_keywords.drop_duplicates(inplace=True)

In [18]:
# Now, the final merged dataframe is smaller than the cleaned versions, which contained duplicates
print(len(credits_final), len(descriptions), len(merged_credits_keywords))

45476 46419 45436


In [19]:
# Write the cleaned version of the merged dataframe to a csv file
merged_credits_keywords.to_csv('C:/Users/lbros/Documents/MIDS/W207/final_project/clean_data/credits_keywords.csv')