In [1]:
import pandas as pd
import numpy as np
import ast
from ast import literal_eval
from sklearn.feature_extraction.text import *
from sklearn.preprocessing import MultiLabelBinarizer
import re
from datetime import datetime
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# Load the movies dataframe
movies_df = pd.read_csv("C:/Users/britt/Desktop/W207/final_project/data/movies_metadata.csv", 
                        low_memory=False, parse_dates=True)

# Read in the cleaned merged credits and keywords file
credits_keywords = pd.read_csv("C:/Users/britt/Desktop/W207/final_project/data/credits_keywords.csv")

In [3]:
# View the first 5 rows
movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
# Create a copy of the movies dataframe, keeping only the desired columns
movies = movies_df[['imdb_id', 'id', 'adult', 'belongs_to_collection', 'budget', 'genres', 
                    'original_language', 'original_title', 'overview', 'popularity', 
                    'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 
                    'spoken_languages', 'status', 'tagline', 'title', 'video', 
                    'vote_average', 'vote_count']].copy(deep=True)

#### Define a function to stem words and remove stopwords in text fields overview and tagline

In [5]:
def stem_stop_text(text):
    """Remove stopwords and stem the words in each movie description"""
    
    # Create an instance of a Porter Stemmer object
    ps = PorterStemmer()
    
    # Create a list of English stopwords from the nltk corpus
    my_stop = stopwords.words('english')
    
    if not text:
        return 'unknown'
    
    else:
        # Tokenize the text and remove stopwords
        tokens = [w for w in word_tokenize(text) if w not in my_stop]
    
        # Stem the tokens and rejoin
        final_text = " ".join([ps.stem(token) for token in tokens])
    
    return final_text

#### Clean the adults field

In [6]:
# The adult field should only contain boolean values
movies.adult.unique()

array(['False', 'True', ' - Written by Ørnås',
       ' Rune Balot goes to a casino connected to the October corporation to try to wrap up her case once and for all.',
       ' Avalanche Sharks tells the story of a bikini contest that turns into a horrifying affair when it is hit by a shark avalanche.'],
      dtype=object)

In [7]:
# An inspection of the non-boolean rows reveals that the ids appear to be dates rather than numbers and there are no ratings
# We'll drop these rows from the dataset
movies.loc[(movies.adult != 'False') & (movies.adult != 'True')]

Unnamed: 0,imdb_id,id,adult,belongs_to_collection,budget,genres,original_language,original_title,overview,popularity,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19730,0,1997-08-20,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...",104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,...,1,,,,,,,,,
29503,0,2012-09-29,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...",68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,...,12,,,,,,,,,
35587,0,2014-01-01,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...",82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Beware Of Frost Bites,...,22,,,,,,,,,


In [8]:
def clean_adult(movies):
    """Drop rows where the adult column does not have a boolean value"""
    
    # Drop rows without a boolean value for the adults field
    movies = movies[movies['adult'].isin(['True', 'False'])]
    
    # Convert the adult column to one-hot encoding
    movies['adult'].replace({'False': 0, 'True': 1}, inplace=True)
    
    return movies

In [9]:
# Apply the cleaning function to filter the movies dataframe
movies = clean_adult(movies)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


#### Clean belongs_to_collection
Since we removed nonsensical values by dropping the three rows where movies had floating point values for this field (these movies had no ratings), we only need to address Nan values and the dictionaries with collection names.

In [10]:
# The majority of movies don't belong to a collection - use one/hot encoding to indicate yes/no to in collection
print(sum(~movies_df['belongs_to_collection'].isna()))
print(sum(movies_df['belongs_to_collection'].isna()))

4494
40972


In [11]:
def clean_collection(movies):
    '''Convert the belongs to collection column with boolean values'''
    
    # Repace Nan values with 0, replace all other values with 1
    movies['belongs_to_collection'] = np.where(movies['belongs_to_collection'].isnull(), 0, 1)
    
    return movies

In [12]:
# Apply this function to convert the collection field to boolean values
movies = clean_collection(movies)

#### Clean budget
Now that the 3 movies with letter-containing strings for budget have been dropped, we only need to convert the budget series to numbers.

In [13]:
def clean_budget(movies):
    """Convert the budget field to numeric form"""
    movies['budget'] = pd.to_numeric(movies['budget'])
    
    return movies

In [14]:
# Apply this function to convert the budget field to numbers
movies = clean_budget(movies)

#### Clean genres

In [15]:
def clean_genres(movies):
    """Extract the genre(s) for each movie and convert genres to one-hot encoded columns"""
    
    # Extract the name field from genres - will now contain lists of values or empty lists for NaN
    movies['genres'] = [list(set([y['name'].lower() if x != '[]' else 'unknown' for y in x]
                                )) for x in movies['genres'].apply(ast.literal_eval)]
    
    # Using an instance of a MultiLabelBinarizer object, convert the lists of genres to columns of one-hot encoding
    mlb = MultiLabelBinarizer()
    movies = movies[[x for x in movies.columns if x != 'genres']].join(
        pd.DataFrame(mlb.fit_transform(movies.pop('genres')), columns=mlb.classes_, index=movies.index))

    return movies

In [16]:
# Apply this function to create one-hot encoding for a list of genre names
movies = clean_genres(movies)

#### Clean original languges

In [17]:
# The field is of object type and contains na values to be filled
print(f'Languages: {movies.original_language.unique()}', '\n')

print(f'Count of movies without original language provided: {sum(movies.original_language.isna())}')

Languages: ['en' 'fr' 'zh' 'it' 'fa' 'nl' 'de' 'cn' 'ar' 'es' 'ru' 'sv' 'ja' 'ko'
 'sr' 'bn' 'he' 'pt' 'wo' 'ro' 'hu' 'cy' 'vi' 'cs' 'da' 'no' 'nb' 'pl'
 'el' 'sh' 'xx' 'mk' 'bo' 'ca' 'fi' 'th' 'sk' 'bs' 'hi' 'tr' 'is' 'ps'
 'ab' 'eo' 'ka' 'mn' 'bm' 'zu' 'uk' 'af' 'la' 'et' 'ku' 'fy' 'lv' 'ta'
 'sl' 'tl' 'ur' 'rw' 'id' 'bg' 'mr' 'lt' 'kk' 'ms' 'sq' nan 'qu' 'te' 'am'
 'jv' 'tg' 'ml' 'hr' 'lo' 'ay' 'kn' 'eu' 'ne' 'pa' 'ky' 'gl' 'uz' 'sm'
 'mt' 'hy' 'iu' 'lb' 'si'] 

Count of movies without original language provided: 11


In [18]:
# The majority of the original languages are in English and some languages have little representation (i.e. 1 movie)
# We'll encode the original language status as a yes or no to being English and treat unknown as 'no'
movies.original_language.value_counts()

en    32269
fr     2438
it     1529
ja     1350
de     1080
      ...  
uz        1
la        1
sm        1
mt        1
rw        1
Name: original_language, Length: 89, dtype: int64

In [19]:
def clean_original_language(movies):
    """Clean original languages values and reduce to English status"""
    
    # Use unknown for the original language when unspecified
    movies['original_language'].fillna('unknown', inplace=True)
    
    # Repace Nan values with 0, replace all other values with 1
    movies['original_language'] = np.where(movies['original_language'] == 'en', 1, 0)
    
    # Rename the original languages column to reflect yes/no to specified English
    movies = movies.rename(columns={'original_language': 'originally_english'})
    
    return movies

In [20]:
# Apply the above function to reduced original_language to whether or not the original language was English
movies = clean_original_language(movies)

#### Clean original title and title fields
The title field contains non-English names and accents, as well as timestamps, dates, and numbers that represent title names. We'll implement punctuation processing here (replace with whitespace), convert titles to lowercase, and strip leading/trailing whitespaces. Since the original title field contains non-ASCII characters, we will drop this field and keep only the title field.

In [21]:
# There are 0 movies where both the original title and the title are NaN
print(len(movies[movies.original_title.isna() & movies.title.isna()]))

# There are 0 movies with NaN for the original title and a title value for the 
print(len(movies[(movies.original_title.isna()) & (~movies.title.isna())]))

# There are 3 movies without a title, but with the original title - we'll fill the title field with the original title
print(len(movies[movies.title.isna() & ~movies.original_title.isna()]))

0
0
3


In [22]:
def clean_titles(movies):
    """Reduce and clean the title fields"""
    
    # Replace null title values with the value from original title
    movies['title'].fillna(movies['original_title'], inplace=True)
    
    # Convert all titles to lowercase
    movies['title'] = movies['title'].str.lower()
    
    # Remove non-ascii characters in the title field
    movies['title'].replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
    
    # Remove punctuation
    p = re.compile(r'[^\w\s]+')
    movies['title'] = [p.sub('', x) for x in movies['title'].tolist()]
    
    # Strip leading and trailing whitespace
    movies['title'] = movies['title'].str.strip()
    
    # Drop the original_title field
    movies.drop('original_title', axis=1, inplace=True)

    return movies

In [23]:
# Apply the above function to clean title names and drop the original title field
movies = clean_titles(movies)

#### Clean overview
This section contains descriptions, similar to the movie keywords in the keywords.csv file. 

In [24]:
# There are 954 movies without an overview, so we'll use 'unknown' as a placeholder for these
print(sum(movies.overview.isna()))

# An empty list is not used as a placeholder in this field
print(len(movies[movies.overview == '[]']))

954
0


In [25]:
def clean_overview(movies):
    """Clean the text descriptions of the movie overview field"""
    
    # Fill the Nan values with unknown
    movies['overview'].fillna('unknown', inplace=True)
    
    # Remove non-ascii characters in the overview field
    movies['overview'].replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
    
    # Remove punctuation
    p = re.compile(r'[^\w\s]+')
    movies['overview'] = [p.sub('', x) for x in movies['overview'].tolist()]
    
    # Convert all text to string and lowercase
    movies['overview'] = movies['overview'].str.lower()
    
    # Remove leading and trailing whitespaces
    movies['overview'] = movies['overview'].str.strip()
    
     # Apply the function to stem and remove stopwords
    movies['overview'] = movies.apply(lambda row: stem_stop_text(row['overview']), axis=1)
    
    return movies

In [26]:
# Apply the above function to clean the overview field and produce an updated movie dataframe
movies = clean_overview(movies)

#### Clean popularity

In [27]:
# There are 3 movies without a popularity score
print(sum(movies.popularity.isna()))

# These movies have no recorded votes, so we'll fill the NaN values with 0
print(movies[movies.popularity.isna()]['vote_count'])

# This field is currently in object form, so we'll cas to floats
print(movies.popularity.dtypes)

3
19729   NaN
29502   NaN
35586   NaN
Name: vote_count, dtype: float64
object


In [28]:
def clean_popularity(movies):
    """Replace NaN values and convert to floating point type"""
    # Fill NaN values with 0
    movies.popularity.fillna(0, inplace=True)
    
    # Cast to floating point type
    movies['popularity'] = pd.to_numeric(movies['popularity'])
    
    return movies

In [29]:
# Apply the above function to clean the popularity field and produce an updated movie dataframe
movies = clean_popularity(movies)

#### Clean production
In this section, we'll extract production company names and countries from dictionaries, forming a single text string for both fields.

In [30]:
# There are some NaN values for production company, as well as many empty list values
# Both NaN and empty lists will be replaced with the string 'unknown'
print(len(movies[movies['production_companies'].isna()]))
print(len(movies[movies['production_companies'] == '[]']))

# There are some NaN values for production countries, as well as many empty list values
# Both NaN and empty lists will be replaced with the string 'unknown'
print(len(movies[movies['production_countries'].isna()]))
print(len(movies[movies['production_countries'] == '[]']))

3
11875
3
6282


In [31]:
def clean_production(movies):
    """Form a cleaned column of production company(ies) data and another for production country(ies)"""
    
    # Replace empty production companies and countries cells with lists as an empty string
    movies['production_companies'].fillna('[]', inplace=True)
    movies['production_countries'].fillna('[]', inplace=True)
    
    # Extract the name fields for production companies and countries
    movies['production_companies'] = [" ".join(list(set([y['name'] for y in x]))
                                       ) for x in movies['production_companies'].apply(ast.literal_eval)]
    movies['production_countries'] = [" ".join(list(set([y['iso_3166_1'] for y in x]))
                                       ) for x in movies['production_countries'].apply(ast.literal_eval)]
    
    # Strip leading and trailing whitespace values
    movies['production_companies'] = movies['production_companies'].str.strip()
    movies['production_countries'] = movies['production_countries'].str.strip()
    
    # Replace empty string values with 'unknown'
    movies['production_companies'].replace(r'^\s*$', 'unknown', regex=True, inplace=True)
    movies['production_countries'].replace(r'^\s*$', 'unknown', regex=True, inplace=True)
    
    # Convert both columns to lowercase
    movies['production_companies'] = movies['production_companies'].str.lower()
    movies['production_countries'] = movies['production_countries'].str.lower()
    
    return movies

In [32]:
# Apply the above function to clean the production company and country fields and return an updated dataframe
movies = clean_production(movies)

#### Clean release_date

In [33]:
# There are 87 movies remaining in the dataset without a release_date
print(len(movies[movies.release_date.isna()]))

# However, some of these have a status of released, so we'll use a dummy value to replace NaNs (1/1/1900)
print(movies[movies.release_date.isna()]['status'].unique())

87
['Released' nan 'Planned' 'Canceled' 'Post Production' 'In Production']


In [34]:
def clean_release_date(movies):
    """Convert release date to release year and keep the Imdb id with the most recent release year if duplicated"""
    
    # Replace NaN values with the dummy value
    movies['release_date'].fillna('1900-01-01', inplace=True)
    
    # Convert the release date field to a datetime type
    movies['release_date'] = movies.apply(lambda row: datetime.strptime(row['release_date'], '%Y-%m-%d'), axis=1)
    
    # Extract the release year as a new column
    movies['release_year'] = pd.DatetimeIndex(movies['release_date']).year
    
    # Drop rows with duplicated imdb ids, keeping the row with the most recent value
    # Per Jerico's ratings EDA, the majority of ratings are in the last several years, so we'll keep the most recent value
    movies.sort_values('release_date').drop_duplicates('imdb_id',keep='last', inplace=True)
    
    # Drop the release date field and only keep release year
    movies.drop('release_date', axis=1, inplace=True)
    
    return movies

In [35]:
# Apply the above function to return an udpated movies dataframe with a cleaned release date column
movies = clean_release_date(movies)

#### Clean runtime

In [36]:
# There are 260 movies without a runtime provided
print(sum(movies.runtime.isna()))

# The shortest length movie is 0 min, the average movie length is around 94 min, and the longest is 1,256 min
print(min(movies.runtime), np.mean(movies.runtime), max(movies.runtime))

260
0.0 94.12819945578833 1256.0


In [37]:
def clean_runtime(movies):
    """Convert the runtime field to numeric data"""
    
    # For movies without a runtime, use -1 as a placeholder to differentiate from movies with an actual 0 min length
    movies['runtime'].fillna(-1, inplace=True)
    
    # Convert the field to numeric type
    movies['runtime'] = pd.to_numeric(movies['runtime'])
    
    return movies

In [38]:
# Update the movies dataset to clean the runtime field
movies = clean_runtime(movies)

#### Clean spoken languages
In this section, we'll extract a list of spoken languages from the dictionary for each row, using the iso_639_1 field for language abbreviations. For rows with an empty list value, 'unknown' will be used as a placeholder.

In [39]:
# There are 3 rows with a NaN value for spoken languages and several thousand rows with an empty list placeholder.
# We'll replace both of those values with 'unknown' in the next step
print(len(movies[movies.spoken_languages.isna()]))
print(len(movies[movies.spoken_languages == '[]']))

3
3829


In [40]:
def clean_spoken_languages(movies):
    """Extract each movie's spoken language(s) as a text string"""
    
    # Replace NaN values with an empty list
    movies['spoken_languages'].fillna('[]', inplace=True)
    
    # Extract all spoken language abbreviations for each movie
    movies['spoken_languages'] = [list(set([y['iso_639_1'] if x != '[]' else 'unknown' for y in x]))
                                  for x in movies['spoken_languages'].apply(ast.literal_eval)]
    
    # Using an instance of a MultiLabelBinarizer object, convert the lists of spoken languages to columns of one-hot encoding
    mlb = MultiLabelBinarizer()
    movies = movies[[x for x in movies.columns if x != 'spoken_languages']].join(
        pd.DataFrame(mlb.fit_transform(movies.pop('spoken_languages')), columns=mlb.classes_, index=movies.index),
        lsuffix="_left", rsuffix="_right")
    
    # Drop the id right column
    del movies['id_right']
    
    #  Rename the id_left column
    movies = movies.rename(columns={'id_left': 'id'})
    
    return movies

In [41]:
# Apply this function to the movie's spoken languages field and update the movies dataset
movies = clean_spoken_languages(movies)

#### Clean status field

In [42]:
# The movies status field contains NaN values and capitalized words that will be converted to lowercase
movies.status.unique()

array(['Released', nan, 'Rumored', 'Post Production', 'In Production',
       'Planned', 'Canceled'], dtype=object)

In [43]:
def clean_status(movies):
    """Add a placeholder for the NaN values and lowercase the responses"""
    
    # Update rows with a production status to remove whitespace
    movies['status'].replace('Post Production', 'post-production', inplace=True)
    movies['status'].replace('In Production', 'in-production', inplace=True)
    
    # Lowercase the text
    movies['status'] = movies['status'].str.lower()
    
    # Using an instance of a MultiLabelBinarizer object, convert the statuses to columns of one-hot encoding
    movies = movies[[x for x in list(movies.columns) if x != 'status']].join(pd.get_dummies(movies.status))
    
    return movies

In [44]:
# Apply the above function to the status field and return an updated movies dataset
movies = clean_status(movies)

#### Clean tagline

In [45]:
# Many movies have no tagline; we'll use unknown as a placeholder
len(movies[movies.tagline.isna()])

25051

In [46]:
def clean_tagline(movies):
    """Clean and process the tagline field"""
    
    # Replace missing values with a placeholder
    movies['tagline'].fillna('unknown', inplace=True)
    
    # Remove non-ascii characters in the overview field
    movies['tagline'].replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
    
    # Remove punctuation
    p = re.compile(r'[^\w\s]+')
    movies['tagline'] = [p.sub('', x) for x in movies['tagline'].tolist()]
    
    # Convert all text to string and lowercase
    movies['tagline'] = movies['tagline'].str.lower()
    
    # Remove leading and trailing whitespaces
    movies['tagline'] = movies['tagline'].str.strip()
    
     # Apply the function to stem and remove stopwords
    movies['tagline'] = movies.apply(lambda row: stem_stop_text(row['tagline']), axis=1)
    
    return movies

In [47]:
# Apply the above function to clean the movies' tagline field and return an updated dataframe
movies = clean_tagline(movies)

#### Clean the video field to convert to string and replace NaN values with 'unknown'

In [48]:
# There are only 3 movies without a video status, so we'll drop these from the dataframe
len(movies[movies.video.isna()])

3

In [49]:
def clean_video(movies):
    
    # Drop the 3 rows with missing values
    movies = movies[movies.video.notnull()]
    
    # Convert to one-hot encoding 
    movies['video'].replace({False: 0, True: 1}, inplace=True)
    
    return movies

In [50]:
# Apply the above function to fill missing values in the video field
movies = clean_video(movies)

#### Clean the vote related fields (vote_average and vote_count)

In [51]:
# There are no movies without an average for votes
movies[movies.vote_average.isna()]

Unnamed: 0,imdb_id,id,adult,belongs_to_collection,budget,originally_english,overview,popularity,production_companies,production_countries,...,xx,yi,zh,zu,canceled,in-production,planned,post-production,released,rumored


In [52]:
# There are no movies without a vote count value
movies[movies.vote_count.isna()]

Unnamed: 0,imdb_id,id,adult,belongs_to_collection,budget,originally_english,overview,popularity,production_companies,production_countries,...,xx,yi,zh,zu,canceled,in-production,planned,post-production,released,rumored


In [53]:
# The vote fields are already in float form, so we won't need to convert to numeric
print(movies.vote_average.dtypes)
print(movies.vote_count.dtypes)

float64
float64


In [54]:
# Convert the id field to integer format, to enable a join
movies['id'] = pd.to_numeric(movies['id'])

# Join the cleaned version of movies with the credits and keywords in a left join
movies_temp = movies.merge(credits_keywords,on='id', how='left')

# Drop the Unnamed:0 column
movies_temp.drop('Unnamed: 0', axis=1, inplace=True)

# Replace NaN with unknown
movies_temp.fillna('unknown', inplace=True)

# Set the index as the Imdb id column
movies_temp = movies_temp.set_index('imdb_id')

# Write this final cleaned dataset to a csv file
movies_temp.to_csv('C:/Users/britt/Desktop/W207/final_project/data/movies_temp.csv')