In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import *
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import KernelPCA
import time

In [2]:
# Read in the cleaned version of the movies metadata
df = pd.read_csv('C:/Users/britt/Desktop/W207/final_project/data/movies_temp.csv')

In [3]:
# There are 3 Imdb duplicates which were not resolved by choosing the latest release date
# We'll see which columns differ between duplicates in the next step
df[df.duplicated(['imdb_id'])]

Unnamed: 0,imdb_id,id,adult,belongs_to_collection,budget,originally_english,overview,popularity,production_companies,production_countries,...,zu,canceled,in-production,planned,post-production,released,rumored,cast_names,crew_names,description
15885,tt0100361,69234,0,0,10000000,1,count de chagni discov christin sing talent ma...,0.43849,hexatel tf1 beta film reteitalia saban interna...,us fr de it,...,0,0,0,0,0,1,0,jeanpierrecassel ianrichardson adamstorke burt...,jacquesbufnoir rossmilloy arthurkopit steveyac...,mask miniseri phantom opera remak gondola pari...
23985,tt0270288,4912,0,0,30000000,1,televis made famou biggest hit happen screen t...,11.331072,miramax films allied filmmakers mad chance,us,...,0,0,0,0,0,1,0,jayepmorgan maggiegyllenhaal davidjulianhirsh ...,ellenchenoweth renéeapril harveyweinstein geor...,silenc biographi microfilm intrigu
42592,tt2818654,298721,0,0,980000,0,hospit ten soldier treat mysteri sleep sick st...,2.535419,zdf/arte detalle films centre national de la c...,my th mx no kr gb us fr de,...,0,0,0,0,0,1,0,banloplomnoi pongsadhornlertsukon jarinpattrar...,akekarathomlaor richardhocks ericvogel leechat...,ill sleep soldier river cemeteri


In [4]:
# Looking at columns of duplicated imdb ids provides no insight into which columns have different values for the same imdb id
ids = df['imdb_id']
df[ids.isin(ids[ids.duplicated()])].sort_values('imdb_id')

Unnamed: 0,imdb_id,id,adult,belongs_to_collection,budget,originally_english,overview,popularity,production_companies,production_countries,...,zu,canceled,in-production,planned,post-production,released,rumored,cast_names,crew_names,description
15884,tt0100361,69234,0,0,10000000,1,count de chagni discov christin sing talent ma...,0.43849,hexatel tf1 beta film reteitalia saban interna...,us fr de it,...,0,0,0,0,0,1,0,jeanpierrecassel ianrichardson adamstorke burt...,rossmilloy arthurkopit steveyaconelli gastonle...,mask miniseri phantom opera remak gondola pari...
15885,tt0100361,69234,0,0,10000000,1,count de chagni discov christin sing talent ma...,0.43849,hexatel tf1 beta film reteitalia saban interna...,us fr de it,...,0,0,0,0,0,1,0,jeanpierrecassel ianrichardson adamstorke burt...,jacquesbufnoir rossmilloy arthurkopit steveyac...,mask miniseri phantom opera remak gondola pari...
23984,tt0270288,4912,0,0,30000000,1,televis made famou biggest hit happen screen t...,11.331072,miramax films allied filmmakers mad chance,us,...,0,0,0,0,0,1,0,jayepmorgan maggiegyllenhaal davidjulianhirsh ...,ellenchenoweth renéeapril harveyweinstein jona...,silenc biographi microfilm intrigu
23985,tt0270288,4912,0,0,30000000,1,televis made famou biggest hit happen screen t...,11.331072,miramax films allied filmmakers mad chance,us,...,0,0,0,0,0,1,0,jayepmorgan maggiegyllenhaal davidjulianhirsh ...,ellenchenoweth renéeapril harveyweinstein geor...,silenc biographi microfilm intrigu
42591,tt2818654,298721,0,0,980000,0,hospit ten soldier treat mysteri sleep sick st...,2.535419,zdf/arte detalle films centre national de la c...,my th mx no kr gb us fr de,...,0,0,0,0,0,1,0,banloplomnoi pongsadhornlertsukon jarinpattrar...,akekarathomlaor richardhocks leechatametikool ...,ill sleep soldier river cemeteri
42592,tt2818654,298721,0,0,980000,0,hospit ten soldier treat mysteri sleep sick st...,2.535419,zdf/arte detalle films centre national de la c...,my th mx no kr gb us fr de,...,0,0,0,0,0,1,0,banloplomnoi pongsadhornlertsukon jarinpattrar...,akekarathomlaor richardhocks ericvogel leechat...,ill sleep soldier river cemeteri


In [5]:
# Create dataframes of the 3 imdb ids that are duplicated
first_dup = df[df.imdb_id == 'tt0100361']
second_dup = df[df.imdb_id == 'tt0270288']
third_dup = df[df.imdb_id == 'tt2818654']

In [6]:
def cols_with_diff_values(df):
    """Return columns where values differ"""
    
    # Create a list to hold column names with differing values
    # Loop over the list of columns for each dataframe
    # If the column has more than one unique value, including Na values, add the column name to the list
    differing_cols = [col for col in df.columns if df[col].nunique(dropna=False) > 1]
    
    # Return a copy of the differing columns from the original dataframe
    return df[differing_cols].copy()

# Find the differinng columns for each of the 3 remaining duplicated imdb ids
first_dup_diff = cols_with_diff_values(first_dup)
second_dup_diff = cols_with_diff_values(second_dup)
third_dup_diff = cols_with_diff_values(third_dup)

In [7]:
# These remaining duplicates only differ across their crew names column, so we'll combine entries and remove duplicate rows
print(first_dup_diff, '\n')
print(second_dup_diff, '\n')
print(third_dup_diff)

                                              crew_names
15884  rossmilloy arthurkopit steveyaconelli gastonle...
15885  jacquesbufnoir rossmilloy arthurkopit steveyac... 

                                              crew_names
23984  ellenchenoweth renéeapril harveyweinstein jona...
23985  ellenchenoweth renéeapril harveyweinstein geor... 

                                              crew_names
42591  akekarathomlaor richardhocks leechatametikool ...
42592  akekarathomlaor richardhocks ericvogel leechat...


In [8]:
def reduce_column_values(df):
    """A function that consolidates unique column values and assigns these unique values to each row"""
    
    # Consolidate the list of crew names, remove duplicates, and return as a text string
    # Replace the original crew names with this list of values
    df['crew_names'] = " ".join(list(np.unique(np.array([x.split() for x in df.crew_names.unique()]).ravel())))
    
    # Now that the rows have all the same values, keep only the first of the duplicated rows
    df = df.drop_duplicates(subset = ['imdb_id'], keep = 'first')
    
    return df

In [9]:
# Drop the rows with the duplicated imdb ids from the original dataframe
df = df[~df.imdb_id.isin(['tt0100361', 'tt0270288', 'tt2818654'])]

# Consolidate crew name values and drop duplicated rows for each of the 3 imdb ids that are duplicated
cleaned_1st_dup = reduce_column_values(first_dup)
cleaned_2nd_dup = reduce_column_values(second_dup)
cleaned_3rd_dup = reduce_column_values(third_dup)

# Rejoin the cleaned, de-duplicated rows to the main dataframe
df = pd.concat([df, cleaned_1st_dup, cleaned_2nd_dup, cleaned_3rd_dup])

In [10]:
# Find the proportion of movies with an unknown tagline
# Since more than half of movies in the dataset don't have a tagline, we'll drop this field
print(len(df[df.tagline == 'unknown'])/(len(df)))

# Find the proportion of movies with unknown overview - only about 2% of movies are missing this field
print(len(df[df.overview == 'unknown'])/(len(df)))

# Find the proportion of movies with unknown description - ~ one-third of movies don't have a result for this field
# Since the overview field also contains a description, but contains fewer missing values, we'll use this field
print(len(df[df.description == 'unknown'])/(len(df)))

# Find the proportion of movies without production companies or production countries listed
# Nearly a third of films in the dataset don't have a value for production companies or production countries
# We'll drop these fields
print(len(
    (df[(df.production_companies == 'unknown') | (df.production_countries == 'unknown')]))
                                                     /len(df))

# Only a small proportion of films are missing cast or crew names, so we'll keep these fields
print(len(
    (df[(df.cast_names == 'unknown') | (df.crew_names == 'unknown')]))
                                                     /len(df))

0.5510536630480259
0.021315482350868695
0.31552639112148506
0.27230088300706845
0.060753528725254885


In [11]:
# Remove fields tagline, description, production_countries, production_companies
df.drop(['tagline', 'description', 'production_countries', 'production_companies'], axis=1, inplace=True)

In [12]:
# Create an instance of a TfidfVectorizer object for overview
# Set it to keep the top 200 most significant words in the overview column
tfidf_overview = TfidfVectorizer(max_features=200)
t_overview = tfidf_overview.fit_transform(df.overview)

# Create a dataframe of the transformed top 200 overview features
overview = pd.DataFrame(t_overview.todense()).add_prefix('overview_')

# Fill the NA values in the title column with unknown
df.title.fillna('unknown', inplace=True)

# Create an instance of a TfidfVectorizer object for title
# Set it to keep the top 200 most significant words in the title column
tfidf_title = TfidfVectorizer(max_features=200)
t_title = tfidf_title.fit_transform(df.title)

# Create a dataframe of the transformed top 200 overview features
title = pd.DataFrame(t_title.todense()).add_prefix('title_')

# Create an instance of a TfidfVectorizer object for cast names, keeping the 100 most significant cast names
tfidf_cast = TfidfVectorizer(max_features=100)
t_cast = tfidf_cast.fit_transform(df.cast_names)

# Create a dataframe of the transformed top 100 cast name features
cast = pd.DataFrame(t_cast.todense()).add_prefix('cast_')

# Create an instance of a TfidfVectorizer for crew names, keeping the 100 most significant crew names
tfidf_crew = TfidfVectorizer(max_features=100)
t_crew = tfidf_crew.fit_transform(df.crew_names)

# Create a dataframe of the transformed top 100 crew name features
crew = pd.DataFrame(t_crew.todense()).add_prefix('crew_')

In [13]:
# Concatenate these columns into a single dataframe, join to a copy of df, and drop the original text columns
text = pd.concat([overview, title, cast, crew], axis=1)

In [26]:
# Create a copy of the original dataframe and drop the text columns that have been converted to numeric scores
df1 = df.copy(deep=True)
df1.drop(['overview', 'title', 'cast_names', 'crew_names'], axis=1, inplace=True)

# Create a new dataframe with the numeric representations of the original text added back in
df2 = pd.concat([df1, text], axis=1)

# Drop rows without an imdb id value
df2.dropna(subset = ['imdb_id'], inplace=True)

# Drop the id column and set the imdb_id as the index
df2.drop(columns=['id'], inplace=True)
df2 = df2.set_index('imdb_id')

# Fill Na values with 0
df2.fillna(0, inplace=True)

In [30]:
# Create an array of the values in the dataframe
x = df2.iloc[:,0:].values

# Create an instance of a Standard Scaler and fit it to the numeric data
sc = StandardScaler()
features = sc.fit_transform(x)

The explained variance is not high until a large number of features are included. 

In [35]:
# Use PCA to reduce the number of dimensions and check the explained variance sum
print("Running PCA on Movie Features... \n")
ncomp = 500
pca = PCA(n_components=ncomp)
pca_features = pca.fit_transform(features) 
pca_variance = pca.explained_variance_ratio_.sum()
pca_variance

Running PCA on Movie Features... 



0.7226671107001129

In [36]:
print(abs( pca.components_ ))

[[0.00359718 0.1993121  0.41166657 ... 0.02343909 0.00179782 0.01898034]
 [0.0016102  0.00257613 0.05125161 ... 0.08998372 0.00542625 0.01507299]
 [0.00851381 0.0318256  0.11610717 ... 0.03712925 0.00719809 0.07516153]
 ...
 [0.03066207 0.02226225 0.00241991 ... 0.00826829 0.00081035 0.00531281]
 [0.0788904  0.0614561  0.01479562 ... 0.01989242 0.04613521 0.00448436]
 [0.00232013 0.01309839 0.01067693 ... 0.0045262  0.00715663 0.00066669]]
