# Data Pre-Processing 

In [1]:
import pandas as pd
import numpy as np

In [2]:
movies=pd.read_csv("movie_metadata.csv")
movies.head(2)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0


In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
color                        5024 non-null object
director_name                4939 non-null object
num_critic_for_reviews       4993 non-null float64
duration                     5028 non-null float64
director_facebook_likes      4939 non-null float64
actor_3_facebook_likes       5020 non-null float64
actor_2_name                 5030 non-null object
actor_1_facebook_likes       5036 non-null float64
gross                        4159 non-null float64
genres                       5043 non-null object
actor_1_name                 5036 non-null object
movie_title                  5043 non-null object
num_voted_users              5043 non-null int64
cast_total_facebook_likes    5043 non-null int64
actor_3_name                 5020 non-null object
facenumber_in_poster         5030 non-null float64
plot_keywords                4890 non-null object
movie_imdb_link              5043 non-

Looks like a decent amount of missing values here...

# Converting to Numeric Values

In [4]:
movies.color = movies.color.map({'Color': 1, ' Black and White':0})

In [5]:
unique_genre_labels = set()
for genre_flags in movies.genres.str.split('|').values:
    unique_genre_labels = unique_genre_labels.union(set(genre_flags))
for label in unique_genre_labels:
    movies['Genre = '+label] = movies.genres.str.contains(label).astype(int)
movies = movies.drop('genres', axis=1)

In [6]:
movies.head(2)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,actor_1_name,...,Genre = Biography,Genre = Action,Genre = Musical,Genre = News,Genre = Animation,Genre = Romance,Genre = Crime,Genre = Adventure,Genre = Drama,Genre = Western
0,1.0,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,CCH Pounder,...,0,1,0,0,0,0,0,1,0,0
1,1.0,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Johnny Depp,...,0,1,0,0,0,0,0,1,0,0


In [7]:
movies["movie_title"].nunique()

4917

5043 entries out of which 4917 are unique. Have to get rid of the duplicate values.

In [8]:
movies = movies.drop_duplicates(subset=['movie_title'])
movies.shape

(4917, 53)

In [9]:
counts = movies.language.value_counts()
movies.language = movies.language.map(counts)

count = movies.country.value_counts()
movies.country = movies.country.map(count)

counts = movies.content_rating.value_counts()
movies.content_rating = movies.content_rating.map(counts)

unique_words = set()
for wordlist in movies.plot_keywords.str.split('|').values:
    if wordlist is not np.nan:
        unique_words = unique_words.union(set(wordlist))
plot_wordbag = list(unique_words)
for word in plot_wordbag:
    movies['plot_has_' + word.replace(' ', '-')] = movies.plot_keywords.str.contains(word).astype(float)
movies = movies.drop('plot_keywords', axis=1)

movies.director_name = movies.director_name.map(movies.director_name.value_counts())

counts = pd.concat([movies.actor_1_name, movies.actor_2_name, movies.actor_3_name]).value_counts()

movies.actor_1_name = movies.actor_1_name.map(counts)
movies.actor_2_name = movies.actor_2_name.map(counts)
movies.actor_3_name = movies.actor_3_name.map(counts)

movies = movies.drop(['movie_imdb_link'], axis=1) #Not required
movies.shape

  app.launch_new_instance()


(4917, 8134)

From 4917x53 to 4917x8134! Wow.

# Adios Missing Values

In [10]:
missing_df = movies.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df['filling_factor'] = (movies.shape[0] 
                                - missing_df['missing_count']) / movies.shape[0] * 100
missing_df = missing_df.sort_values('filling_factor').reset_index(drop = True)
missing_df

#Filling factor as the name suggests indicates the % of every column that is filled

Unnamed: 0,column_name,missing_count,filling_factor
0,gross,863,82.448648
1,budget,484,90.156600
2,aspect_ratio,326,93.369941
3,content_rating,300,93.898719
4,plot_has_kangaroo,152,96.908684
5,plot_has_kosher,152,96.908684
6,plot_has_sketch-comedy,152,96.908684
7,plot_has_mother,152,96.908684
8,plot_has_charles-ii,152,96.908684
9,plot_has_small-western-town,152,96.908684


In [11]:
#Using median of each column to deal with the missing values

movies.fillna(movies.median(), inplace=True)

missing_df = movies.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df['filling_factor'] = (movies.shape[0] 
                                - missing_df['missing_count']) / movies.shape[0] * 100
missing_df = missing_df.sort_values('filling_factor').reset_index(drop = True)
missing_df

Unnamed: 0,column_name,missing_count,filling_factor
0,color,0,100.0
1,plot_has_martial-arts-master,0,100.0
2,plot_has_train-crash,0,100.0
3,plot_has_public-access,0,100.0
4,plot_has_cabinet-meeting,0,100.0
5,plot_has_small-western-town,0,100.0
6,plot_has_heist,0,100.0
7,plot_has_mother,0,100.0
8,plot_has_sketch-comedy,0,100.0
9,plot_has_kosher,0,100.0


In [12]:
movies.isnull().sum().sum()

0

Great! Not a single missing value. Let's get on with the recommendation engine.