In [1]:
# importing required libraries
import pandas as pd
import re

In [2]:
# reading data
movies = pd.read_csv('movies_data.csv')

In [3]:
# getting numbers or rows and columns in dataframe
movies.shape

(45466, 4)

In [4]:
# displaying top 5 rows in dataframe
movies.head()

Unnamed: 0,genres,overview,spoken_languages,title
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...","[{'iso_639_1': 'en', 'name': 'English'}]",Toy Story
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Jumanji
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,"[{'iso_639_1': 'en', 'name': 'English'}]",Grumpier Old Men
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...","[{'iso_639_1': 'en', 'name': 'English'}]",Waiting to Exhale
4,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,"[{'iso_639_1': 'en', 'name': 'English'}]",Father of the Bride Part II


In [5]:
# We would only be considering unique movie titles as part of this project
# Duplicate titles will be dropped
movies = movies.drop_duplicates(subset = "title")
movies.shape

(42276, 4)

In [6]:
# As genres and spoken_languages are in string formats and shown as a dictionary, below we would be rewriting them as string formats
# We would drop ids and only keep main values for genres and languages

def get_distinct_names(details):
    data = []
    details = str(details)
    for i in range(1, len(details.split(',')),2):
        data.append(re.sub(r'[^a-zA-Z]', '', details.split(',')[i]).partition('name')[-1])

    return data

In [7]:
# get the list of languages and genres in more systematic format

movies['spoken_languages'] = movies['spoken_languages'].apply(get_distinct_names)
movies['genres'] = movies['genres'].apply(get_distinct_names)

movies.head(10)

Unnamed: 0,genres,overview,spoken_languages,title
0,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",[English],Toy Story
1,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,"[English, Franais]",Jumanji
2,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,[English],Grumpier Old Men
3,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",[English],Waiting to Exhale
4,[Comedy],Just when George Banks has recovered from his ...,[English],Father of the Bride Part II
5,"[Action, Crime, Drama, Thriller]","Obsessive master thief, Neil McCauley leads a ...","[English, Espaol]",Heat
6,"[Comedy, Romance]",An ugly duckling having undergone a remarkable...,"[Franais, English]",Sabrina
7,"[Action, Adventure, Drama, Family]","A mischievous young boy, Tom Sawyer, witnesses...","[English, Deutsch]",Tom and Huck
8,"[Action, Adventure, Thriller]",International action superstar Jean Claude Van...,[English],Sudden Death
9,"[Adventure, Action, Thriller]",James Bond must unmask the mysterious head of ...,"[English, P, Espaol]",GoldenEye


In [8]:
# getting a list of all genres included as part of dataset
all_genres = []
def get_all_genres(genre_list):
    for item in genre_list:
        if item not in all_genres and item!='':
            all_genres.append(item)

In [9]:
movies['genres'].apply(get_all_genres)
all_genres

['Animation',
 'Comedy',
 'Family',
 'Adventure',
 'Fantasy',
 'Romance',
 'Drama',
 'Action',
 'Crime',
 'Thriller',
 'Horror',
 'History',
 'ScienceFiction',
 'Mystery',
 'War',
 'Foreign',
 'Music',
 'Documentary',
 'Western',
 'TVMovie']

In [10]:
# getting total count of distinct genres
len(all_genres)

20

In [11]:
# checking if the movie belongs to that genre
def genre_mapping(lists, item):
    if item in lists:
        return 1
    else:
        return 0

In [12]:
# displaying each genre column as a categorical variable and using them as columns with binary data
for item in all_genres:
    movies[item] = movies['genres'].apply(genre_mapping, item=item)

In [13]:
# making all the titles to lower case of ease of computation in further model development
movies['title'] = movies['title'].str.lower()

In [14]:
# displaying the final structure of data after pre-processing applied
movies.head()

Unnamed: 0,genres,overview,spoken_languages,title,Animation,Comedy,Family,Adventure,Fantasy,Romance,...,Horror,History,ScienceFiction,Mystery,War,Foreign,Music,Documentary,Western,TVMovie
0,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",[English],toy story,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,"[English, Franais]",jumanji,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,[English],grumpier old men,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",[English],waiting to exhale,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,[Comedy],Just when George Banks has recovered from his ...,[English],father of the bride part ii,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# exporting the processed data as csv file.
movies.to_csv('movies.csv', index=False)