In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval
from collections import Counter

In [2]:
# Load the dataset from a path relative to the current working directory.
df = pd.read_csv("mini_project/data.csv")

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(3000, 23)

# Exploring data

In [4]:
# Column labels available in the loaded frame.
df.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

In [5]:
# Raw (still stringified) collection entry of the row labelled 0.
df.loc[0, 'belongs_to_collection']

"[{'id': 313576, 'name': 'Hot Tub Time Machine Collection', 'poster_path': '/iEhb00TGPucF0b4joM1ieyY026U.jpg', 'backdrop_path': '/noeTVcgpBiD48fDjFVic1Vz7ope.jpg'}]"

In [6]:
# Turn the stringified list-of-dicts column into real Python lists.
# Strings are parsed with literal_eval; empty strings and missing values
# become []. Values that are already lists are passed through unchanged so
# that re-running this cell does not hand a list to literal_eval (ValueError).
df['belongs_to_collection'] = df['belongs_to_collection'].apply(
    lambda raw: raw if isinstance(raw, list)
    else (literal_eval(raw) if isinstance(raw, str) and raw != '' else [])
)
        

In [7]:
# How many films have 0 vs. 1 collection entries.
df['belongs_to_collection'].map(len).value_counts()

0    2396
1     604
Name: belongs_to_collection, dtype: int64

In [8]:
# Turn the stringified list-of-dicts column into real Python lists.
# Strings are parsed with literal_eval; empty strings and missing values
# become []. Already-parsed lists are passed through so the cell can be
# re-run without literal_eval raising on a list.
df['genres'] = df['genres'].apply(
    lambda raw: raw if isinstance(raw, list)
    else (literal_eval(raw) if isinstance(raw, str) and raw != '' else [])
)

In [9]:
# Distribution of the number of genres per film.
df['genres'].map(len).value_counts()

2    972
3    900
1    593
4    393
5    111
6     21
0      7
7      3
Name: genres, dtype: int64

In [10]:
# Genre names per film: one inner list for every row of df.
list_of_genres = [[genre['name'] for genre in genres] for genres in df['genres']]

In [11]:
# Display the extracted genre names (one inner list per film).
list_of_genres

[['Comedy'],
 ['Comedy', 'Drama', 'Family', 'Romance'],
 ['Drama'],
 ['Thriller', 'Drama'],
 ['Action', 'Thriller'],
 ['Animation', 'Adventure', 'Family'],
 ['Horror', 'Thriller'],
 ['Documentary'],
 ['Action', 'Comedy', 'Music', 'Family', 'Adventure'],
 ['Comedy', 'Music'],
 ['Drama'],
 ['Comedy'],
 ['Drama'],
 ['Comedy', 'Crime'],
 ['Action', 'Thriller', 'Science Fiction', 'Mystery'],
 ['Action', 'Crime', 'Drama'],
 ['Horror', 'Thriller'],
 ['Drama', 'Romance'],
 ['Comedy', 'Romance'],
 ['Action', 'Thriller', 'Crime'],
 ['Adventure', 'Family', 'Science Fiction'],
 ['Horror', 'Thriller'],
 ['Thriller', 'Horror'],
 ['Thriller', 'Mystery', 'Foreign'],
 ['Horror', 'Comedy'],
 ['Comedy', 'Horror', 'Mystery', 'Thriller'],
 ['Crime', 'Drama', 'Mystery', 'Thriller'],
 ['Drama', 'Comedy', 'Romance'],
 ['Animation'],
 ['Action', 'Adventure', 'Crime', 'Thriller'],
 ['Drama', 'Comedy'],
 ['Mystery', 'Drama', 'Thriller'],
 ['Fantasy', 'Action', 'Adventure'],
 ['Horror'],
 ['Action', 'Comedy', 'Cr

In [12]:
# The 16 most frequent genre names across all films.
Counter(name for names in list_of_genres for name in names).most_common(16)

[('Drama', 1531),
 ('Comedy', 1028),
 ('Thriller', 789),
 ('Action', 741),
 ('Romance', 571),
 ('Crime', 469),
 ('Adventure', 439),
 ('Horror', 301),
 ('Science Fiction', 290),
 ('Family', 260),
 ('Fantasy', 232),
 ('Mystery', 225),
 ('Animation', 141),
 ('History', 132),
 ('Music', 100),
 ('War', 100)]

In [13]:
# Keep only the names of the 16 most frequent genres.
genre_counts = Counter(name for names in list_of_genres for name in names)
genres_top = [name for name, _ in genre_counts.most_common(16)]

In [16]:
# Number of genres per film.
df['genre_count'] = df['genres'].map(len)
# Sorted genre names joined with single spaces ('' when a film has no genres).
df['genre_list'] = df['genres'].map(
    lambda genres: ' '.join(sorted(genre['name'] for genre in genres))
)

In [17]:
# One 0/1 indicator column per top genre.
# Membership is tested against the exact set of genre names of each film.
# A substring test on the space-joined 'genre_list' string would report a
# match whenever one genre name happens to be contained in another.
genre_name_sets = df['genres'].apply(lambda genres: {genre['name'] for genre in genres})
for i in genres_top:
    df[i + '_genre'] = genre_name_sets.apply(lambda names: int(i in names))

In [18]:
# The parsed genre records are no longer needed once the features exist.
df = df.drop(columns=['genres'])

In [19]:
# Turn the stringified list-of-dicts column into real Python lists.
# Strings are parsed with literal_eval; empty strings and missing values
# become []. Already-parsed lists are passed through so the cell can be
# re-run without literal_eval raising on a list.
df['production_companies'] = df['production_companies'].apply(
    lambda raw: raw if isinstance(raw, list)
    else (literal_eval(raw) if isinstance(raw, str) and raw != '' else [])
)

In [20]:
# Distribution of the number of production companies per film.
df['production_companies'].map(len).value_counts()

1     775
2     734
3     582
4     312
5     166
0     156
6     118
7      62
8      42
9      29
11      7
10      7
12      3
16      2
15      2
14      1
13      1
17      1
Name: production_companies, dtype: int64

In [21]:
# Production company names per film: one inner list for every row of df.
company_list = [[company['name'] for company in companies] for companies in df['production_companies']]

In [22]:
# The 40 most frequent production company names.
Counter(name for names in company_list for name in names).most_common(40)

[('Warner Bros.', 202),
 ('Universal Pictures', 188),
 ('Paramount Pictures', 161),
 ('Twentieth Century Fox Film Corporation', 138),
 ('Columbia Pictures', 91),
 ('Metro-Goldwyn-Mayer (MGM)', 84),
 ('New Line Cinema', 75),
 ('Touchstone Pictures', 63),
 ('Walt Disney Pictures', 62),
 ('Columbia Pictures Corporation', 61),
 ('TriStar Pictures', 53),
 ('Relativity Media', 48),
 ('Canal+', 46),
 ('United Artists', 44),
 ('Miramax Films', 40),
 ('Village Roadshow Pictures', 36),
 ('Regency Enterprises', 31),
 ('BBC Films', 30),
 ('Dune Entertainment', 30),
 ('Working Title Films', 30),
 ('Fox Searchlight Pictures', 29),
 ('StudioCanal', 28),
 ('Lionsgate', 28),
 ('DreamWorks SKG', 27),
 ('Fox 2000 Pictures', 25),
 ('Summit Entertainment', 24),
 ('Hollywood Pictures', 24),
 ('Orion Pictures', 24),
 ('Amblin Entertainment', 23),
 ('Dimension Films', 23),
 ('Castle Rock Entertainment', 21),
 ('Epsilon Motion Pictures', 21),
 ('Morgan Creek Productions', 21),
 ('Original Film', 21),
 ('Focus 

In [23]:
# Number of production companies credited on each film.
df['num_companies'] = df['production_companies'].apply(len)
# Sorted company names joined with spaces; still created because the next
# cell drops this column by name.
df['all_production_companies'] = df['production_companies'].apply(
    lambda companies: ' '.join(sorted(company['name'] for company in companies))
)
top_companies = [name for name, _ in Counter(i for j in company_list for i in j).most_common(40)]
# One 0/1 indicator per top company, tested against the exact set of company
# names of each film. A substring test on the joined string also flags
# 'Columbia Pictures' for films made by 'Columbia Pictures Corporation'.
company_name_sets = df['production_companies'].apply(
    lambda companies: {company['name'] for company in companies}
)
for g in top_companies:
    df['production_company_' + g] = company_name_sets.apply(lambda names: int(g in names))

In [24]:
# Remove the parsed records and the helper string column.
df = df.drop(columns=['production_companies', 'all_production_companies'])

In [25]:
# Turn the stringified list-of-dicts column into real Python lists.
# Strings are parsed with literal_eval; empty strings and missing values
# become []. Already-parsed lists are passed through so the cell can be
# re-run without literal_eval raising on a list.
df['production_countries'] = df['production_countries'].apply(
    lambda raw: raw if isinstance(raw, list)
    else (literal_eval(raw) if isinstance(raw, str) and raw != '' else [])
)

In [26]:
# Distribution of the number of production countries per film.
print('Number of countries in production_countries')
df['production_countries'].map(len).value_counts()

Number of countries in production_countries


1    2222
2     525
3     116
4      57
0      55
5      21
6       3
8       1
Name: production_countries, dtype: int64

In [27]:
# Production country names per film: one inner list for every row of df.
list_of_countries = [[country['name'] for country in countries] for countries in df['production_countries']]

In [28]:
# Display the extracted country names (one inner list per film).
list_of_countries

[['United States of America'],
 ['United States of America'],
 ['United States of America'],
 ['India'],
 ['South Korea'],
 [],
 ['United States of America', 'Canada'],
 [],
 ['United States of America'],
 ['United States of America'],
 ['United States of America'],
 ['United States of America'],
 ['United States of America'],
 ['United States of America'],
 ['United States of America'],
 ['Serbia'],
 ['United States of America'],
 ['United Kingdom'],
 ['Austria', 'Germany', 'United Kingdom'],
 ['France'],
 ['United States of America'],
 ['United States of America'],
 ['United States of America'],
 ['United States of America'],
 ['New Zealand'],
 ['Japan', 'United States of America'],
 ['United States of America'],
 ['Canada', 'Ireland'],
 ['United States of America'],
 ['United States of America'],
 ['France'],
 ['Austria', 'France', 'Germany', 'Italy', 'United States of America'],
 ['United States of America'],
 ['United States of America', 'New Zealand', 'Canada', 'Israel', 'Japan']

In [29]:
# The 20 most frequent production country names.
Counter(name for names in list_of_countries for name in names).most_common(20)

[('United States of America', 2282),
 ('United Kingdom', 380),
 ('France', 222),
 ('Germany', 167),
 ('Canada', 120),
 ('India', 81),
 ('Italy', 64),
 ('Japan', 61),
 ('Australia', 61),
 ('Russia', 58),
 ('Spain', 54),
 ('China', 42),
 ('Hong Kong', 42),
 ('Ireland', 23),
 ('Belgium', 23),
 ('South Korea', 22),
 ('Mexico', 19),
 ('Sweden', 18),
 ('New Zealand', 17),
 ('Netherlands', 15)]

In [30]:
# Number of production countries listed for each film.
df['num_countries'] = df['production_countries'].apply(len)
# Sorted country names joined with spaces; still created because the next
# cell drops this column by name.
df['all_countries'] = df['production_countries'].apply(
    lambda countries: ' '.join(sorted(country['name'] for country in countries))
)
top_countries = [name for name, _ in Counter(i for j in list_of_countries for i in j).most_common(25)]
# One 0/1 indicator per top country, tested against the exact set of country
# names of each film rather than by substring search in the joined string,
# so a name contained in a longer country name cannot produce a false match.
country_name_sets = df['production_countries'].apply(
    lambda countries: {country['name'] for country in countries}
)
for g in top_countries:
    df['production_country_' + g] = country_name_sets.apply(lambda names: int(g in names))

In [31]:
# Remove the parsed records and the helper string column.
df = df.drop(columns=['production_countries', 'all_countries'])

In [32]:
# Turn the stringified list-of-dicts column into real Python lists.
# Strings are parsed with literal_eval; empty strings and missing values
# become []. Already-parsed lists are passed through so the cell can be
# re-run without literal_eval raising on a list.
df['spoken_languages'] = df['spoken_languages'].apply(
    lambda raw: raw if isinstance(raw, list)
    else (literal_eval(raw) if isinstance(raw, str) and raw != '' else [])
)

In [33]:
# Spoken language names per film: one inner list for every row of df.
list_of_languages = [[language['name'] for language in languages] for languages in df['spoken_languages']]

In [34]:
# The 30 most frequent spoken language names.
Counter(name for names in list_of_languages for name in names).most_common(30)

[('English', 2618),
 ('Français', 288),
 ('Español', 239),
 ('Deutsch', 169),
 ('Pусский', 152),
 ('Italiano', 124),
 ('日本語', 89),
 ('普通话', 68),
 ('हिन्दी', 56),
 ('', 47),
 ('Português', 43),
 ('العربية', 40),
 ('한국어/조선말', 37),
 ('广州话 / 廣州話', 36),
 ('தமிழ்', 27),
 ('Polski', 25),
 ('Magyar', 24),
 ('Latin', 23),
 ('svenska', 22),
 ('ภาษาไทย', 22),
 ('Český', 20),
 ('עִבְרִית', 15),
 ('ελληνικά', 15),
 ('Türkçe', 13),
 ('Dansk', 13),
 ('Nederlands', 10),
 ('فارسی', 10),
 ('Tiếng Việt', 10),
 ('اردو', 8),
 ('Română', 8)]

In [35]:
# Number of spoken languages listed for each film.
df['num_languages'] = df['spoken_languages'].apply(len)
# Sorted language names joined with spaces; still created because the next
# cell drops this column by name.
df['all_languages'] = df['spoken_languages'].apply(
    lambda languages: ' '.join(sorted(language['name'] for language in languages))
)
top_languages = [name for name, _ in Counter(i for j in list_of_languages for i in j).most_common(15)]
# One 0/1 indicator per top language, tested against the exact set of
# language names of each film. The empty name '' is among the 15 most common
# values and is a substring of every string, so a substring test on the
# joined string marks the 'language_' column as 1 for every film.
language_name_sets = df['spoken_languages'].apply(
    lambda languages: {language['name'] for language in languages}
)
for g in top_languages:
    df['language_' + g] = language_name_sets.apply(lambda names: int(g in names))

In [36]:
# Remove the parsed records and the helper string column.
df = df.drop(columns=['spoken_languages', 'all_languages'])

In [37]:
# Turn the stringified list-of-dicts column into real Python lists.
# Strings are parsed with literal_eval; empty strings and missing values
# become []. Already-parsed lists are passed through so the cell can be
# re-run without literal_eval raising on a list.
df['Keywords'] = df['Keywords'].apply(
    lambda raw: raw if isinstance(raw, list)
    else (literal_eval(raw) if isinstance(raw, str) and raw != '' else [])
)

In [38]:
# Distribution of the number of keywords per film.
print('Keywords in films')
df['Keywords'].map(len).value_counts()

Keywords in films


5      293
0      276
4      248
3      228
6      227
2      207
7      192
1      187
8      161
9      134
11     132
10     125
12     100
13      85
14      59
15      59
16      58
17      45
18      29
20      24
21      23
19      22
22      17
23      15
24      12
27      12
25       6
26       5
32       2
28       2
30       2
31       2
29       2
37       2
43       1
40       1
38       1
33       1
97       1
39       1
149      1
Name: Keywords, dtype: int64

In [39]:
# Keyword names per film: one inner list for every row of df.
list_of_keywords = [[keyword['name'] for keyword in keywords] for keywords in df['Keywords']]

In [40]:
# Every keyword with its frequency, most frequent first.
Counter(name for names in list_of_keywords for name in names).most_common()

[('woman director', 175),
 ('independent film', 155),
 ('duringcreditsstinger', 134),
 ('murder', 123),
 ('based on novel', 111),
 ('violence', 87),
 ('sport', 82),
 ('biography', 77),
 ('aftercreditsstinger', 75),
 ('dystopia', 73),
 ('revenge', 73),
 ('friendship', 72),
 ('sex', 71),
 ('suspense', 67),
 ('sequel', 66),
 ('love', 64),
 ('police', 64),
 ('teenager', 61),
 ('nudity', 57),
 ('female nudity', 53),
 ('drug', 52),
 ('prison', 50),
 ('musical', 49),
 ('high school', 49),
 ('los angeles', 48),
 ('new york', 47),
 ('family', 45),
 ('father son relationship', 44),
 ('kidnapping', 41),
 ('investigation', 41),
 ('wedding', 40),
 ('3d', 40),
 ('detective', 38),
 ('london england', 38),
 ('paris', 37),
 ('based on comic', 37),
 ('robbery', 36),
 ('based on true story', 35),
 ('dying and death', 34),
 ('escape', 34),
 ('alien', 34),
 ('brother brother relationship', 34),
 ('prostitute', 34),
 ('suicide', 33),
 ('rape', 33),
 ('corruption', 33),
 ('death', 32),
 ('superhero', 32),
 (

In [41]:
# Number of keywords attached to each film.
df['num_keywords'] = df['Keywords'].apply(len)
# Sorted keyword names joined with spaces; still created because the next
# cell drops this column by name.
df['all_keywords'] = df['Keywords'].apply(
    lambda keywords: ' '.join(sorted(keyword['name'] for keyword in keywords))
)
top_keywords = [name for name, _ in Counter(i for j in list_of_keywords for i in j).most_common(30)]
# One 0/1 indicator per top keyword, tested against the exact set of keyword
# names of each film. A substring test on the joined string counts
# 'female nudity' as 'nudity' and matches short keywords such as 'sex' or
# 'love' inside longer, unrelated keywords.
keyword_name_sets = df['Keywords'].apply(
    lambda keywords: {keyword['name'] for keyword in keywords}
)
for g in top_keywords:
    df['keyword_' + g] = keyword_name_sets.apply(lambda names: int(g in names))

In [42]:
# Remove the parsed records and the helper string column.
df = df.drop(columns=['Keywords', 'all_keywords'])

In [43]:
# Turn the stringified list-of-dicts column into real Python lists.
# Strings are parsed with literal_eval; empty strings and missing values
# become []. Already-parsed lists are passed through so the cell can be
# re-run without literal_eval raising on a list.
df['cast'] = df['cast'].apply(
    lambda raw: raw if isinstance(raw, list)
    else (literal_eval(raw) if isinstance(raw, str) and raw != '' else [])
)

In [44]:
# Distribution of the number of cast entries per film.
print('Number of casted actors')
df['cast'].map(len).value_counts()

Number of casted actors


15     212
16     165
10     135
13     129
12     124
11     122
9      118
17     118
18     115
14     110
8      106
19     102
7       96
20      93
22      77
6       76
23      71
21      68
5       57
24      57
4       52
25      50
27      44
29      42
26      39
31      37
30      32
28      30
32      26
0       26
      ... 
69       2
98       2
75       2
76       2
117      2
77       2
105      2
80       2
103      1
95       1
81       1
89       1
87       1
85       1
113      1
83       1
115      1
91       1
64       1
156      1
122      1
118      1
112      1
108      1
92       1
90       1
88       1
84       1
82       1
134      1
Name: cast, Length: 109, dtype: int64

In [45]:
# Cast member names per film, then the 15 most frequent names overall.
list_of_cast_names = [[member['name'] for member in cast] for cast in df['cast']]
Counter(name for names in list_of_cast_names for name in names).most_common(15)

[('Samuel L. Jackson', 30),
 ('Robert De Niro', 30),
 ('Morgan Freeman', 27),
 ('J.K. Simmons', 25),
 ('Bruce Willis', 25),
 ('Liam Neeson', 25),
 ('Susan Sarandon', 25),
 ('Bruce McGill', 24),
 ('John Turturro', 24),
 ('Forest Whitaker', 23),
 ('Willem Dafoe', 23),
 ('Bill Murray', 22),
 ('Owen Wilson', 22),
 ('Nicolas Cage', 22),
 ('Sylvester Stallone', 21)]

In [46]:
# 'gender' value of every cast entry per film, then overall frequencies.
list_of_cast_genders = [[member['gender'] for member in cast] for cast in df['cast']]
Counter(gender for genders in list_of_cast_genders for gender in genders).most_common()

[(2, 27949), (0, 20329), (1, 13533)]

In [47]:
# Character names per film, then the 15 most frequent character names.
list_of_cast_characters = [[member['character'] for member in cast] for cast in df['cast']]
Counter(character for characters in list_of_cast_characters for character in characters).most_common(15)

[('', 818),
 ('Himself', 610),
 ('Herself', 155),
 ('Dancer', 144),
 ('Additional Voices (voice)', 100),
 ('Doctor', 77),
 ('Reporter', 70),
 ('Waitress', 69),
 ('Nurse', 65),
 ('Bartender', 55),
 ('Jack', 54),
 ('Debutante', 54),
 ('Security Guard', 50),
 ('Paul', 48),
 ('Frank', 44)]

In [48]:
# Number of cast entries per film.
df['num_cast'] = df['cast'].apply(len)

# 0/1 indicator per top cast member, compared against the 'name' field only.
# Searching str(x) matches the text of every field of every record, so a name
# could be found inside a character name or any other value.
top_cast_names = [name for name, _ in Counter(i for j in list_of_cast_names for i in j).most_common(15)]
cast_name_sets = df['cast'].apply(lambda cast: {member['name'] for member in cast})
for g in top_cast_names:
    df['cast_name_' + g] = cast_name_sets.apply(lambda names: int(g in names))

# Number of cast entries per 'gender' code (0, 1, 2) for each film.
df['genders_0_cast'] = df['cast'].apply(lambda cast: sum(1 for member in cast if member['gender'] == 0))
df['genders_1_cast'] = df['cast'].apply(lambda cast: sum(1 for member in cast if member['gender'] == 1))
df['genders_2_cast'] = df['cast'].apply(lambda cast: sum(1 for member in cast if member['gender'] == 2))

# 0/1 indicator per top character name, compared against the 'character'
# field only. With the str(x) search, the empty character name '' matched
# every film and 'Jack' matched any film featuring 'Samuel L. Jackson'.
top_cast_characters = [name for name, _ in Counter(i for j in list_of_cast_characters for i in j).most_common(15)]
cast_character_sets = df['cast'].apply(lambda cast: {member['character'] for member in cast})
for g in top_cast_characters:
    df['cast_character_' + g] = cast_character_sets.apply(lambda characters: int(g in characters))

In [49]:
# The parsed cast records are no longer needed once the features exist.
df = df.drop(columns=['cast'])

In [50]:
# Turn the stringified list-of-dicts column into real Python lists.
# Strings are parsed with literal_eval; empty strings and missing values
# become []. Already-parsed lists are passed through so the cell can be
# re-run without literal_eval raising on a list.
df['crew'] = df['crew'].apply(
    lambda raw: raw if isinstance(raw, list)
    else (literal_eval(raw) if isinstance(raw, str) and raw != '' else [])
)

In [51]:
# Distribution of the number of crew entries per film.
print('Number of crew members in films')
df['crew'].map(len).value_counts()

Number of crew members in films


2      179
11     127
10     126
3      126
12     110
9      109
8      109
14     104
4      101
7       94
16      89
15      85
6       85
13      84
17      82
18      77
5       72
1       64
19      61
22      59
21      47
20      45
24      41
27      37
23      35
28      31
30      31
25      30
31      29
26      26
      ... 
112      2
110      2
102      2
131      1
129      1
149      1
151      1
153      1
161      1
121      1
79       1
114      1
74       1
94       1
113      1
177      1
194      1
172      1
156      1
154      1
150      1
148      1
142      1
134      1
130      1
126      1
122      1
98       1
96       1
183      1
Name: crew, Length: 153, dtype: int64

In [52]:
# Crew member names per film, then the 20 most frequent names overall.
list_of_crew_names = [[member['name'] for member in crew] for crew in df['crew']]
Counter(name for names in list_of_crew_names for name in names).most_common(20)

[('Avy Kaufman', 50),
 ('Robert Rodriguez', 44),
 ('Deborah Aquila', 40),
 ('James Newton Howard', 39),
 ('Mary Vernieu', 38),
 ('Steven Spielberg', 37),
 ('Luc Besson', 37),
 ('Jerry Goldsmith', 37),
 ('Francine Maisler', 35),
 ('Tricia Wood', 35),
 ('James Horner', 33),
 ('Kerry Barden', 32),
 ('Bob Weinstein', 30),
 ('Harvey Weinstein', 30),
 ('Janet Hirshenson', 30),
 ('Jane Jenkins', 29),
 ('John Debney', 28),
 ('Hans Zimmer', 28),
 ('John Papsidera', 28),
 ('Francis Ford Coppola', 28)]

In [53]:
# Job titles per film, then the 15 most frequent job titles overall.
list_of_crew_jobs = [[member['job'] for member in crew] for crew in df['crew']]
Counter(job for jobs in list_of_crew_jobs for job in jobs).most_common(15)

[('Producer', 6011),
 ('Executive Producer', 3459),
 ('Director', 3225),
 ('Screenplay', 2996),
 ('Editor', 2824),
 ('Casting', 2483),
 ('Director of Photography', 2288),
 ('Original Music Composer', 1947),
 ('Art Direction', 1821),
 ('Production Design', 1650),
 ('Costume Design', 1573),
 ('Writer', 1523),
 ('Set Decoration', 1345),
 ('Makeup Artist', 1108),
 ('Sound Re-Recording Mixer', 970)]

In [54]:
# 'gender' value of every crew entry per film, then overall frequencies.
list_of_crew_genders = [[member['gender'] for member in crew] for crew in df['crew']]
Counter(gender for genders in list_of_crew_genders for gender in genders).most_common(15)

[(0, 41787), (2, 24898), (1, 6412)]

In [55]:
# Departments per film, then the 10 most frequent departments overall.
list_of_crew_departments = [[member['department'] for member in crew] for crew in df['crew']]
Counter(department for departments in list_of_crew_departments for department in departments).most_common(10)

[('Production', 15887),
 ('Sound', 9319),
 ('Art', 8069),
 ('Crew', 7315),
 ('Writing', 6567),
 ('Costume & Make-Up', 6156),
 ('Camera', 5424),
 ('Directing', 4954),
 ('Editing', 4508),
 ('Visual Effects', 3591)]

In [56]:
# Crew member names per film, then the 15 most frequent names overall.
# Built as a plain list of lists, the same type the earlier crew-name cell
# binds to this name, instead of rebinding it to a NumPy object array.
list_of_crew_names = list(df['crew'].apply(lambda crew: [member['name'] for member in crew]).values)
Counter(name for names in list_of_crew_names for name in names).most_common(15)

[('Avy Kaufman', 50),
 ('Robert Rodriguez', 44),
 ('Deborah Aquila', 40),
 ('James Newton Howard', 39),
 ('Mary Vernieu', 38),
 ('Steven Spielberg', 37),
 ('Luc Besson', 37),
 ('Jerry Goldsmith', 37),
 ('Francine Maisler', 35),
 ('Tricia Wood', 35),
 ('James Horner', 33),
 ('Kerry Barden', 32),
 ('Bob Weinstein', 30),
 ('Harvey Weinstein', 30),
 ('Janet Hirshenson', 30)]

In [57]:
# Number of crew entries per film.
df['num_crew'] = df['crew'].apply(len)

# 0/1 indicator per top crew member, compared against the 'name' field only.
# Searching str(x) matches the text of every field of every record, so a name
# could be found inside a longer name or inside another field's value.
top_crew_names = [name for name, _ in Counter(i for j in list_of_crew_names for i in j).most_common(15)]
crew_name_sets = df['crew'].apply(lambda crew: {member['name'] for member in crew})
for g in top_crew_names:
    df['crew_name_' + g] = crew_name_sets.apply(lambda names: int(g in names))

# Number of crew entries per 'gender' code (0, 1, 2) for each film.
df['genders_0_crew'] = df['crew'].apply(lambda crew: sum(1 for member in crew if member['gender'] == 0))
df['genders_1_crew'] = df['crew'].apply(lambda crew: sum(1 for member in crew if member['gender'] == 1))
df['genders_2_crew'] = df['crew'].apply(lambda crew: sum(1 for member in crew if member['gender'] == 2))

# Per film, how many crew entries hold each of the 15 most common jobs.
top_crew_jobs = [name for name, _ in Counter(i for j in list_of_crew_jobs for i in j).most_common(15)]
for j in top_crew_jobs:
    df['jobs_' + j] = df['crew'].apply(lambda crew: sum(1 for member in crew if member['job'] == j))

# Per film, how many crew entries belong to each of the most common departments
# (at most 15).
top_crew_departments = [name for name, _ in Counter(i for j in list_of_crew_departments for i in j).most_common(15)]
for j in top_crew_departments:
    df['departments_' + j] = df['crew'].apply(lambda crew: sum(1 for member in crew if member['department'] == j))