In [342]:
import pandas as pd
%matplotlib inline

In [343]:
df = pd.read_csv('tmdb-movies.csv')

To get a list of columns

In [344]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10866 entries, 0 to 10865
Data columns (total 21 columns):
id                      10866 non-null int64
imdb_id                 10856 non-null object
popularity              10866 non-null float64
budget                  10866 non-null int64
revenue                 10866 non-null int64
original_title          10866 non-null object
cast                    10790 non-null object
homepage                2936 non-null object
director                10822 non-null object
tagline                 8042 non-null object
keywords                9373 non-null object
overview                10862 non-null object
runtime                 10866 non-null int64
genres                  10843 non-null object
production_companies    9836 non-null object
release_date            10866 non-null object
vote_count              10866 non-null int64
vote_average            10866 non-null float64
release_year            10866 non-null int64
budget_adj              1

Just before proceeding with any thing, let's drop all dumpicates

In [345]:
duplicate_records = sum(df.duplicated())
df.drop_duplicates(inplace=True)
print('{} duplicate records have been deleted'.format(duplicate_records))

1 duplicate records have been deleted


From the `df.info()` does show that there is a number of missing values, to show how many are missing:

In [346]:
df.isnull().sum().sort_values(ascending=False)

homepage                7929
tagline                 2824
keywords                1493
production_companies    1030
cast                      76
director                  44
genres                    23
imdb_id                   10
overview                   4
popularity                 0
budget                     0
revenue                    0
original_title             0
revenue_adj                0
budget_adj                 0
runtime                    0
release_date               0
vote_count                 0
vote_average               0
release_year               0
id                         0
dtype: int64

Some of these columns are not needed, no need to waste any resource processing them: such as imdb_id, homepage

In [347]:
df.drop(labels=['imdb_id', 'homepage', 'tagline', 'overview', 'keywords', 'production_companies', 'id'], axis=1, inplace=True)

In [348]:
df_clean = df.dropna(axis=0)
df.shape[0] - df_clean.shape[0]

134

In [349]:
# To get any abnormalities:
df_clean.describe()

Unnamed: 0,popularity,budget,revenue,runtime,vote_count,vote_average,release_year,budget_adj,revenue_adj
count,10731.0,10731.0,10731.0,10731.0,10731.0,10731.0,10731.0,10731.0,10731.0
mean,0.652615,14803650.0,40319890.0,102.468829,219.812972,5.96471,2001.259622,17765300.0,52006230.0
std,1.004804,31064560.0,117652400.0,30.493873,578.815324,0.930283,12.820151,34466300.0,145425200.0
min,0.000188,0.0,0.0,0.0,10.0,1.5,1960.0,0.0,0.0
25%,0.210765,0.0,0.0,90.0,17.0,5.4,1995.0,0.0,0.0
50%,0.387081,0.0,0.0,99.0,39.0,6.0,2006.0,0.0,0.0
75%,0.720889,16000000.0,25000000.0,112.0,148.0,6.6,2011.0,21108850.0,34705460.0
max,32.985763,425000000.0,2781506000.0,900.0,9767.0,9.2,2015.0,425000000.0,2827124000.0


Let's see how many unique cast we have:

In [350]:
s = set()
for cast in df_clean['cast']:
    s.update(cast.split('|'))
print("There are: {num_casts} unique cast".format(num_casts=len(s)))

There are: 18930 unique cast


One initial idea I had is to add 18930 column to the dataframe, the index name of those are the cast names, and values are boolean which represent whether the cast has been in the movie or not. But let's be honest, such details are useless. Another approach, let's see how many casts listed per each movie:

In [351]:
s = set()
for cast in df_clean['cast']:
    s.update([len(cast.split('|'))])
print(s)

{1, 2, 3, 4, 5}


So, the number of casts listed for each movie is between 1 and 5.

In [352]:
def f(text):
    if len(text.split('|')) == 1:
        return text

x = [df_clean['cast'].apply(f)]

In [353]:
df_clean.head()

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,runtime,genres,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,124,Action|Adventure|Science Fiction|Thriller,6/9/15,5562,6.5,2015,137999900.0,1392446000.0
1,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,120,Action|Adventure|Science Fiction|Thriller,5/13/15,6185,7.1,2015,137999900.0,348161300.0
2,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,119,Adventure|Science Fiction|Thriller,3/18/15,2480,6.3,2015,101200000.0,271619000.0
3,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,136,Action|Adventure|Science Fiction|Fantasy,12/15/15,5292,7.5,2015,183999900.0,1902723000.0
4,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,137,Action|Crime|Thriller,4/1/15,2947,7.3,2015,174799900.0,1385749000.0


In [354]:
df_clean

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,runtime,genres,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,124,Action|Adventure|Science Fiction|Thriller,6/9/15,5562,6.5,2015,1.379999e+08,1.392446e+09
1,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,120,Action|Adventure|Science Fiction|Thriller,5/13/15,6185,7.1,2015,1.379999e+08,3.481613e+08
2,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,119,Adventure|Science Fiction|Thriller,3/18/15,2480,6.3,2015,1.012000e+08,2.716190e+08
3,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,136,Action|Adventure|Science Fiction|Fantasy,12/15/15,5292,7.5,2015,1.839999e+08,1.902723e+09
4,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,137,Action|Crime|Thriller,4/1/15,2947,7.3,2015,1.747999e+08,1.385749e+09
5,9.110700,135000000,532950503,The Revenant,Leonardo DiCaprio|Tom Hardy|Will Poulter|Domhn...,Alejandro GonzÃ¡lez IÃ±Ã¡rritu,156,Western|Drama|Adventure|Thriller,12/25/15,3929,7.2,2015,1.241999e+08,4.903142e+08
6,8.654359,155000000,440603537,Terminator Genisys,Arnold Schwarzenegger|Jason Clarke|Emilia Clar...,Alan Taylor,125,Science Fiction|Action|Thriller|Adventure,6/23/15,2598,5.8,2015,1.425999e+08,4.053551e+08
7,7.667400,108000000,595380321,The Martian,Matt Damon|Jessica Chastain|Kristen Wiig|Jeff ...,Ridley Scott,141,Drama|Adventure|Science Fiction,9/30/15,4572,7.6,2015,9.935996e+07,5.477497e+08
8,7.404165,74000000,1156730962,Minions,Sandra Bullock|Jon Hamm|Michael Keaton|Allison...,Kyle Balda|Pierre Coffin,91,Family|Animation|Adventure|Comedy,6/17/15,2893,6.5,2015,6.807997e+07,1.064192e+09
9,6.326804,175000000,853708609,Inside Out,Amy Poehler|Phyllis Smith|Richard Kind|Bill Ha...,Pete Docter,94,Comedy|Animation|Family,6/9/15,3935,8.0,2015,1.609999e+08,7.854116e+08


In [368]:
def extract_actor(t, i):
    print(t, i)
    try:
        return t.split('|')[i]
    except IndexError:
        return ''
    
df_clean = df_clean.assign(actor1=extract_actor(df_clean.cast, 0))

0        Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...
1        Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...
2        Shailene Woodley|Theo James|Kate Winslet|Ansel...
3        Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...
4        Vin Diesel|Paul Walker|Jason Statham|Michelle ...
5        Leonardo DiCaprio|Tom Hardy|Will Poulter|Domhn...
6        Arnold Schwarzenegger|Jason Clarke|Emilia Clar...
7        Matt Damon|Jessica Chastain|Kristen Wiig|Jeff ...
8        Sandra Bullock|Jon Hamm|Michael Keaton|Allison...
9        Amy Poehler|Phyllis Smith|Richard Kind|Bill Ha...
10       Daniel Craig|Christoph Waltz|LÃ©a Seydoux|Ralp...
11       Mila Kunis|Channing Tatum|Sean Bean|Eddie Redm...
12       Domhnall Gleeson|Alicia Vikander|Oscar Isaac|S...
13       Adam Sandler|Michelle Monaghan|Peter Dinklage|...
14       Robert Downey Jr.|Chris Hemsworth|Mark Ruffalo...
15       Samuel L. Jackson|Kurt Russell|Jennifer Jason ...
16       Liam Neeson|Forest Whitaker|Maggie Grace|Famke.

AttributeError: 'Series' object has no attribute 'split'

In [374]:
df_clean['cast'].isna().any()

False