In [1]:
import pandas as pd
import json
import numpy as np
import re

In [2]:
# Inport the CSV files from disk
movies_metadata_path = '../../Boot_Camp_files/Module_8/movies_metadata.csv'
movies_metadata_df = pd.read_csv(movies_metadata_path, low_memory=False)
ratings_path = '../../Boot_Camp_files/Module_8/ratings.csv'
ratings_df = pd.read_csv(ratings_path)
ratings_df.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [3]:
with open('wikipedia-movies (1).json',mode = 'r') as file:
    wiki_movies_raw = json.load(file)

In [4]:
len(wiki_movies_raw)

7311

In [5]:
# Convert the list of dictionaries to a df to inspect
wiki_movies_raw_df = pd.DataFrame(wiki_movies_raw)

In [6]:
wiki_movies_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7311 entries, 0 to 7310
Columns: 193 entries, url to Polish
dtypes: float64(1), object(192)
memory usage: 10.8+ MB


In [7]:
print(sorted(wiki_movies_raw_df.columns.tolist()))

['Actor control', 'Adaptation by', 'Alias', 'Alma mater', 'Also known as', 'Animation by', 'Arabic', 'Area', 'Area served', 'Artist(s)', 'Attraction type', 'Audio format', 'Author', 'Based on', 'Biographical data', 'Bopomofo', 'Born', 'Box office', 'Budget', 'Camera setup', 'Cantonese', 'Characters', 'Children', 'Chinese', 'Cinematography', 'Closing date', 'Color process', 'Comics', 'Composer(s)', 'Coordinates', 'Country', 'Country of origin', 'Cover artist', 'Created by', 'Date premiered', 'Designer(s)', 'Developed by', 'Developer(s)', 'Dewey Decimal', 'Died', 'Directed by', 'Director', 'Distributed by', 'Distributor', 'Divisions', 'Duration', 'Edited by', 'Editor(s)', 'Ending theme', 'Engine', 'Engine(s)', 'Executive producer(s)', 'Family', 'Fate', 'Film(s)', 'Followed by', 'Format(s)', 'Formerly', 'Founded', 'Founder', 'Founders', 'French', 'Full name', 'Gender', 'Genre', 'Genre(s)', 'Genres', 'Gwoyeu Romatzyh', 'Hangul', 'Hanyu Pinyin', 'Headquarters', 'Hebrew', 'Height', 'Hepburn'

Use list comprehension to cut the raw movies read to only those that have a director (either 'Director' or 'Directed by'), have a url for an imdb entry, and do not have any values under "No. of episodes"

In [8]:

wiki_movies = [movie for movie in wiki_movies_raw 
               if ('Director' in movie or 'Directed by' in movie) and 
               'imdb_link' in movie and
              'No. of episodes' not in movie]

In [9]:
len(wiki_movies)

7076

I identified column names in the wiki_movies list of dictionaries that are alternate names for movies.  THe function is passed a dictionary for a single movie.  If one of the identified column names has data for a dictionary then that value is saved to a separate dictionary and the value is removed from the movie database.  If there was an alternate title then a new key value pair is added to the movie.

Going back to add a sub function to the clean_movie function.  For columns that have the same content but slightly different names the column names are changed to get rid of the excess columns.

I use a dictionary of the columns to be removed and the column name for the change.

In [10]:
def clean_movie(movie):
    local_movie = dict(movie)
    alt_titles = {}
    for key in ['Also known as','Arabic','Cantonese','Chinese','French',
                'Hangul','Hebrew','Hepburn','Japanese','Literally',
                'Mandarin','McCune–Reischauer','Original title','Polish',
                'Revised Romanization','Romanized','Russian',
                'Simplified','Traditional','Yiddish']:
        if key in local_movie:
            alt_titles[key] = local_movie[key]
            local_movie.pop(key)
    if len(alt_titles) > 0:
        local_movie['alt_titles'] = alt_titles

#define embedded function to remove a column that is equivalent to another column and pop the value to the remaining column
    def change_column_name (old_key,new_key):
        if old_key in local_movie:
            local_movie[new_key] = local_movie.pop(old_key)
# Dictionary of repeated (nearly) column names and what they should be changed to
    old_key_dict = {'Directed by':'Director',
                   'Country of origin':'Country',
                   'Distributed by':'Distributor',
                   'Edited by':'Editor(s)',
                   'Produced by':'Producer',
                    'Producer(s)':'Producer',
                    'Production company(s)':'Productioncompany',
                    'Productioncompanies ':'Productioncompany',
                    'Released':'Release date',
                    'Running time':'Length',
                    'Screen story by':'Writer(s)',
                    'Screenplay by':'Writer(s)',
                   'Release Date':'Release date,',
                   'Story by':'Writer(s)',
                   'Theme music composer':'Composer(s)',
                   'Written by':'Writer(s)'}
    
    for key,value in old_key_dict.items():
       change_column_name(key,value)
                    
    return local_movie

In [11]:
clean_movies = [clean_movie(movie) for movie in wiki_movies]

In [12]:
wiki_movies_df = pd.DataFrame(clean_movies)
sorted(wiki_movies_df.columns.tolist())

['Adaptation by',
 'Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Created by',
 'Director',
 'Distributor',
 'Editor(s)',
 'Executive producer(s)',
 'Followed by',
 'Genre',
 'Label',
 'Language',
 'Length',
 'Music by',
 'Narrated by',
 'Original language(s)',
 'Original network',
 'Original release',
 'Picture format',
 'Preceded by',
 'Producer',
 'Production location(s)',
 'Productioncompany',
 'Productioncompany ',
 'Recorded',
 'Release date',
 'Starring',
 'Suggested by',
 'Venue',
 'Voices of',
 'Writer(s)',
 'alt_titles',
 'imdb_link',
 'title',
 'url',
 'year']

In [13]:
wiki_movies_df['imdb_link'].head()

0    https://www.imdb.com/title/tt0098987/
1    https://www.imdb.com/title/tt0098994/
2    https://www.imdb.com/title/tt0099005/
3    https://www.imdb.com/title/tt0099012/
4    https://www.imdb.com/title/tt0099018/
Name: imdb_link, dtype: object

In [14]:
print(len(wiki_movies_df))
wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')
wiki_movies_df.drop_duplicates(subset = 'imdb_id', inplace=True)
print(len(wiki_movies_df))
wiki_movies_df.head()


7076
7033


Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Narrated by,Music by,Cinematography,Productioncompany,...,Preceded by,Adaptation by,Suggested by,alt_titles,Recorded,Venue,Label,Animation by,Color process,imdb_id
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...","Andrew ""Dice"" Clay","[Cliff Eidelman, Yello]",Oliver Wood,Silver Pictures,...,,,,,,,,,,tt0098987
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",,Maurice Jarre,Mark Plummer,Avenue Pictures,...,,,,,,,,,,tt0098994
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",,Charles Gross,Roger Deakins,"[Carolco Pictures, IndieProd Company]",...,,,,,,,,,,tt0099005
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",,,Carlo Di Palma,,...,,,,,,,,,,tt0099012
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",,Maurice Jarre,Russell Boyd,,...,,,,,,,,,,tt0099018


Use list comprehension to identify the columns that have more than 90% of the rows with non null values.

In [15]:
columns_to_keep = [column for column in wiki_movies_df.columns if wiki_movies_df[column].isnull().sum() 
                   < 0.9 * len(wiki_movies_df)]
wiki_movies_df = wiki_movies_df[columns_to_keep]
wiki_movies_df.head()

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Music by,Cinematography,Productioncompany,Release date,...,Budget,Box office,Director,Distributor,Editor(s),Producer,Length,Writer(s),Productioncompany.1,imdb_id
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...","[Cliff Eidelman, Yello]",Oliver Wood,Silver Pictures,"[July 11, 1990, (, 1990-07-11, )]",...,$20 million,$21.4 million,Renny Harlin,20th Century Fox,Michael Tronick,"[Steve Perry, Joel Silver]",102 minutes,"[David Arnott, James Cappe]",,tt0098987
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",Maurice Jarre,Mark Plummer,Avenue Pictures,"[May 17, 1990, (, 1990-05-17, ), (Cannes Film ...",...,$6 million,$2.7 million,James Foley,Avenue Pictures,Howard E. Smith,"[Ric Kidney, Robert Redlin]",114 minutes,"[James Foley, Robert Redlin]",,tt0098994
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",Charles Gross,Roger Deakins,"[Carolco Pictures, IndieProd Company]","[August 10, 1990, (, 1990-08-10, )]",...,$35 million,"$57,718,089",Roger Spottiswoode,TriStar Pictures,"[John Bloom, Lois Freeman-Fox]",Daniel Melnick,113 minutes,"[John Eskow, Richard Rush]",,tt0099005
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",,Carlo Di Palma,,"[December 25, 1990, (, 1990-12-25, )]",...,$12 million,"$7,331,647",Woody Allen,Orion Pictures,Susan E. Morse,Robert Greenhut,106 minutes,Woody Allen,,tt0099012
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",Maurice Jarre,Russell Boyd,,"December 19, 1990",...,$25 million,"$6,939,946 (USA)",John Cornell,Paramount Pictures,David Stiven,John Cornell,95 minutes,Paul Hogan,,tt0099018


In [16]:
wiki_movies_df.dtypes

url                   object
year                   int64
imdb_link             object
title                 object
Based on              object
Starring              object
Music by              object
Cinematography        object
Productioncompany     object
Release date          object
Country               object
Language              object
Budget                object
Box office            object
Director              object
Distributor           object
Editor(s)             object
Producer              object
Length                object
Writer(s)             object
Productioncompany     object
imdb_id               object
dtype: object

In [17]:
box_office = wiki_movies_df['Box office'].dropna()

In [18]:
def is_not_a_string(x):
    return type(x) != str


In [19]:
box_office[box_office.map(lambda x: type(x) != str)]

34                           [US$, 4,212,828]
54      [$6,698,361 (, United States, ), [2]]
74                    [$6,488,144, (US), [1]]
126                [US$1,531,489, (domestic)]
130                          [US$, 4,803,039]
                        ...                  
6980               [$99.6, million, [4], [5]]
6994                   [$365.6, million, [1]]
6995                         [$53.8, million]
7015                     [$435, million, [7]]
7048                   [$529.3, million, [4]]
Name: Box office, Length: 135, dtype: object

In [20]:
box_office = box_office.apply(lambda x: ' '.join(x) if type(x) == list else x)

In [21]:
box_office.head(50)

0                        $21.4 million
1                         $2.7 million
2                          $57,718,089
3                           $7,331,647
4                     $6,939,946 (USA)
9                             $855,810
10                      $195.3 million
11                       $53.2 million
12                       $15.7 million
13                       $52.1 million
14                      $244.5 million
15                         $12,626,043
17                         $19,740,070
18                        $138,697,012
20                          $1,292,323
21                        $8.2 million
22                       $15.6 million
24                          $2,070,871
25       $27.6 million (United States)
27                             $10,173
29                          $5 million
33                       $35.8 million
34                       US$ 4,212,828
37                        $2.4 million
39                            $947,306
40                       

In [22]:
box_office = box_office.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)

In [23]:
form_one = r'\$\s*\d+\.?\d*\s*[mb]illion'

In [24]:
matches_form_one = box_office.str.contains(form_one, flags=re.IGNORECASE)

In [25]:
form_two = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)'

In [26]:
matches_form_two = box_office.str.contains(form_two, flags=re.IGNORECASE)

In [27]:
box_office[(~ matches_form_one) & (~ matches_form_two)]

600                      $5000 (US)
1070                     35,254,617
1480                     £3 million
1865                   ¥1.1 billion
2032                            N/A
2091                           $309
2665    926,423 admissions (France)
3631                            TBA
3879       CN¥3.650 million (China)
4116                     £7,385,434
4306                            $30
4492                   $47.7 millon
4561        $45.2k (only in Turkey)
5447                          £2.56
5784                       413 733$
6013                        Unknown
6369                          $111k
6370                           $588
6593                 less than $372
6843                        8 crore
6904                    $6.9 millon
Name: Box office, dtype: object

In [28]:
def parse_dollars(s):
    # if s is not a string return NaN
    if type(s) != str:
        return np.nan
    # if input is of the form $###.# million
    if re.match(r'\$\s*\d+\.?\d*\s*milli?on',s, flags=re.IGNORECASE):
        # remove dollar sign and work million
        s = re.sub(r'\$|\s|[a-zA-Z]','',s)
        # convert to float and multiply by 1000000
        value = float(s) * 1000000
        return value
        # return number
    # if input is of the form $xxx.x billion
    elif re.match(r'\$\s*\d+\.?\d*\s*billi?on',s, flags=re.IGNORECASE):
        # remove $ and billion
        s = re.sub(r'\$|\s|[a-zA-Z]','',s)
        # convert to float
        value = float(s) * 1000000000
        return value
        # multiply by 1000000000
        # return number
    # if of the form $xxx,xxx,xxx
    elif re.match(r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)',s,flags=re.IGNORECASE):
        # remove $ and commas
        s = re.sub(r'\$|\s|,','',s)
        # convert to float
        value = float(s)
        return value
        # return number
    # if of the form $$ xxx [x] [x] million
    elif re.match(r'\$\d{1,3}\s\[\d{1}\]\s\[?\d?\]?\s?mill?i?on',s,flags = re.IGNORECASE):
        # remove $ , spaces, and [x]
        s = re.sub(r'\$|\[\d\]|\s|mill?i?on','',s)
        value = float(s) * 1000000
        return value
    # else return NaN
    else:
        return np.nan

In [29]:
wiki_movies_df['box office'] = box_office.str.extract(f'({form_one}|{form_two})',flags=re.IGNORECASE)[0].apply(parse_dollars)

In [30]:
wiki_movies_df['box office']

0       21400000.0
1        2700000.0
2       57718089.0
3        7331647.0
4        6939946.0
           ...    
7071    41900000.0
7072    76100000.0
7073    38400000.0
7074     5500000.0
7075           NaN
Name: box office, Length: 7033, dtype: float64

In [31]:
budget = wiki_movies_df['Budget'].dropna()

In [32]:
budget = budget.map(lambda x:  ' '.join(x) if type(x) == list else x)

In [33]:
budget = budget.str.replace(r'\$.*[-—–](?![a-zA-Z])', '$', regex=True)

In [34]:
budget

0        $20 million
1         $6 million
2        $35 million
3        $12 million
4        $25 million
            ...     
7070    €4.3 million
7071     $42 million
7072     $60 million
7073     $20 million
7074      $9 million
Name: Budget, Length: 4738, dtype: object

In [35]:
form_three = r'\$\d{1,3}\s\[\d{1}\]\s\[?\d?\]?\s?mill?i?on'

In [36]:
matches_form_one = budget.str.contains(form_one,flags=re.IGNORECASE)

In [37]:
matches_form_two = budget.str.contains(form_two,flags=re.IGNORECASE)

In [38]:
matches_form_three = budget.str.contains(form_three,flags = re.IGNORECASE)

In [39]:
budget[(~ matches_form_one) & (~ matches_form_two) & (~ matches_form_three)]

136                         Unknown
204     60 million Norwegian Kroner
478                         Unknown
1226                        Unknown
1278                            HBO
1374                     £6,000,000
1397                     13 million
1480                   £2.8 million
1734                   CAD2,000,000
1913     PHP 85 million (estimated)
1948                    102,888,900
1953                   3,500,000 DM
1973                     ₤2,300,874
2281                     $14 milion
2451                     ₤6,350,000
3144                   € 40 million
3418                        $218.32
3802                   £4.2 million
3906                            N/A
3959                    760,000 USD
4470                       19 crore
4641                    £17 million
5424                            N/A
5447                     £4 million
5671                    €14 million
5687                   $ dead link]
6385               £ 12 million [3]
6593                     £3 

In [40]:
wiki_movies_df['budget'] = budget.str.extract(f'({form_one}|{form_two}|{form_three})',
                                                      flags=re.IGNORECASE)[0].apply(parse_dollars)

In [41]:
# wiki_movies_df.drop('Budget',axis = 1,inplace=True)
wiki_movies_df.drop('Box office', axis=1, inplace=True)
wiki_movies_df.columns

Index(['url', 'year', 'imdb_link', 'title', 'Based on', 'Starring', 'Music by',
       'Cinematography', 'Productioncompany ', 'Release date', 'Country',
       'Language', 'Budget', 'Director', 'Distributor', 'Editor(s)',
       'Producer', 'Length', 'Writer(s)', 'Productioncompany', 'imdb_id',
       'box office', 'budget'],
      dtype='object')

In [42]:
release_date = wiki_movies_df['Release date'].dropna().map(lambda x:  ' '.join(x) if type(x) == list else x)

In [43]:
release_date

0                            July 11, 1990 ( 1990-07-11 )
1       May 17, 1990 ( 1990-05-17 ) (Cannes Film Marke...
2                          August 10, 1990 ( 1990-08-10 )
3                        December 25, 1990 ( 1990-12-25 )
4                                       December 19, 1990
                              ...                        
7071     December 25, 2018 ( 2018-12-25 ) (United States)
7072    December 11, 2018 ( 2018-12-11 ) ( Samuel Gold...
7073    November 8, 2018 ( 2018-11-08 ) ( AFI Fest ) D...
7074    August 31, 2018 ( 2018-08-31 ) ( Telluride ) D...
7075                      28 December 2018 ( 2018-12-28 )
Name: Release date, Length: 6842, dtype: object

In [44]:
date_form_one = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s[123]\d,\s\d{4}'
date_form_two = r'\d{4}.[01]\d.[123]\d'
date_form_three = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}'
date_form_four = r'\d{4}'

In [45]:
release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})', flags=re.IGNORECASE)

Unnamed: 0,0
0,"July 11, 1990"
1,"May 17, 1990"
2,"August 10, 1990"
3,"December 25, 1990"
4,"December 19, 1990"
...,...
7071,"December 25, 2018"
7072,"December 11, 2018"
7073,2018
7074,"August 31, 2018"


In [46]:
wiki_movies_df['release_date'] = pd.to_datetime(release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})')[0], 
                                                infer_datetime_format=True)

In [47]:
wiki_movies_df.drop('Release date',axis = 1, inplace=True)

In [48]:
wiki_movies_df.columns

Index(['url', 'year', 'imdb_link', 'title', 'Based on', 'Starring', 'Music by',
       'Cinematography', 'Productioncompany ', 'Country', 'Language', 'Budget',
       'Director', 'Distributor', 'Editor(s)', 'Producer', 'Length',
       'Writer(s)', 'Productioncompany', 'imdb_id', 'box office', 'budget',
       'release_date'],
      dtype='object')

In [49]:
running_time = wiki_movies_df['Length'].dropna().apply(lambda x:  ' '.join(x) if type(x)==list else x)

In [50]:
running_time

0                                 102 minutes
1                                 114 minutes
2                                 113 minutes
3                                 106 minutes
4                                  95 minutes
                        ...                  
7071                               90 minutes
7072                              132 minutes
7073                              120 minutes
7074                              123 minutes
7075    Variable; 90 minutes for default path
Name: Length, Length: 6894, dtype: object

In [51]:
running_time[running_time.str.contains(r'^\d*\s*m', flags=re.IGNORECASE) != True]

668                     UK:84 min (DVD version) US:86 min
727                         78-102 min (depending on cut)
840                       Varies (79 [3] –84 [1] minutes)
1347                                              25 : 03
1443    United States: 77 minutes Argentina: 94 minute...
1499                                            1hr 35min
1551                                               varies
1774                    Netherlands:96 min, Canada:95 min
1777                                       approx. 14 min
2273                                           1 h 43 min
2993                                               1h 48m
3925                                              4 hours
4425    US domestic version: 86 minutes Original versi...
4967    Theatrical cut: 97 minutes Unrated cut: 107 mi...
5424                    115 [1] /123 [2] /128 [3] minutes
5447                                    1 hour 32 minutes
7075                Variable; 90 minutes for default path
Name: Length, 

In [52]:
running_time_extract = running_time.str.extract(r'(\d+)\s*ho?u?r?s?\s*(\d*)|(\d+)\s*m')

In [53]:
running_time_extract

Unnamed: 0,0,1,2
0,,,102
1,,,114
2,,,113
3,,,106
4,,,95
...,...,...,...
7071,,,90
7072,,,132
7073,,,120
7074,,,123


In [54]:
running_time_extract = running_time_extract.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)

In [55]:
running_time_extract

Unnamed: 0,0,1,2
0,0.0,0.0,102.0
1,0.0,0.0,114.0
2,0.0,0.0,113.0
3,0.0,0.0,106.0
4,0.0,0.0,95.0
...,...,...,...
7071,0.0,0.0,90.0
7072,0.0,0.0,132.0
7073,0.0,0.0,120.0
7074,0.0,0.0,123.0


In [56]:
wiki_movies_df['running_time'] = running_time_extract.apply(lambda row: row[0]*60 + 
                                                            row[1] if row[2] == 0 else row[2], axis=1)

In [57]:
wiki_movies_df.drop('Length',axis=1, inplace=True)

In [58]:
wiki_movies_df.columns

Index(['url', 'year', 'imdb_link', 'title', 'Based on', 'Starring', 'Music by',
       'Cinematography', 'Productioncompany ', 'Country', 'Language', 'Budget',
       'Director', 'Distributor', 'Editor(s)', 'Producer', 'Writer(s)',
       'Productioncompany', 'imdb_id', 'box office', 'budget', 'release_date',
       'running_time'],
      dtype='object')

In [59]:
wiki_movies_df.to_csv('output/wiki_movies_df.csv')