In [1]:
import numpy as np
import pandas as pd
import datetime
import re
import csv
import json
import os

In [2]:
# Read in "The Movies dataset"
# if this is read in as csv, each row is a string and dictionaries are not recognized.
df=pd.read_csv("Resources/movies_metadata.csv", low_memory=False)
print(df.describe())
print(df.columns)
print(df["release_date"].dtypes)

            revenue       runtime  vote_average    vote_count
count  4.546000e+04  45203.000000  45460.000000  45460.000000
mean   1.120935e+07     94.128199      5.618207    109.897338
std    6.433225e+07     38.407810      1.924216    491.310374
min    0.000000e+00      0.000000      0.000000      0.000000
25%    0.000000e+00     85.000000      5.000000      3.000000
50%    0.000000e+00     95.000000      6.000000     10.000000
75%    0.000000e+00    107.000000      6.800000     34.000000
max    2.787965e+09   1256.000000     10.000000  14075.000000
Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')
object


In [141]:
# clean up and change release_date to datetime
# save at each change to make reverting easy
r_date=df["release_date"]
r_date.str.strip()  #remove spaces
form1=r'(^[0-9]{4})'
rel_year = r_date.str.extract(f'({form1})')  #extract just year
rel_year["release_year"]=pd.to_numeric(rel_year[0])
print(rel_year.dtypes)
rel_year.drop([0,1],axis=1,inplace=True)
print(f'null rows: {rel_year.isnull().sum()}')
rel_year.dropna(how="any", inplace=True)
print(f'number of movies: {len(rel_year)}')
print(rel_year.dtypes)

0                object
1                object
release_year    float64
dtype: object
null rows: release_year    90
dtype: int64
number of movies: 45376
release_year    float64
dtype: object


In [154]:
rel_year=rel_year[rel_year["release_year"] > 1989]
year_counts=rel_year["release_year"].value_counts()
year_counts

2014.0    1974
2015.0    1905
2013.0    1889
2012.0    1722
2011.0    1667
2016.0    1604
2009.0    1586
2010.0    1501
2008.0    1473
2007.0    1320
2006.0    1270
2005.0    1125
2004.0     992
2002.0     905
2003.0     882
2001.0     865
2000.0     789
1999.0     723
1998.0     722
1997.0     661
1996.0     633
1995.0     599
1994.0     544
2017.0     532
1993.0     489
1992.0     453
1990.0     427
1991.0     426
2018.0       5
2020.0       1
Name: release_year, dtype: int64

In [44]:
len(rel_year)

29684

In [119]:
# production_companies is a string that looks like a list of dictionaries
# filter rows to keep only those with 'iso_3166_1' = 'US'
pd_countries=df["production_countries"]  # pd_countries is a series
form1=r"'iso_3166_1':\s*('US')"
country = pd_countries.str.extract(f'({form1})')
country.drop([0],axis=1,inplace=True)
print(country.notnull().sum())
country.dropna(how="any", inplace=True)
len(country)


1    21153
dtype: int64


21153

In [155]:
movies_year_df = pd.concat([df, rel_year, country], axis=1, ignore_index=False, join="inner")
len(movies_year_df)

12906

In [156]:
movies_year_df.columns

Index([                'adult', 'belongs_to_collection',
                      'budget',                'genres',
                    'homepage',                    'id',
                     'imdb_id',     'original_language',
              'original_title',              'overview',
                  'popularity',           'poster_path',
        'production_companies',  'production_countries',
                'release_date',               'revenue',
                     'runtime',      'spoken_languages',
                      'status',               'tagline',
                       'title',                 'video',
                'vote_average',            'vote_count',
                'release_year',                       1],
      dtype='object')

In [157]:
movies_year_df["release_year"].value_counts()

2014.0    820
2015.0    788
2013.0    740
2016.0    707
2011.0    659
2012.0    658
2009.0    633
2008.0    622
2007.0    565
2010.0    547
2006.0    541
2005.0    472
2004.0    391
2002.0    383
1996.0    359
2001.0    357
2000.0    356
1998.0    355
2003.0    348
1999.0    335
1997.0    332
1995.0    320
2017.0    295
1994.0    292
1993.0    281
1990.0    256
1991.0    249
1992.0    242
2018.0      2
2020.0      1
Name: release_year, dtype: int64

In [None]:
# Save to csv
file_path ="Resources/segment1_input.csv"
movies_year_df.to_csv(file_path,index=False)
