In [6]:
import pandas

def parse_file(fileName):
    """Load a CSV file and return its rows as a list of dictionaries.

    Args:
        fileName: Path or URL of the CSV file; handed straight to
            ``pandas.read_csv``.

    Returns:
        A list with one dict per CSV row, keyed by column name.
    """
    frame = pandas.read_csv(fileName)
    rows = frame.to_dict(orient='records')
    return rows

# Fetch the movies CSV from the FiveThirtyEight GitHub repository (needs network access)
# and parse it into a list of dicts, one per row.
movies = parse_file('https://raw.githubusercontent.com/fivethirtyeight/data/master/bechdel/movies.csv')

In [7]:
type(movies) # list

list

In [8]:
len(movies) # 1794

1794

In [9]:
movies[0]

# budget_2013$ is the budget adjusted for inflation in 2013 dollars
# domgross_2013$ is the domestic revenue adjusted for inflation in 2013 dollars
# intgross_2013$ is the international revenue adjusted for inflation in 2013 dollars

{'year': 2013,
 'imdb': 'tt1711425',
 'title': '21 &amp; Over',
 'test': 'notalk',
 'clean_test': 'notalk',
 'binary': 'FAIL',
 'budget': 13000000,
 'domgross': 25682380.0,
 'intgross': 42195766.0,
 'code': '2013FAIL',
 'budget_2013$': 13000000,
 'domgross_2013$': 25682380.0,
 'intgross_2013$': 42195766.0,
 'period code': 1.0,
 'decade code': 1.0}

In [37]:
# Let's remove the movies whose 'domgross_2013$' value is nan, which stands for "not a number". This data is missing.

import math

def remove_movies_missing_data(movies):
    """Return the movies whose 'domgross_2013$' value is not nan.

    Args:
        movies: Iterable of movie dicts, each holding a numeric
            'domgross_2013$' entry (nan marks missing data).

    Returns:
        A new list containing the same dict objects, in their original
        order, minus those whose 'domgross_2013$' is nan. The input is not
        modified.
    """
    # Single pass: keep a movie as soon as its value is known to be a real
    # number, instead of first collecting the nan movies and then scanning
    # that list (with dict equality) once per movie.
    return [movie for movie in movies if not math.isnan(movie['domgross_2013$'])]

# https://stackoverflow.com/questions/2514961/remove-all-values-within-one-list-from-another-list/30353802 

In [38]:
parsed_movies = remove_movies_missing_data(movies)

In [39]:
len(parsed_movies)

1776

In [40]:
# we can check to see that no movies with a domgross_2013 value of nan are included.

list(filter(lambda movie: math.isnan(movie['domgross_2013$']),parsed_movies)) # []

[]

In [43]:
# Problem: Currently, our data has some very large numbers. (Hard to deal with/read) e.g.:

movies[0]['budget']

13000000

In [92]:
# To make things simpler, let's divide both our budget and revenue numbers for each movie by 1 million.
# That is: divide the budget, budget_2013$, domgross, domgross_2013$, intgross, and intgross_2013$ values by 1 million and round each to two decimal places.

def scale_down_movie(movie):
    """Convert a movie's dollar amounts to millions, rounded to two decimals.

    The dict is modified in place and the same dict is returned, so calling
    this twice on one dict scales it twice. Pass ``dict(movie)`` if the
    original values must be kept.

    Args:
        movie: Dict with numeric 'budget', 'budget_2013$', 'domgross',
            'domgross_2013$', 'intgross' and 'intgross_2013$' entries.

    Returns:
        The same ``movie`` dict, with those six entries divided by one
        million and rounded to two decimal places.

    Raises:
        KeyError: If any of the six entries is missing.
    """
    # Every key must be listed exactly once: a repeated key gets divided by
    # a million twice and collapses to 0.0 after rounding.
    selected_keys = (
        'budget',
        'budget_2013$',
        'domgross',
        'domgross_2013$',
        'intgross',
        'intgross_2013$',
    )
    for key in selected_keys:
        movie[key] = round(movie[key] / 1000000, 2)
    return movie

In [93]:
# NOTE: scale_down_movie modifies parsed_movies[9] in place, so re-running this cell
# divides the already-scaled values again (the 0.0 values in the output below are
# consistent with the dict having been scaled more than once).
scale_down_movie(parsed_movies[9])

#  {'binary': 'FAIL',
#   'budget': 130.0,
#   'budget_2013$': 130.0,
#   'clean_test': 'notalk',
#   'code': '2013FAIL',
#   'decade code': 1.0,
#   'domgross': 60.52,
#   'domgross_2013$': 60.52,
#   'imdb': 'tt1815862',
#   'intgross': 244.37,
#   'intgross_2013$': 244.37,
#   'period code': 1.0,
#   'test': 'notalk',
#   'title': 'After Earth',
#   'year': 2013}

{'year': 2013,
 'imdb': 'tt1815862',
 'title': 'After Earth',
 'test': 'notalk',
 'clean_test': 'notalk',
 'binary': 'FAIL',
 'budget': 0.0,
 'domgross': 0.0,
 'intgross': 0.0,
 'code': '2013FAIL',
 'budget_2013$': 0.0,
 'domgross_2013$': 0.0,
 'intgross_2013$': 0.0,
 'period code': 1.0,
 'decade code': 1.0}

In [94]:
# NOTE: this also modifies parsed_movies[8] in place; run it only once per fresh kernel.
scale_down_movie(parsed_movies[8])

#  {'binary': 'PASS', 'budget': 13.0,
#   'budget_2013$': 13.0,
#   'clean_test': 'ok',
#   'code': '2013PASS',
#   'decade code': 1.0,
#   'domgross': 18.01,
#   'domgross_2013$': 18.01,
#   'imdb': 'tt1814621',
#   'intgross': 18.01,
#   'intgross_2013$': 18.01,
#   'period code': 1.0,
#   'test': 'ok',
#   'title': 'Admission',
#   'year': 2013}

{'year': 2013,
 'imdb': 'tt1814621',
 'title': 'Admission',
 'test': 'ok',
 'clean_test': 'ok',
 'binary': 'PASS',
 'budget': 0.0,
 'domgross': 0.0,
 'intgross': 0.0,
 'code': '2013PASS',
 'budget_2013$': 0.0,
 'domgross_2013$': 0.0,
 'intgross_2013$': 0.0,
 'period code': 1.0,
 'decade code': 1.0}

In [86]:
# OK, now that we have a function to scale down a single movie, let's map over all of our parsed_movies to return a list of scaled movies.

def scale_down_movies(movies):
    """Return a new list with every movie's dollar amounts scaled to millions.

    Each movie is shallow-copied before being passed to ``scale_down_movie``,
    so the dicts in ``movies`` are left unchanged and calling this again on
    the same input gives the same result instead of scaling the data twice.

    Args:
        movies: Iterable of movie dicts accepted by ``scale_down_movie``.

    Returns:
        A new list of scaled movie dicts, in the same order as ``movies``.
        An empty input yields an empty list.
    """
    # Return the mapped result itself rather than discarding it and relying
    # on side effects on the caller's dicts.
    return [scale_down_movie(dict(movie)) for movie in movies]

In [87]:
# Work on a small slice so the result is easy to inspect.
first_ten_movies = parsed_movies[0:10]
# `or []` falls back to an empty list if scale_down_movies returns a falsy value (e.g. None).
first_ten_scaled = scale_down_movies(first_ten_movies) or []
# Display the last two scaled movies.
first_ten_scaled[-2:]
# [{'binary': 'PASS', 'budget': 13.0,
#   'budget_2013$': 13.0,
#   'clean_test': 'ok',
#   'code': '2013PASS',
#   'decade code': 1.0,
#   'domgross': 18.01,
#   'domgross_2013$': 18.01,
#   'imdb': 'tt1814621',
#   'intgross': 18.01,
#   'intgross_2013$': 18.01,
#   'period code': 1.0,
#   'test': 'ok',
#   'title': 'Admission',
#   'year': 2013},
#  {'binary': 'FAIL',
#   'budget': 130.0,
#   'budget_2013$': 130.0,
#   'clean_test': 'notalk',
#   'code': '2013FAIL',
#   'decade code': 1.0,
#   'domgross': 60.52,
#   'domgross_2013$': 60.52,
#   'imdb': 'tt1815862',
#   'intgross': 244.37,
#   'intgross_2013$': 244.37,
#   'period code': 1.0,
#   'test': 'notalk',
#   'title': 'After Earth',
#   'year': 2013}]

[{'year': 2013,
  'imdb': 'tt1814621',
  'title': 'Admission',
  'test': 'ok',
  'clean_test': 'ok',
  'binary': 'PASS',
  'budget': 0.0,
  'domgross': 0.0,
  'intgross': 0.0,
  'code': '2013PASS',
  'budget_2013$': 0.0,
  'domgross_2013$': 0.0,
  'intgross_2013$': 0.0,
  'period code': 1.0,
  'decade code': 1.0},
 {'year': 2013,
  'imdb': 'tt1815862',
  'title': 'After Earth',
  'test': 'notalk',
  'clean_test': 'notalk',
  'binary': 'FAIL',
  'budget': 0.0,
  'domgross': 0.0,
  'intgross': 0.0,
  'code': '2013FAIL',
  'budget_2013$': 0.0,
  'domgross_2013$': 0.0,
  'intgross_2013$': 0.0,
  'period code': 1.0,
  'decade code': 1.0}]