# Transform Load

Combine and preprocess data from multiple sources in order to analyze movie profits.

In [1]:
# Libraries used
import pandas as pd
import re
import utilities as utl

# t_profits

In [2]:
# Read movie profits data (the CSV has no header row)
profits = pd.read_csv('staging/profits.csv', header=None)

# Set column and index names
profits.columns = ['box_office', 'budget', 'year']
# movie_id is the 1-based row position; it is sized from the data instead of
# being hard-coded to 250 rows, so a shorter or longer file still loads.
profits.index = pd.RangeIndex(1, len(profits) + 1, name='movie_id')

In [3]:
# Read movie titles data (only the first column of the header-less CSV)
titles = pd.read_csv('staging/movies.csv', header=None, usecols=[0])

# Set column and index names
titles.columns = ['title']
# movie_id is the 1-based row position; it is sized from the data instead of
# being hard-coded to 250 rows, so a shorter or longer file still loads.
titles.index = pd.RangeIndex(1, len(titles) + 1, name='movie_id')

In [4]:
# Put titles and profits side by side (aligned on movie_id) in a single
# Data Frame called 'movies', then preview the first rows
frames = [titles, profits]
movies = pd.concat(frames, axis='columns')
movies.head(3)

Unnamed: 0_level_0,title,box_office,budget,year
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,The Shawshank Redemption,"$28,884,504","$25,000,000",1994
2,The Godfather,"$250,341,816","$6,000,000",1972
3,The Dark Knight,"$1,006,234,167","$185,000,000",2008


## Handling Missing Values

We can use the `.isna()` method to create a boolean mask and take a look at the rows with missing values.

In [5]:
# Flag every row that has at least one empty field, then report those rows
rows_with_gaps = movies.isna().any(axis=1)
missing = movies.loc[rows_with_gaps]
print(f'\nThere are {len(missing)} movies with missing data regarding their budget or box office revenue.\n\n{missing}')


There are 24 movies with missing data regarding their budget or box office revenue.

                               title   box_office       budget  year
movie_id                                                            
45                          Harakiri      $15,222          NaN  1962
86                      High and Low      $46,808          NaN  1963
92                      Come and See  $20,929,648          NaN  1985
96                                 M      $35,566          NaN  1931
99                             Ikiru      $96,302          NaN  1952
107                         Hamilton          NaN          NaN  2020
122              Like Stars on Earth  $21,897,373          NaN  2007
145                          Yojimbo      $46,808          NaN  1961
179                            Klaus          NaN  $40,000,000  2019
183                On the Waterfront          NaN     $910,000  1954
187                Wild Strawberries      $60,418          NaN  1957
188              

## Dropping Missing Values

We can see that they are mostly old movies. ***For the sake of simplicity, we will omit these films from our analysis.***

In [6]:
# Drop every row that has at least one missing value.
# The explicit .copy() makes `movies` an independent frame rather than a slice
# of the previous one, so the column assignments performed in the following
# cells do not trigger pandas' SettingWithCopyWarning.
movies = movies.dropna().copy()

# Transform

In [7]:
# Get rid of commas ',' in the box office and budget fields.
# regex=False makes the comma a literal string match, not a pattern.
for money_col in ('box_office', 'budget'):
    movies[money_col] = movies[money_col].str.replace(',', '', regex=False)

## Handling box office revenues

Notice that all box office revenues are in USD.

In [8]:
# Drop the non-digit prefix (the '$' symbol) of each box office value and keep
# the digits as an integer, stored in a new column called 'box_office_usd'
box_office_usd = pd.Series(
    [int(re.match('([^0-9]+)([0-9]+)', raw).group(2)) for raw in movies['box_office']],
    index=movies.index,
)
movies['box_office_usd'] = box_office_usd

## Handling budgets

In [9]:
# Extract currency symbols and amounts from the budget column.
# Each budget string is matched only once: group 1 is the non-digit prefix
# (currency symbol or code) and group 2 is the run of digits that follows it.
budget_parts = [re.match('([^0-9]+)([0-9]+)', s).groups() for s in movies['budget']]
budget_currency = pd.Series([parts[0] for parts in budget_parts], index=movies.index)
budget_amount = pd.Series([parts[1] for parts in budget_parts], index=movies.index)

# Store data in two new columns (budget_amount is still a string of digits here)
movies['budget_currency'] = budget_currency
movies['budget_amount'] = budget_amount

## Handling foreign currencies in budgets

In [10]:
# Remember which rows have a budget that is not expressed in '$'
is_foreign = budget_currency != '$'
foreign_currencies = budget_currency[is_foreign]

print(f'\nThere are {len(foreign_currencies)} budgets in one of the following foreign currencies:\n{foreign_currencies.unique()}\n')
movies.loc[foreign_currencies.index, ['title', 'year', 'budget_currency', 'budget_amount']]


There are 18 budgets in one of the following foreign currencies:
['¥' 'R$' 'FRF' '€' 'DEM' '₹' '£' 'A$' '₩']



Unnamed: 0_level_0,title,year,budget_currency,budget_amount
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20,Seven Samurai,1954,¥,125000000
24,City of God,2002,R$,3300000
35,Léon: The Professional,1994,FRF,115000000
46,The Intouchables,2011,€,9500000
77,Das Boot,1981,DEM,32000000
79,Princess Mononoke,1997,¥,2400000000
82,Your Name.,2016,¥,370000000
84,3 Idiots,2009,₹,550000000
114,Metropolis,1927,DEM,6000000
118,Snatch,2000,£,6000000


In [11]:
# Replace the budget_currency column values with their ISO 4217 equivalent.
# NOTE(review): utl.iso_standard comes from the local `utilities` module and is
# presumably a mapping from currency symbol to ISO code (e.g. '¥' -> 'JPY') --
# confirm there. Values without an entry are left unchanged by Series.replace.
movies['budget_currency'] = movies['budget_currency'].replace(utl.iso_standard)

### Convert budgets in foreign currencies to USD

In [12]:
# Convert budget amounts to USD using our custom function "to_usd()".
# Arguments are passed as (currency code, year, integer amount); budget_amount
# still holds digit strings at this point, hence the int() cast.
# The columns are iterated by name with zip() rather than by integer position
# on iterrows() rows, because positional integer keys on a label-indexed Series
# are deprecated in recent pandas releases.
budget_usd_values = [
    utl.to_usd(currency, year, int(amount))
    for currency, year, amount in zip(
        movies['budget_currency'], movies['year'], movies['budget_amount']
    )
]

# Store data in a new column called 'budget_usd'
movies['budget_usd'] = pd.Series(budget_usd_values, index=movies.index)

In [13]:
# Inspect the converted USD budgets for the rows that were in a foreign currency
preview_columns = ['title', 'budget', 'budget_currency', 'year', 'budget_usd']
movies.loc[foreign_currencies.index, preview_columns]

Unnamed: 0_level_0,title,budget,budget_currency,year,budget_usd
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20,Seven Samurai,¥125000000,JPY,1954,347222
24,City of God,R$3300000,BRL,2002,909150
35,Léon: The Professional,FRF115000000,FRF,1994,21337786
46,The Intouchables,€9500000,EUR,2011,12255000
77,Das Boot,DEM32000000,DEM,1981,16202532
79,Princess Mononoke,¥2400000000,JPY,1997,18397854
82,Your Name.,¥370000000,JPY,2016,3163475
84,3 Idiots,₹550000000,INR,2009,11362392
114,Metropolis,DEM6000000,DEM,1927,1428571
118,Snatch,£6000000,GBP,2000,9000000


## Adjust for Inflation
We can add an extra column with an "inflation factor" that converts amounts to constant 2023 US dollars, which allows us to compare historical revenues more fairly.  

Data is obtained from https://www.officialdata.org/us/inflation/1888?amount=1

In [14]:
# Inflation data for the years 1888 to 2023
# (1888 is the year when the first motion picture was recorded!).
# Only the first and third columns of the CSV are read.
inflation = pd.read_csv(
    'other/inflation_data.csv',
    usecols=[0, 2],
)

In [15]:
# Display the inflation table (one row per year with its inflation rate)
inflation

Unnamed: 0,year,inflation rate
0,1888,0.00
1,1889,-0.03
2,1890,-0.01
3,1891,0.00
4,1892,0.00
...,...,...
131,2019,0.02
132,2020,0.01
133,2021,0.05
134,2022,0.08


# Calculate inflation factors
In this context, ***inflation factors*** are the factors that equalize the purchasing power of a currency by eliminating the differences in price levels between years. All amounts will be converted to 2023 prices for a better comparison of historical data.

In [16]:
# Compute "inflation factors" to convert to 2023 prices
lst = [1]
for i in range(1,136):
    lst += [lst[-1+i] * (1 + inflation.iloc[-i][1])]

# Store data into a new column called 'inflation_factor_2023'
inflation['inflation_factor_2023'] = pd.DataFrame(lst[::-1]).round(6)

In [17]:
# Attach the inflation data to each movie by release year, then restore the
# original layout: rows ordered by movie_id, with movie_id as the index again
movies = (
    movies
    .reset_index()
    .merge(inflation, how='inner', on='year')
    .sort_values(by=['movie_id'])
    .set_index('movie_id')
)

In [18]:
# Keep only the columns needed for the profits table and display the result
final_columns = ['title', 'year', 'box_office_usd', 'budget_usd', 'inflation_factor_2023']
profits = movies[final_columns]
profits

Unnamed: 0_level_0,title,year,box_office_usd,budget_usd,inflation_factor_2023
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,The Shawshank Redemption,1994,28884504,25000000,2.030414
2,The Godfather,1972,250341816,6000000,7.183692
3,The Dark Knight,2008,1006234167,185000000,1.395895
4,The Godfather Part II,1974,47961919,13000000,6.105467
5,12 Angry Men,1957,955,350000,10.643040
...,...,...,...,...,...
246,The Iron Giant,1999,23335817,70000000,1.803473
247,The Help,2011,216639112,25000000,1.328664
248,Dersu Uzala,1975,14480,4000000,5.601346
249,Aladdin,1992,504050219,28000000,2.154066


In [19]:
# Persist the final profits table as a Parquet file, keeping movie_id (the index)
profits.to_parquet('master/t_profits.parquet', index=True)