In [1]:
import os
import pandas as pd

In [16]:
# Resolve every path relative to the parent of the current working directory.
cwd = os.getcwd()
BASE_DIR = os.path.dirname(cwd)
DATA_DIR, CACHE_DIR = (
    os.path.join(BASE_DIR, folder) for folder in ('data', 'cache')
)

# Input (raw combined rankings) and output (cleaned rankings) both live in CACHE_DIR.
working_file, output_file = (
    os.path.join(CACHE_DIR, csv_name)
    for csv_name in (
        'box-office-ranking-dataset.csv',
        'box-office-ranking-cleaned-dataset.csv',
    )
)

In [3]:
# Load the combined rankings CSV from the cache directory.
df = pd.read_csv(filepath_or_buffer=working_file)

In [4]:
# Preview the first 999 rows of the raw frame.
df.head(999)

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,%,Foreign,%.1,filename,year
0,1,Toy Story 3,"$1,066,969,703","$415,004,880",38.9%,"$651,964,823",61.1%,2010_boxofficemojo_rankings.csv,2010
1,2,Alice in Wonderland,"$1,025,467,110","$334,191,110",32.6%,"$691,276,000",67.4%,2010_boxofficemojo_rankings.csv,2010
2,3,Harry Potter and the Deathly Hallows: Part 1,"$976,536,918","$295,983,305",30.3%,"$680,553,613",69.7%,2010_boxofficemojo_rankings.csv,2010
3,4,Inception,"$826,137,188","$292,576,195",35.4%,"$533,560,993",64.6%,2010_boxofficemojo_rankings.csv,2010
4,5,Shrek Forever After,"$752,600,867","$238,736,787",31.7%,"$513,864,080",68.3%,2010_boxofficemojo_rankings.csv,2010
...,...,...,...,...,...,...,...,...,...
994,410,Zama,"$489,692","$200,600",41%,"$289,092",59%,2017_boxofficemojo_rankings.csv,2017
995,411,Beach Rats,"$486,623","$473,771",97.4%,"$12,852",2.6%,2017_boxofficemojo_rankings.csv,2017
996,412,The Journey,"$482,209","$155,475",32.2%,"$326,734",67.8%,2017_boxofficemojo_rankings.csv,2017
997,413,DCI 2017 Tour Premeire,"$467,817","$467,817",100%,-,-,2017_boxofficemojo_rankings.csv,2017


In [5]:
# Overwrite every Rank value with the placeholder -1.
df = df.assign(Rank=-1)
df.head(5)

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,%,Foreign,%.1,filename,year
0,-1,Toy Story 3,"$1,066,969,703","$415,004,880",38.9%,"$651,964,823",61.1%,2010_boxofficemojo_rankings.csv,2010
1,-1,Alice in Wonderland,"$1,025,467,110","$334,191,110",32.6%,"$691,276,000",67.4%,2010_boxofficemojo_rankings.csv,2010
2,-1,Harry Potter and the Deathly Hallows: Part 1,"$976,536,918","$295,983,305",30.3%,"$680,553,613",69.7%,2010_boxofficemojo_rankings.csv,2010
3,-1,Inception,"$826,137,188","$292,576,195",35.4%,"$533,560,993",64.6%,2010_boxofficemojo_rankings.csv,2010
4,-1,Shrek Forever After,"$752,600,867","$238,736,787",31.7%,"$513,864,080",68.3%,2010_boxofficemojo_rankings.csv,2010


In [6]:
# Copy the ambiguous '%' / '%.1' columns into descriptively named ones.
df = df.assign(**{
    'Domestic %': df['%'],
    'Foreign %': df['%.1'],
})
df.head(5)

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,%,Foreign,%.1,filename,year,Domestic %,Foreign %
0,-1,Toy Story 3,"$1,066,969,703","$415,004,880",38.9%,"$651,964,823",61.1%,2010_boxofficemojo_rankings.csv,2010,38.9%,61.1%
1,-1,Alice in Wonderland,"$1,025,467,110","$334,191,110",32.6%,"$691,276,000",67.4%,2010_boxofficemojo_rankings.csv,2010,32.6%,67.4%
2,-1,Harry Potter and the Deathly Hallows: Part 1,"$976,536,918","$295,983,305",30.3%,"$680,553,613",69.7%,2010_boxofficemojo_rankings.csv,2010,30.3%,69.7%
3,-1,Inception,"$826,137,188","$292,576,195",35.4%,"$533,560,993",64.6%,2010_boxofficemojo_rankings.csv,2010,35.4%,64.6%
4,-1,Shrek Forever After,"$752,600,867","$238,736,787",31.7%,"$513,864,080",68.3%,2010_boxofficemojo_rankings.csv,2010,31.7%,68.3%


In [7]:
# The originals are redundant now that the named copies exist.
df = df.drop(columns=['%', '%.1'])
df.head(5)

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,Foreign,filename,year,Domestic %,Foreign %
0,-1,Toy Story 3,"$1,066,969,703","$415,004,880","$651,964,823",2010_boxofficemojo_rankings.csv,2010,38.9%,61.1%
1,-1,Alice in Wonderland,"$1,025,467,110","$334,191,110","$691,276,000",2010_boxofficemojo_rankings.csv,2010,32.6%,67.4%
2,-1,Harry Potter and the Deathly Hallows: Part 1,"$976,536,918","$295,983,305","$680,553,613",2010_boxofficemojo_rankings.csv,2010,30.3%,69.7%
3,-1,Inception,"$826,137,188","$292,576,195","$533,560,993",2010_boxofficemojo_rankings.csv,2010,35.4%,64.6%
4,-1,Shrek Forever After,"$752,600,867","$238,736,787","$513,864,080",2010_boxofficemojo_rankings.csv,2010,31.7%,68.3%


In [8]:
# Columns whose values are currency strings (e.g. "$1,066,969,703") to be
# converted to integers.
to_clean_cols = ['Worldwide', 'Domestic', 'Foreign']


def currency_str_to_int(current_value):
    """Convert a currency value such as ``"$1,066,969,703"`` to an ``int``.

    Parameters
    ----------
    current_value : str or number or None
        Usually a string with an optional ``$`` sign and thousands separators.
        Non-string values are accepted so that missing cells (read by pandas
        as NaN) and already-converted numbers do not raise.

    Returns
    -------
    int
        The parsed amount, or ``0`` when the value cannot be interpreted as an
        integer (for example the ``"-"`` placeholder, an empty string, NaN or
        ``None``).
    """
    if not isinstance(current_value, str):
        # Numbers pass through (floats are truncated); NaN, inf and None
        # cannot be converted and fall back to 0 like any unparseable string.
        try:
            return int(current_value)
        except (TypeError, ValueError, OverflowError):
            return 0

    digits = current_value.replace("$", "").replace(",", "")
    try:
        return int(digits)
    except ValueError:
        # Only parsing failures are swallowed; anything else should surface.
        return 0


def clean_col(row, cols=None, converter=None):
    """Return a copy of ``row`` with its currency columns converted.

    Intended for ``DataFrame.apply(clean_col, axis=1)``. The incoming row is
    copied before modification because pandas does not support functions that
    mutate the object handed to ``apply``.

    Parameters
    ----------
    row : mapping-like
        A single record supporting ``copy()`` and item access by column name
        (a ``pandas.Series`` when called through ``DataFrame.apply``).
    cols : iterable of str, optional
        Column names to convert. Defaults to the module-level
        ``to_clean_cols``.
    converter : callable, optional
        Function applied to each selected value. Defaults to
        ``currency_str_to_int``.

    Returns
    -------
    Same type as ``row``
        A new record with the selected columns converted.
    """
    if cols is None:
        cols = to_clean_cols
    if converter is None:
        converter = currency_str_to_int

    cleaned = row.copy()
    for col in cols:
        cleaned[col] = converter(row[col])
    return cleaned


# Run the row-wise cleaner over every record to build the cleaned frame.
df_cleaned = df.apply(func=clean_col, axis='columns')
df_cleaned.head(5)

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,Foreign,filename,year,Domestic %,Foreign %
0,-1,Toy Story 3,1066969703,415004880,651964823,2010_boxofficemojo_rankings.csv,2010,38.9%,61.1%
1,-1,Alice in Wonderland,1025467110,334191110,691276000,2010_boxofficemojo_rankings.csv,2010,32.6%,67.4%
2,-1,Harry Potter and the Deathly Hallows: Part 1,976536918,295983305,680553613,2010_boxofficemojo_rankings.csv,2010,30.3%,69.7%
3,-1,Inception,826137188,292576195,533560993,2010_boxofficemojo_rankings.csv,2010,35.4%,64.6%
4,-1,Shrek Forever After,752600867,238736787,513864080,2010_boxofficemojo_rankings.csv,2010,31.7%,68.3%


In [9]:
# Inspect the column dtypes of the cleaned frame.
df_cleaned.dtypes

Rank              int64
Release Group    object
Worldwide         int64
Domestic          int64
Foreign           int64
filename         object
year              int64
Domestic %       object
Foreign %        object
dtype: object

In [10]:
# Order films from highest to lowest worldwide gross.
df_cleaned = df_cleaned.sort_values('Worldwide', ascending=False)
df_cleaned.head(5)

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,Foreign,filename,year,Domestic %,Foreign %
5746,-1,Avengers: Endgame,2797800564,858373000,1939427564,2019_boxofficemojo_rankings.csv,2019,30.7%,69.3%
6522,-1,Star Wars: Episode VII - The Force Awakens,2068223624,936662225,1131561399,2015_boxofficemojo_rankings.csv,2015,45.3%,54.7%
2724,-1,Avengers: Infinity War,2048359754,678815482,1369544272,2018_boxofficemojo_rankings.csv,2018,33.1%,66.9%
6523,-1,Jurassic World,1670400637,652270625,1018130012,2015_boxofficemojo_rankings.csv,2015,39%,61%
5747,-1,The Lion King,1656943394,543638043,1113305351,2019_boxofficemojo_rankings.csv,2019,32.8%,67.2%


In [11]:
# Replace the shuffled index with a fresh 0..n-1 index, discarding the old one.
df_cleaned = df_cleaned.reset_index(drop=True)
df_cleaned.head(5)

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,Foreign,filename,year,Domestic %,Foreign %
0,-1,Avengers: Endgame,2797800564,858373000,1939427564,2019_boxofficemojo_rankings.csv,2019,30.7%,69.3%
1,-1,Star Wars: Episode VII - The Force Awakens,2068223624,936662225,1131561399,2015_boxofficemojo_rankings.csv,2015,45.3%,54.7%
2,-1,Avengers: Infinity War,2048359754,678815482,1369544272,2018_boxofficemojo_rankings.csv,2018,33.1%,66.9%
3,-1,Jurassic World,1670400637,652270625,1018130012,2015_boxofficemojo_rankings.csv,2015,39%,61%
4,-1,The Lion King,1656943394,543638043,1113305351,2019_boxofficemojo_rankings.csv,2019,32.8%,67.2%


In [12]:
# Rank is the index position shifted by one.
df_cleaned = df_cleaned.assign(Rank=df_cleaned.index + 1)
df_cleaned.head(5)

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,Foreign,filename,year,Domestic %,Foreign %
0,1,Avengers: Endgame,2797800564,858373000,1939427564,2019_boxofficemojo_rankings.csv,2019,30.7%,69.3%
1,2,Star Wars: Episode VII - The Force Awakens,2068223624,936662225,1131561399,2015_boxofficemojo_rankings.csv,2015,45.3%,54.7%
2,3,Avengers: Infinity War,2048359754,678815482,1369544272,2018_boxofficemojo_rankings.csv,2018,33.1%,66.9%
3,4,Jurassic World,1670400637,652270625,1018130012,2015_boxofficemojo_rankings.csv,2015,39%,61%
4,5,The Lion King,1656943394,543638043,1113305351,2019_boxofficemojo_rankings.csv,2019,32.8%,67.2%


In [13]:
# Recompute the domestic/foreign shares as fractions of the worldwide gross.
worldwide_gross = df_cleaned['Worldwide']
df_cleaned = df_cleaned.assign(**{
    'Domestic %': df_cleaned['Domestic'].div(worldwide_gross),
    'Foreign %': df_cleaned['Foreign'].div(worldwide_gross),
})
df_cleaned.head(5)

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,Foreign,filename,year,Domestic %,Foreign %
0,1,Avengers: Endgame,2797800564,858373000,1939427564,2019_boxofficemojo_rankings.csv,2019,0.306803,0.693197
1,2,Star Wars: Episode VII - The Force Awakens,2068223624,936662225,1131561399,2015_boxofficemojo_rankings.csv,2015,0.452882,0.547118
2,3,Avengers: Infinity War,2048359754,678815482,1369544272,2018_boxofficemojo_rankings.csv,2018,0.331395,0.668605
3,4,Jurassic World,1670400637,652270625,1018130012,2015_boxofficemojo_rankings.csv,2015,0.390488,0.609512
4,5,The Lion King,1656943394,543638043,1113305351,2019_boxofficemojo_rankings.csv,2019,0.328097,0.671903


In [14]:
# Inspect the column dtypes again after recomputing the percentage columns.
df_cleaned.dtypes

Rank               int64
Release Group     object
Worldwide          int64
Domestic           int64
Foreign            int64
filename          object
year               int64
Domestic %       float64
Foreign %        float64
dtype: object

In [17]:
# Persist the cleaned frame without writing the index column.
df_cleaned.to_csv(path_or_buf=output_file, index=False)