### 4. Load movies

In [8]:
# import necessary libraries
import json
import re

In [9]:
def load_data(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, encoding='utf-8') as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

# Load the movie records once; the cleaning cells below iterate over `movies`.
movies = load_data('./dataset/movies.json')

### 5. Data Cleaning

#### Subtasks
- [x] Convert running time to number of minutes
- [x] Convert box office and budget to numbers.
- [ ] Convert release date to a Python date. If there is only one release date in the array, use that one. If there is more than one release date, take the US release date.
- [x] Remove references [1] [2] etc. `(Done while loading the data)`
- [x] Convert ASCII codes to readable text while saving the file as JSON. `(Done while loading the data)`
- [x] Get rid of all errors while scraping the data. `(Done while loading the data)`

In [10]:
# Explore the raw 'Running time' field on a 50-movie window (indices 75-124)
# to learn which formats the cleaning step has to handle.
for movie in movies[75:125]:
    raw_running_time = movie.get('Running time')
    print(raw_running_time)

92 minutes
131 mins.
87 minutes
116 minutes
93 minutes
110 min.
110 min.
131 minutes
101 minutes
108 minutes
84 minutes
78 minutes
75 minutes
['164 minutes', '(', 'Los Angeles', 'premiere)', '144 minutes', '(', 'New York City', 'premiere)', '118 minutes', '(General release)', '172 minutes', '(', "Director's cut", ')']
106 minutes
110 minutes
99 minutes
113 mins.
108 minutes
112 minutes
93 minutes
91 minutes
93 minutes
100 minutes
100 minutes
79 minutes
96 minutes
113 minutes
89 minutes
['118 minutes (1971 original version)', '139 minutes (1996 reconstruction version)']
92 minutes
88 minutes
92 minutes
87 minutes
93 minutes
93 minutes
93 minutes
90 Minutes
83 minutes
96 minutes
88 minutes
89 minutes
91 minutes
93 minutes
92 minutes
97 minutes
100 minutes
100 minutes
89 minutes
91 minutes


In [11]:
# Converting Running time to number of minutes
def get_min_in_num(min):
    """Extract the minute count from a running-time string.

    Returns the first run of digits that is directly followed (optionally
    after whitespace) by text starting with "min", matched case-insensitively,
    so '92 minutes', '131 mins.', '110 min.', '90 Minutes' and '92minutes'
    are all understood.

    Args:
        min: running-time text, e.g. '41 minutes (74 minutes 1966 release)'.
            (The name shadows the builtin ``min``; it is kept so existing
            keyword callers keep working.)

    Returns:
        The minute count as an int, or None when no such number is found.
    """
    # The lookahead keeps the unit out of the match; the capture group holds
    # only the digits, so no whitespace has to be stripped before int().
    result = re.search(r'(\d+)\s*(?=min)', min, flags=re.IGNORECASE)
    if result:
        return int(result.group(1))
    return None

def minutes_to_integers(min):
    """Convert a raw 'Running time' value to a number of minutes.

    Args:
        min: None, a running-time string, or a list of strings (the scraped
            lists mix runtimes with note fragments such as '(' or
            'Los Angeles'). The name shadows the builtin ``min``; it is kept
            so existing keyword callers keep working.

    Returns:
        The minute count as an int, or None when the value is None, the list
        is empty, or nothing parseable is found. For a list, the first entry
        that yields a runtime wins.
    """
    if min is None:
        return None
    if isinstance(min, list):
        # Walk the list instead of trusting element 0: this avoids an
        # IndexError on [] and still finds the runtime when a note fragment
        # comes first. Non-string entries are skipped.
        for entry in min:
            if isinstance(entry, str):
                minutes = get_min_in_num(entry)
                if minutes is not None:
                    return minutes
        return None
    return get_min_in_num(min)

# Store the parsed runtime next to the raw field on every movie record
for movie in movies:
    raw_running_time = movie.get('Running time')
    movie['Running time (int)'] = minutes_to_integers(raw_running_time)

# Show the first two records to check the new 'Running time (int)' key
for movie in movies[:2]:
    print(movie, '====', sep='\n')

{'title': 'Academy Award Review of Walt Disney Cartoons', 'Productioncompany ': 'Walt Disney Productions', 'Release date': ['May 19, 1937'], 'Running time': '41 minutes (74 minutes 1966 release)', 'Country': 'United States', 'Language': 'English', 'Box office': '$45.472', 'Running time (int)': 41}
====
{'title': 'Snow White and the Seven Dwarfs', 'Directed by': ['David Hand', 'William Cottrell', 'Wilfred Jackson', 'Larry Morey', 'Perce Pearce', 'Ben Sharpsteen'], 'Written by': ['Ted Sears', 'Richard Creedon', 'Otto Englander', 'Dick Rickard', 'Earl Hurd', 'Merrill De Maris', 'Dorothy Ann Blank', 'Webb Smith'], 'Based on': ['Snow White', 'by The', 'Brothers Grimm'], 'Produced by': 'Walt Disney', 'Starring': ['Adriana Caselotti', 'Lucille La Verne', 'Harry Stockwell', 'Roy Atwell', 'Pinto Colvig', 'Otis Harlan', 'Scotty Mattraw', 'Billy Gilbert', 'Eddie Collins', 'Moroni Olsen', 'Stuart Buchanan'], 'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'], 'Productioncompany ': 'Wal

In [136]:
# Survey the raw 'Box office' values (all records except the last 50) to see
# which money formats the parser has to support.
for movie in movies[:-50]:
    print(movie.get('Box office'))

$45.472
$418 million
$164 million
$76.4–$83.3 million (United States and Canada)
$960,000 (worldwide rentals) 
>$1.3 million (est. United States/Canada rentals, 1941)
$267.4 million
$1.135 million (worldwide rentals) 
$799,000
$3.355 million (worldwide rentals)
$3.275 million (worldwide rentals)
$65 million
$3.165 million (worldwide rentals)
$2.56 million (worldwide rentals)
$3.7 million (U.S. rental) $575,000 (foreign rental) 
$1.625 million (worldwide rentals)
$263.6 million
$4,100,000 (worldwide rentals) 
['$2.4 million (1951, domestic)', '$3.5 million (1974, domestic)']
$2.1 million (US rentals)
$87.4 million (United States and Canada)
$1 million (US)
$2.6 million (US)
None
$1.75 million (US and Canadian rentals)
$28.2 million
$2,150,000 (US)
$187 million
$2.1 million (US)
$1.6 million (US)
$1.7 million (US)
None
None
$2.75 million (US)
None
$1.75 million (US rentals)
$6,250,000 (US/Canada rentals)
None
$1.8 million (est. US/ Canada rentals)
$2.5 million (est. US/ Canada rentals)
$

In [147]:
# Magnitude words that may follow an amount, e.g. "$3.5 million"
word_pattern = r'thousand|million|billion' # r = raw string

# A number with optional thousands separators and an optional decimal part:
#   \d+           one or more digits
#   (?:,\d{3})*   zero or more groups of one comma followed by exactly 3 digits
#   (?:\.\d+)?    optional decimal point followed by digits
#                 ('.' is escaped because it is a special regex character)
num_pattern = r'\d+(?:,\d{3})*(?:\.\d+)?'

# Separator between the two ends of a range: hyphen, en dash (\u2013),
# em dash (\u2014) or the word "to", each with optional surrounding spaces.
range_sep_pattern = r'(?:\s*[-\u2013\u2014]\s*|\s+to\s+)'

# Plain amounts such as "$1,275,000"
num_regex = rf"\${num_pattern}"

# Amounts with a magnitude word: "$3.5 million", "$5 to 6 million",
# "$76.4–$83.3 million". The range separator and the second number (which may
# carry its own '$') are each optional ('?'), so the match always starts at the
# first amount of a range rather than skipping ahead to the second one.
word_regex = rf"\${num_pattern}(?:{range_sep_pattern})?(?:\$?{num_pattern})?\s*(?:{word_pattern})"

def word_to_number(place_value):
    """Return the integer multiplier for 'thousand', 'million' or 'billion'.

    The lookup is case-sensitive; any other key raises KeyError.
    """
    power_of_ten = {'thousand': 3, 'million': 6, 'billion': 9}
    return 10 ** power_of_ten[place_value]
    
def parse_word_syntax(money):
    """Convert an amount written with a magnitude word into a float.

    Finds the first 'thousand' / 'million' / 'billion' in ``money``
    (case-insensitive), takes the first number in the string and multiplies
    the two. Raises AttributeError when no magnitude word is present, because
    the failed search returns None.
    """
    magnitude_match = re.search(word_pattern, money, flags=re.I)
    multiplier = word_to_number(magnitude_match.group().lower())
    return parse_number_syntax(money) * multiplier

def parse_number_syntax(money):
    """Return the first number found in ``money`` as a float.

    Thousands separators (commas) are dropped before conversion. Raises
    AttributeError when the string contains no number, because the failed
    search returns None.
    """
    number_match = re.search(num_pattern, money)
    return float(number_match.group().replace(',', ''))

def money_conversion(money):
    """Convert a raw money value to a float amount.

    Args:
        money: None, a string such as '$3.5 million' or '$1,275,000', or a
            list of such strings (only the first element is used).

    Returns:
        The amount as a float, or None when ``money`` is falsy (None, '', [])
        or contains no '$'-prefixed number.
    """
    if not money:
        return None

    text = money[0] if isinstance(money, list) else money

    # Run both searches up front; the magnitude-word form takes priority.
    word_match = re.search(word_regex, text, flags=re.I)  # re.I ignores case
    number_match = re.search(num_regex, text)

    if word_match:
        return parse_word_syntax(word_match.group())
    if number_match:
        return parse_number_syntax(number_match.group())
    return None

# parse_number_syntax('$790,100')
# parse_word_syntax('$790 Millions')

# Add numeric versions of the money fields to every movie record
for movie in movies:
    movie['Budget (float)'] = money_conversion(movie.get('Budget'))
    movie['Box office (float)'] = money_conversion(movie.get('Box office'))

# Print raw vs. parsed values for a 50-movie window (indices 75-124) as a spot check
for movie in movies[75:125]:
    print(movie.get('Box office'), ' = ', movie.get('Box office (float)'))
    print('========================================================================')
    print(movie.get('Budget'), ' = ', movie.get('Budget (float)'))

$1,275,000  =  1275000.0
None  =  None
$3,500,000 (US/ Canada rentals)  =  3500000.0
None  =  None
$4,000,000 (US/Canada rentals)  =  4000000.0
None  =  None
$28,068,222  =  28068222.0
None  =  None
$6.2 million (est. US/ Canada rentals)  =  6200000.0
None  =  None
$22,565,634  =  22565634.0
None  =  None
None  =  None
None  =  None
$16,207,116  =  16207116.0
None  =  None
$3,000,000 (US/ Canada)  =  3000000.0
None  =  None
$1,900,000 (US/ Canada)  =  1900000.0
None  =  None
$4,000,000 (US/ Canada)  =  4000000.0
None  =  None
$378 million  =  378000000.0
$4 million  =  4000000.0
None  =  None
None  =  None
$5 million (U.S./Canada rentals)  =  5000000.0
$5 million  =  5000000.0
$21,540,050  =  21540050.0
None  =  None
$2,250,000 (US/ Canada)  =  2250000.0
None  =  None
$4,150,000 (US/ Canada rentals)  =  4150000.0
None  =  None
$3.3 million  (US/ Canada rentals)  =  3300000.0
None  =  None
$51,264,000  =  51264000.0
$5 million  =  5000000.0
$1.3 million (US/ Canada rentals)  =  1300000.