In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Data Scraping

In [2]:
def format_date(date_string):
    """
    This function converts the date string into a formatted date.
    """
    day = date_string[:2].strip()
    month = date_string[2:-5].strip()
    year = date_string[-5:].strip()
    
    month_dict = {'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05', 'June': '06',
                  'July': '07', 'August': '08', 'September': '09', 'October': '10', 'November': '11', 'December': '12'}
    month = month_dict.get(month)
    
    date = f'{year}-{month}-{day.zfill(2)}'
    
    return date

In [3]:
def scrape_box_office(year):
    """
    This function scrapes data of the Egyptian Box Office movies for a specified year.
    """
    movies_table = []
    box_office_table = []
    movie_writers_table ={}
    writers_table = {}
    movie_stars_table ={}
    stars_table = {}
    
    for page in range(1, 53):  # iterate over each of the 52 weeks in the year
        url = f'https://elcinema.com/en/boxoffice/{year}/{page}'
        response = requests.get(url)
        if response.status_code == 200:
            source_code = response.text
            soup = BeautifulSoup(source_code, 'lxml')

            rows = soup.find_all('div', class_='row')
            for row in rows:
                movie_info = []
                box_office_info = []
                
                if row.find('div', class_='columns small-6 medium-5 large-5'):
                    movie = row.find('div', class_='columns small-6 medium-4 large-4').find_all('a')[1]
                    revenue = row.find('div', class_='columns small-6 medium-5 large-5').find_all('li')
                    movie_rank = row.find('div', class_='columns small-2 medium-1 large-1').text.replace('\n', '').strip()
                    movie_title = movie.text.replace('\t', '')
                    movie_path = movie['href']
                    movie_id = movie_path[9:-1]
                    movie_link = f'https://elcinema.com{movie_path}'
                    movie_weekly_revenue = revenue[1].text.replace(' EGP', '').replace(',', '')
                    movie_total_revenue = revenue[3].text.replace(' EGP', '').replace(',', '')

                    movie_response = requests.get(movie_link)
                    if movie_response.status_code == 200:
                        movie_src = movie_response.text
                        movie_soup = BeautifulSoup(movie_src, 'lxml')
                        movie_rating = movie_soup.find('div', class_='stars-orange-60').text
                        genre = movie_soup.find('ul', id='jump-here-genre')
                        movie_genre = genre.find_next('a').text if genre else None
                        
                        ul_elements = movie_soup.find_all('ul', class_='list-separator')
                        movie_length = movie_release_date = movie_age_rating = movie_country = ''
                        for element in ul_elements:
                            for sub_element in element.find_all('li'):
                                if sub_element.text[-7:].strip() == 'minutes':
                                    movie_length = sub_element.text[:-7].strip()
                                        
                            if element.find('li').text == 'Release Date:':   
                                for sub_element in element.find_all('li'):
                                    if sub_element.text.strip()[:5] == 'Egypt':
                                        movie_date = sub_element.text.strip()[7:-1]
                                        movie_release_date = format_date(movie_date)
                            
                            elif element.find('li').text == 'Censorship:':
                                movie_age_rating = element.find('li').find_next('li').text
                                movie_age_rating = movie_age_rating.replace('مصري', '').replace('MPAA', '').replace('تقييمنا', '')
                            
                            elif element.find('li').text == 'Country:':
                                movie_country = element.find('a').text.replace('\t', '')
                        
                        movie_director = ul_elements[4].find_all('a')[0].text.replace('\t', '')
                        
                        for item in ul_elements[5].find_all('a'):
                            if item.text != '(more)':
                                writer_id = item['href'][11:-1]
                                movie_writers_table[writer_id] = movie_id
                                writers_table[writer_id] = item.text.replace('\t', '')
                        
                        for item in ul_elements[6].find_all('a'):
                            if item.text != '(more)':
                                star_id = item['href'][11:-1]
                                movie_stars_table[star_id] = movie_id
                                stars_table[star_id] = item.text.replace('\t', '')
                    
                    movie_info.append(movie_id)
                    movie_info.append(movie_title)
                    movie_info.append(movie_rating)
                    movie_info.append(movie_country)
                    movie_info.append(movie_length)
                    movie_info.append(movie_age_rating)
                    movie_info.append(movie_release_date)
                    movie_info.append(movie_genre)
                    movie_info.append(movie_director)
                    movie_info.append(movie_total_revenue)
                    movies_table.append(movie_info)
                    
                    box_office_info.append(year)
                    box_office_info.append(page)
                    box_office_info.append(movie_path[9:-1])
                    box_office_info.append(movie_rank)
                    box_office_info.append(movie_weekly_revenue)
                    box_office_table.append(box_office_info)
        
        else:
            print('Error! Page not found.')
    
    return movies_table, box_office_table, movie_writers_table, writers_table, movie_stars_table, stars_table

In [4]:
movie_columns = ['movie_id', 'movie_title', 'movie_rating', 'movie_country', 'movie_length', 'movie_age_rating',
                 'movie_release_date', 'movie_genre', 'movie_director', 'movie_total_revenue']
box_office_columns = ['year', 'week', 'movie_id', 'movie_rank', 'movie_weekly_revenue']
movie_writers_columns = ['writer_id', 'movie_id']
writers_columns = ['writer_id', 'writer_name']
movie_stars_columns = ['star_id', 'movie_id']
stars_columns = ['star_id', 'star_name']

In [5]:
movie_13, box_office_13, movie_writers_13, writers_13, movie_stars_13, stars_13 = scrape_box_office(2013)

movies_table_13 = pd.DataFrame(movie_13, columns=movie_columns)
box_office_table_13 = pd.DataFrame(box_office_13, columns=box_office_columns)
movie_writers_table_13 = pd.DataFrame(list(movie_writers_13.items()), columns=movie_writers_columns)
writers_table_13 = pd.DataFrame(list(writers_13.items()), columns=writers_columns)
movie_stars_table_13 = pd.DataFrame(list(movie_stars_13.items()), columns=movie_stars_columns)
stars_table_13 = pd.DataFrame(list(stars_13.items()), columns=stars_columns)

In [6]:
movie_14, box_office_14, movie_writers_14, writers_14, movie_stars_14, stars_14 = scrape_box_office(2014)

movies_table_14 = pd.DataFrame(movie_14, columns=movie_columns)
box_office_table_14 = pd.DataFrame(box_office_14, columns=box_office_columns)
movie_writers_table_14 = pd.DataFrame(list(movie_writers_14.items()), columns=movie_writers_columns)
writers_table_14 = pd.DataFrame(list(writers_14.items()), columns=writers_columns)
movie_stars_table_14 = pd.DataFrame(list(movie_stars_14.items()), columns=movie_stars_columns)
stars_table_14 = pd.DataFrame(list(stars_14.items()), columns=stars_columns)

In [7]:
movie_15, box_office_15, movie_writers_15, writers_15, movie_stars_15, stars_15 = scrape_box_office(2015)

movies_table_15 = pd.DataFrame(movie_15, columns=movie_columns)
box_office_table_15 = pd.DataFrame(box_office_15, columns=box_office_columns)
movie_writers_table_15 = pd.DataFrame(list(movie_writers_15.items()), columns=movie_writers_columns)
writers_table_15 = pd.DataFrame(list(writers_15.items()), columns=writers_columns)
movie_stars_table_15 = pd.DataFrame(list(movie_stars_15.items()), columns=movie_stars_columns)
stars_table_15 = pd.DataFrame(list(stars_15.items()), columns=stars_columns)

In [8]:
movie_16, box_office_16, movie_writers_16, writers_16, movie_stars_16, stars_16 = scrape_box_office(2016)

movies_table_16 = pd.DataFrame(movie_16, columns=movie_columns)
box_office_table_16 = pd.DataFrame(box_office_16, columns=box_office_columns)
movie_writers_table_16 = pd.DataFrame(list(movie_writers_16.items()), columns=movie_writers_columns)
writers_table_16 = pd.DataFrame(list(writers_16.items()), columns=writers_columns)
movie_stars_table_16 = pd.DataFrame(list(movie_stars_16.items()), columns=movie_stars_columns)
stars_table_16 = pd.DataFrame(list(stars_16.items()), columns=stars_columns)

In [9]:
movie_17, box_office_17, movie_writers_17, writers_17, movie_stars_17, stars_17 = scrape_box_office(2017)

movies_table_17 = pd.DataFrame(movie_17, columns=movie_columns)
box_office_table_17 = pd.DataFrame(box_office_17, columns=box_office_columns)
movie_writers_table_17 = pd.DataFrame(list(movie_writers_17.items()), columns=movie_writers_columns)
writers_table_17 = pd.DataFrame(list(writers_17.items()), columns=writers_columns)
movie_stars_table_17 = pd.DataFrame(list(movie_stars_17.items()), columns=movie_stars_columns)
stars_table_17 = pd.DataFrame(list(stars_17.items()), columns=stars_columns)

In [10]:
movie_18, box_office_18, movie_writers_18, writers_18, movie_stars_18, stars_18 = scrape_box_office(2018)

movies_table_18 = pd.DataFrame(movie_18, columns=movie_columns)
box_office_table_18 = pd.DataFrame(box_office_18, columns=box_office_columns)
movie_writers_table_18 = pd.DataFrame(list(movie_writers_18.items()), columns=movie_writers_columns)
writers_table_18 = pd.DataFrame(list(writers_18.items()), columns=writers_columns)
movie_stars_table_18 = pd.DataFrame(list(movie_stars_18.items()), columns=movie_stars_columns)
stars_table_18 = pd.DataFrame(list(stars_18.items()), columns=stars_columns)

In [11]:
movie_19, box_office_19, movie_writers_19, writers_19, movie_stars_19, stars_19 = scrape_box_office(2019)

movies_table_19 = pd.DataFrame(movie_19, columns=movie_columns)
box_office_table_19 = pd.DataFrame(box_office_19, columns=box_office_columns)
movie_writers_table_19 = pd.DataFrame(list(movie_writers_19.items()), columns=movie_writers_columns)
writers_table_19 = pd.DataFrame(list(writers_19.items()), columns=writers_columns)
movie_stars_table_19 = pd.DataFrame(list(movie_stars_19.items()), columns=movie_stars_columns)
stars_table_19 = pd.DataFrame(list(stars_19.items()), columns=stars_columns)

In [12]:
movie_20, box_office_20, movie_writers_20, writers_20, movie_stars_20, stars_20 = scrape_box_office(2020)

movies_table_20 = pd.DataFrame(movie_20, columns=movie_columns)
box_office_table_20 = pd.DataFrame(box_office_20, columns=box_office_columns)
movie_writers_table_20 = pd.DataFrame(list(movie_writers_20.items()), columns=movie_writers_columns)
writers_table_20 = pd.DataFrame(list(writers_20.items()), columns=writers_columns)
movie_stars_table_20 = pd.DataFrame(list(movie_stars_20.items()), columns=movie_stars_columns)
stars_table_20 = pd.DataFrame(list(stars_20.items()), columns=stars_columns)

In [13]:
movie_21, box_office_21, movie_writers_21, writers_21, movie_stars_21, stars_21 = scrape_box_office(2021)

movies_table_21 = pd.DataFrame(movie_21, columns=movie_columns)
box_office_table_21 = pd.DataFrame(box_office_21, columns=box_office_columns)
movie_writers_table_21 = pd.DataFrame(list(movie_writers_21.items()), columns=movie_writers_columns)
writers_table_21 = pd.DataFrame(list(writers_21.items()), columns=writers_columns)
movie_stars_table_21 = pd.DataFrame(list(movie_stars_21.items()), columns=movie_stars_columns)
stars_table_21 = pd.DataFrame(list(stars_21.items()), columns=stars_columns)

In [14]:
movie_22, box_office_22, movie_writers_22, writers_22, movie_stars_22, stars_22 = scrape_box_office(2022)

movies_table_22 = pd.DataFrame(movie_22, columns=movie_columns)
box_office_table_22 = pd.DataFrame(box_office_22, columns=box_office_columns)
movie_writers_table_22 = pd.DataFrame(list(movie_writers_22.items()), columns=movie_writers_columns)
writers_table_22 = pd.DataFrame(list(writers_22.items()), columns=writers_columns)
movie_stars_table_22 = pd.DataFrame(list(movie_stars_22.items()), columns=movie_stars_columns)
stars_table_22 = pd.DataFrame(list(stars_22.items()), columns=stars_columns)

## Data Cleaning

In [15]:
movies_dfs = [movies_table_13, movies_table_14, movies_table_15, movies_table_16, movies_table_17,
              movies_table_18, movies_table_19, movies_table_20, movies_table_21, movies_table_22]
movies_table = pd.concat(movies_dfs, axis=0)

box_office_dfs = [box_office_table_13, box_office_table_14, box_office_table_15, box_office_table_16, box_office_table_17,
                  box_office_table_18, box_office_table_19, box_office_table_20, box_office_table_21, box_office_table_22]
box_office_table = pd.concat(box_office_dfs, axis=0)

movie_writers_dfs = [movie_writers_table_13, movie_writers_table_14, movie_writers_table_15, movie_writers_table_16,
                     movie_writers_table_17, movie_writers_table_18, movie_writers_table_19, movie_writers_table_20,
                     movie_writers_table_21, movie_writers_table_22]
movie_writers_table = pd.concat(movie_writers_dfs, axis=0)

writers_dfs = [writers_table_13, writers_table_14, writers_table_15, writers_table_16, writers_table_17,
               writers_table_18, writers_table_19, writers_table_20, writers_table_21, writers_table_22]
writers_table = pd.concat(writers_dfs, axis=0)

movie_stars_dfs = [movie_stars_table_13, movie_stars_table_14, movie_stars_table_15, movie_stars_table_16,
                   movie_stars_table_17, movie_stars_table_18, movie_stars_table_19, movie_stars_table_20,
                   movie_stars_table_21, movie_stars_table_22]
movie_stars_table = pd.concat(movie_stars_dfs, axis=0)

stars_dfs = [stars_table_13, stars_table_14, stars_table_15, stars_table_16, stars_table_17,
             stars_table_18, stars_table_19, stars_table_20, stars_table_21, stars_table_22]
stars_table = pd.concat(stars_dfs, axis=0)

### Movies Table:

In [16]:
movies_table.sample(5)

Unnamed: 0,movie_id,movie_title,movie_rating,movie_country,movie_length,movie_age_rating,movie_release_date,movie_genre,movie_director,movie_total_revenue
490,2055122,After,7.8,US,110.0,+16,2019-04-17,Romance,Jenny Gage,983431
292,2041500,El Haram El Rabea,7.1,Egypt,92.0,,2016-02-03,Action,Peter Mimi,5845645
137,2047628,Frozen II,8.0,US,103.0,All Ages,2019-11-20,Musical,Chris Buck,12158329
410,2055472,Us,6.4,US,120.0,+16,2019-03-20,Thriller,Jordan Peele,3114337
106,2052844,Lees Baghdad,6.9,Egypt,102.0,+12,2020-01-22,Comedy,Ahmed Khaled Mousa,33224245


In [17]:
movies_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10711 entries, 0 to 1100
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   movie_id             10711 non-null  int64  
 1   movie_title          10711 non-null  object 
 2   movie_rating         10711 non-null  float64
 3   movie_country        10703 non-null  object 
 4   movie_length         10664 non-null  float64
 5   movie_age_rating     10413 non-null  object 
 6   movie_release_date   10652 non-null  object 
 7   movie_genre          10688 non-null  object 
 8   movie_director       10711 non-null  object 
 9   movie_total_revenue  10711 non-null  int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 920.5+ KB


**We need to drop duplicates, parse dates, and fill nulls.**

In [18]:
movies_table.drop_duplicates(inplace=True)

In [19]:
movies_table.movie_release_date = pd.to_datetime(movies_table.movie_release_date)

In [20]:
movies_table.isna().sum()

movie_id                0
movie_title             0
movie_rating            0
movie_country           2
movie_length           12
movie_age_rating       97
movie_release_date     13
movie_genre             7
movie_director          0
movie_total_revenue     0
dtype: int64

**We will fill the nulls depending on info from other online movie databases like IMDB, or simply Google search.**

In [21]:
movies_table[movies_table.movie_country.isna()][['movie_id', 'movie_title']]

Unnamed: 0,movie_id,movie_title
273,2057779,Primal Rage: The Legend of Konga
1074,2071803,Overrun


In [22]:
movies_table.loc[273, 'movie_country'] = 'US'
movies_table.loc[1074, 'movie_country'] = 'US'

In [23]:
movies_table[movies_table.movie_length.isna()][['movie_id', 'movie_title']]

Unnamed: 0,movie_id,movie_title
348,2041706,Ta3weza 2
1185,2071835,Morabba' Bermoda
135,2070764,Maaly Mama
198,2073315,The Smurfs: Amazing Adventures
858,2075028,Hazak Al Youm
864,2075562,Big Trip 2: Special Delivery
909,2076089,Detective Knight: Rogue
924,2065225,Taht Tahdid Al Silah
927,2076309,Gawwezni
935,2076091,Nocebo


In [24]:
movies_table.loc[348, 'movie_length'] = 100
movies_table.loc[1185, 'movie_length'] = 90
movies_table.loc[135, 'movie_length'] = 100
movies_table.loc[198, 'movie_length'] = 90
movies_table.loc[858, 'movie_length'] = 90
movies_table.loc[864, 'movie_length'] = 90
movies_table.loc[909, 'movie_length'] = 105
movies_table.loc[924, 'movie_length'] = 105
movies_table.loc[927, 'movie_length'] = 105
movies_table.loc[935, 'movie_length'] = 96
movies_table.loc[1019, 'movie_length'] = 90
movies_table.loc[1089, 'movie_length'] = 96

In [25]:
movies_table[movies_table.movie_age_rating.isna()][['movie_id', 'movie_title']][:50]

Unnamed: 0,movie_id,movie_title
0,1070901,The Hobbit: An Unexpected Journey
3,2008417,The Twilight Saga: Breaking Dawn – Part 2
6,2011348,The Impossible
12,2011370,Killing Them Softly
38,2012462,Alex Cross
65,2004456,Django Unchained
104,2017785,Gambit
173,2019941,Jack the Giant Slayer
176,2008703,G.I. Joe: Retaliation
204,2009271,The Host


In [26]:
movies_table.loc[0, 'movie_age_rating'] = 'PG-13'
movies_table.loc[3, 'movie_age_rating'] = 'PG-13'
movies_table.loc[6, 'movie_age_rating'] = 'PG-13'
movies_table.loc[12, 'movie_age_rating'] = 'R'
movies_table.loc[38, 'movie_age_rating'] = 'PG-13'
movies_table.loc[65, 'movie_age_rating'] = 'R'
movies_table.loc[104, 'movie_age_rating'] = 'PG-13'
movies_table.loc[173, 'movie_age_rating'] = 'PG-13'
movies_table.loc[176, 'movie_age_rating'] = 'PG-13'
movies_table.loc[204, 'movie_age_rating'] = 'PG-13'
movies_table.loc[213, 'movie_age_rating'] = 'PG-13'
movies_table.loc[319, 'movie_age_rating'] = 'NR'
movies_table.loc[432, 'movie_age_rating'] = 'R'
movies_table.loc[410, 'movie_age_rating'] = 'R'
movies_table.loc[610, 'movie_age_rating'] = 'PG-13'
movies_table.loc[706, 'movie_age_rating'] = 'NR'
movies_table.loc[851, 'movie_age_rating'] = 'TV-14'
movies_table.loc[935, 'movie_age_rating'] = 'PG-13'
movies_table.loc[950, 'movie_age_rating'] = 'TV-14'
movies_table.loc[32, 'movie_age_rating'] = 'R'
movies_table.loc[34, 'movie_age_rating'] = 'R'
movies_table.loc[121, 'movie_age_rating'] = 'NR'
movies_table.loc[137, 'movie_age_rating'] = 'R'
movies_table.loc[173, 'movie_age_rating'] = 'PG-13'
movies_table.loc[745, 'movie_age_rating'] = 'PG-13'
movies_table.loc[839, 'movie_age_rating'] = 'NR'
movies_table.loc[982, 'movie_age_rating'] = 'NR'
movies_table.loc[994, 'movie_age_rating'] = 'NR'
movies_table.loc[1000, 'movie_age_rating'] = 'NR'
movies_table.loc[1029, 'movie_age_rating'] = 'NR'
movies_table.loc[7, 'movie_age_rating'] = 'PG-13'
movies_table.loc[8, 'movie_age_rating'] = 'NR'
movies_table.loc[33, 'movie_age_rating'] = 'NR'
movies_table.loc[107, 'movie_age_rating'] = 'NR'
movies_table.loc[109, 'movie_age_rating'] = 'NR'
movies_table.loc[115, 'movie_age_rating'] = 'NR'
movies_table.loc[162, 'movie_age_rating'] = 'NR'
movies_table.loc[174, 'movie_age_rating'] = 'NR'
movies_table.loc[196, 'movie_age_rating'] = 'R'
movies_table.loc[208, 'movie_age_rating'] = 'NR'
movies_table.loc[233, 'movie_age_rating'] = 'PG'
movies_table.loc[315, 'movie_age_rating'] = 'NR'
movies_table.loc[329, 'movie_age_rating'] = 'R'
movies_table.loc[345, 'movie_age_rating'] = 'NR'
movies_table.loc[385, 'movie_age_rating'] = 'NR'
movies_table.loc[422, 'movie_age_rating'] = 'R'
movies_table.loc[427, 'movie_age_rating'] = 'PG'
movies_table.loc[446, 'movie_age_rating'] = 'NR'
movies_table.loc[465, 'movie_age_rating'] = 'NR'
movies_table.loc[574, 'movie_age_rating'] = 'NR'

In [27]:
movies_table[movies_table.movie_age_rating.isna()][['movie_id', 'movie_title']]

Unnamed: 0,movie_id,movie_title
856,2036554,Amod Faqrey
119,2045712,Ozzy
324,2046518,Petra: The Gate of Time
348,2041706,Ta3weza 2
371,2046590,Hold Your Position
381,2046715,Rabbit School - Guardians of the Golden Egg
409,2045026,The Little Mermaid: Attack of the Pirates
433,2040112,Phobia
447,2046138,Vengeance: A Love Story
473,2043927,The Devil's Candy


In [28]:
movies_table.loc[856, 'movie_age_rating'] = 'NR'
movies_table.loc[119, 'movie_age_rating'] = 'G'
movies_table.loc[324, 'movie_age_rating'] = 'NR'
movies_table.loc[348, 'movie_age_rating'] = 'NR'
movies_table.loc[371, 'movie_age_rating'] = 'NR'
movies_table.loc[381, 'movie_age_rating'] = 'G'
movies_table.loc[409, 'movie_age_rating'] = 'NR'
movies_table.loc[433, 'movie_age_rating'] = 'NR'
movies_table.loc[447, 'movie_age_rating'] = 'TV-MA'
movies_table.loc[473, 'movie_age_rating'] = 'NR'
movies_table.loc[477, 'movie_age_rating'] = 'R'
movies_table.loc[488, 'movie_age_rating'] = 'NR'
movies_table.loc[633, 'movie_age_rating'] = 'NR'
movies_table.loc[635, 'movie_age_rating'] = 'NR'
movies_table.loc[684, 'movie_age_rating'] = 'TV-PG'
movies_table.loc[702, 'movie_age_rating'] = 'NR'
movies_table.loc[843, 'movie_age_rating'] = 'R'
movies_table.loc[858, 'movie_age_rating'] = 'NR'
movies_table.loc[1007, 'movie_age_rating'] = 'NR'
movies_table.loc[1079, 'movie_age_rating'] = 'R'
movies_table.loc[1138, 'movie_age_rating'] = 'NR'
movies_table.loc[1151, 'movie_age_rating'] = 'NR'
movies_table.loc[1167, 'movie_age_rating'] = 'NR'
movies_table.loc[11, 'movie_age_rating'] = 'NR'
movies_table.loc[18, 'movie_age_rating'] = 'TV-Y'
movies_table.loc[153, 'movie_age_rating'] = 'R'
movies_table.loc[288, 'movie_age_rating'] = 'NR'
movies_table.loc[9, 'movie_age_rating'] = 'NR'
movies_table.loc[74, 'movie_age_rating'] = 'PG'
movies_table.loc[96, 'movie_age_rating'] = 'TV-14'
movies_table.loc[347, 'movie_age_rating'] = 'TV-14'
movies_table.loc[356, 'movie_age_rating'] = 'NR'
movies_table.loc[372, 'movie_age_rating'] = 'NR'
movies_table.loc[450, 'movie_age_rating'] = 'NR'
movies_table.loc[476, 'movie_age_rating'] = 'NR'
movies_table.loc[777, 'movie_age_rating'] = 'TV-14'
movies_table.loc[817, 'movie_age_rating'] = 'NR'
movies_table.loc[1039, 'movie_age_rating'] = 'NR'
movies_table.loc[1062, 'movie_age_rating'] = 'NR'
movies_table.loc[1194, 'movie_age_rating'] = 'NR'
movies_table.loc[653, 'movie_age_rating'] = 'TV-MA'
movies_table.loc[625, 'movie_age_rating'] = 'NR'
movies_table.loc[768, 'movie_age_rating'] = 'NR'
movies_table.loc[826, 'movie_age_rating'] = 'TV-Y7'
movies_table.loc[893, 'movie_age_rating'] = 'NR'
movies_table.loc[946, 'movie_age_rating'] = 'TV-14'
movies_table.loc[373, 'movie_age_rating'] = '+18'

In [29]:
movies_table[movies_table.movie_release_date.isna()][['movie_id', 'movie_title']]

Unnamed: 0,movie_id,movie_title
104,2017785,Gambit
610,1142245,Step Up 3D
806,2034097,The Martian
846,2037203,The Last Witch Hunter
848,2028137,Paranormal Activity: The Ghost Dimension
870,2034575,Goosebumps
188,2037206,Zoolander 2
526,2025407,Independence Day: Resurgence
195,2042562,Friend Request
219,2044911,LEAP!


In [30]:
movies_table.loc[104, 'movie_release_date'] = '2013-02-13'
movies_table.loc[610, 'movie_release_date'] = '2010-09-22'
movies_table.loc[806, 'movie_release_date'] = '2015-10-07'
movies_table.loc[846, 'movie_release_date'] = '2015-10-23'
movies_table.loc[848, 'movie_release_date'] = '2015-10-21'
movies_table.loc[870, 'movie_release_date'] = '2015-10-29'
movies_table.loc[188, 'movie_release_date'] = '2016-02-24'
movies_table.loc[526, 'movie_release_date'] = '2016-06-23'
movies_table.loc[195, 'movie_release_date'] = '2016-01-07'
movies_table.loc[219, 'movie_release_date'] = '2016-12-14'
movies_table.loc[301, 'movie_release_date'] = '2017-02-10'
movies_table.loc[409, 'movie_release_date'] = '2016-10-20'
movies_table.loc[473, 'movie_release_date'] = '2016-09-27'

In [31]:
movies_table[movies_table.movie_genre.isna()][['movie_id', 'movie_title']]

Unnamed: 0,movie_id,movie_title
249,2040430,Neaama
370,2049203,The Midnight Man
1117,2059052,"Berlin, I Love You"
680,2061855,Clara
1017,2070868,Seal Team
1160,2070406,Barra El Manhag
1187,2071941,Abu Saddam


In [32]:
movies_table.loc[249, 'movie_genre'] = 'Comedy'
movies_table.loc[370, 'movie_genre'] = 'Horror'
movies_table.loc[1117, 'movie_genre'] = 'Drama'
movies_table.loc[680, 'movie_genre'] = 'Adventure'
movies_table.loc[1017, 'movie_genre'] = 'Comedy'
movies_table.loc[1160, 'movie_genre'] = 'Comedy'
movies_table.loc[1187, 'movie_genre'] = 'Drama'

In [33]:
movies_table.isna().sum().sum()

0

In [34]:
movies_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1881 entries, 0 to 1092
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   movie_id             1881 non-null   int64         
 1   movie_title          1881 non-null   object        
 2   movie_rating         1881 non-null   float64       
 3   movie_country        1881 non-null   object        
 4   movie_length         1881 non-null   float64       
 5   movie_age_rating     1881 non-null   object        
 6   movie_release_date   1881 non-null   datetime64[ns]
 7   movie_genre          1881 non-null   object        
 8   movie_director       1881 non-null   object        
 9   movie_total_revenue  1881 non-null   int64         
dtypes: datetime64[ns](1), float64(2), int64(2), object(5)
memory usage: 226.2+ KB


**As for age rating, because of depending on different sources for our imputation of the null values, in addition to the fact that not all ratings on elcinema.com are from MPAA, we need to standardize all age ratings by converting them into MPAA rating system.**

In [35]:
movies_table.movie_age_rating.unique()

array(['PG-13', 'All Ages', 'NR', '+18', 'R', 'TV-Y', 'NC-17', 'PG', 'G',
       'Adult Supervision', 'Adults Only (18+)', 'Unrated',
       'Family (all ages)', 'TV-MA', 'TV-14', '+12', 'TV-Y7', '+16',
       'TV-PG', '+8'], dtype=object)

In [36]:
rating_mapping = {'G': 'G', 'All Ages': 'G', 'TV-Y': 'G', 'Family (all ages)': 'G', '+8': 'G',
                  'PG': 'PG', 'Adult Supervision': 'PG', 'TV-PG': 'PG', 'TV-Y7': 'PG',
                  'PG-13': 'PG-13', '+12': 'PG-13',
                  'R': 'R', 'TV-14': 'R', '+16': 'R',
                  'NC-17': 'NC-17', '+18': 'NC-17', 'Adults Only (18+)': 'NC-17', 'TV-MA': 'NC-17',
                  'NR': 'NR', 'Unrated': 'NR'}

movies_table['movie_mpaa_rating'] = movies_table.movie_age_rating.map(rating_mapping)

In [37]:
movies_table[['movie_age_rating', 'movie_mpaa_rating']].sample(5)

Unnamed: 0,movie_age_rating,movie_mpaa_rating
596,+12,PG-13
1013,+12,PG-13
622,Adult Supervision,PG
746,R,R
492,+16,R


In [38]:
movies_table.drop('movie_age_rating', axis=1, inplace=True)

In [39]:
movies_table = movies_table[['movie_id', 'movie_title', 'movie_rating', 'movie_country', 'movie_length',
                             'movie_release_date', 'movie_genre', 'movie_mpaa_rating', 'movie_director',
                             'movie_total_revenue']]   # Just changing the order of the columns

### Box Office Table:

In [40]:
box_office_table.sample(5)

Unnamed: 0,year,week,movie_id,movie_rank,movie_weekly_revenue
1023,2018,42,2047783,23,32071
1062,2019,43,2059750,14,81707
252,2015,11,1077548,17,13377
907,2022,44,2052179,11,90448
1167,2019,47,2058878,11,228920


In [41]:
box_office_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10711 entries, 0 to 1100
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   year                  10711 non-null  int64
 1   week                  10711 non-null  int64
 2   movie_id              10711 non-null  int64
 3   movie_rank            10711 non-null  int64
 4   movie_weekly_revenue  10711 non-null  int64
dtypes: int64(5)
memory usage: 502.1 KB


**Data types are assigned correctly and there are no nulls. We just need to drop duplicates.**

In [42]:
box_office_table.drop_duplicates(inplace=True)

In [43]:
box_office_table = box_office_table[['year', 'week', 'movie_rank', 'movie_id', 'movie_weekly_revenue']]

### Movie Writers Table:

In [44]:
movie_writers_table.sample(5)

Unnamed: 0,writer_id,movie_id
370,2116785,2046552
120,2121325,2048460
78,2212792,2072473
23,2078213,2065450
251,2094872,2048672


In [45]:
movie_writers_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3178 entries, 0 to 302
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   writer_id  3178 non-null   int64
 1   movie_id   3178 non-null   int64
dtypes: int64(2)
memory usage: 74.5 KB


**Same as above, we just need to drop duplicates.**

In [46]:
movie_writers_table.drop_duplicates(inplace=True)

In [47]:
movie_writers_table = movie_writers_table[['movie_id', 'writer_id']]

### Writers Table:

In [48]:
writers_table.sample(5)

Unnamed: 0,writer_id,writer_name
94,2119973,Alec Sokolow
48,1051106,Zainab Aziz
50,2123311,Christian Carion
55,2087044,Ben Ketai
282,2106381,Joe Robert Cole


In [49]:
writers_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3178 entries, 0 to 302
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   writer_id    3178 non-null   int64 
 1   writer_name  3178 non-null   object
dtypes: int64(1), object(1)
memory usage: 74.5+ KB


**Again, we will only drop duplicates.**

In [50]:
writers_table.drop_duplicates(inplace=True)

### Movie Stars Table:

In [51]:
movie_stars_table.sample(5)

Unnamed: 0,star_id,movie_id
521,2100846,2042359
457,2024570,2039793
941,2066905,2075128
937,1038960,2036554
643,1100548,1783907


In [52]:
movie_stars_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10233 entries, 0 to 994
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   star_id   10233 non-null  object
 1   movie_id  10233 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 239.8+ KB


**We will investigate why star_id is of data type object instead of int.**

In [53]:
movie_stars_table[pd.to_numeric(movie_stars_table.star_id, errors='coerce').isna()]

Unnamed: 0,star_id,movie_id
299,ork/genre/1,2055754
300,ork/genre/,2055754


**It appears that, for some reason, an error occurred when scraping BTS World Tour movie stars data. <br>
To solve the issue, we will add the id of BTS in one row and drop the other.**

In [54]:
movie_stars_table.loc[299, 'star_id'] = 2162144
movie_stars_table.drop(300, inplace=True)
movie_stars_table.star_id = movie_stars_table.star_id.astype(int)

In [55]:
movie_stars_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10223 entries, 0 to 994
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   star_id   10223 non-null  int32
 1   movie_id  10223 non-null  int64
dtypes: int32(1), int64(1)
memory usage: 199.7 KB


In [56]:
movie_stars_table.drop_duplicates(inplace=True)

In [57]:
movie_stars_table = movie_stars_table[['movie_id', 'star_id']]

### Stars Table:

In [58]:
stars_table.sample(5)

Unnamed: 0,star_id,star_name
566,2114314,Aaron Glenane
83,2162192,Frederic Souterelle
287,2059131,Pallavi Sharda
103,1049333,Zaki Fateen Abdel Wahab
874,2010038,Jesse Plemons


In [59]:
stars_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10233 entries, 0 to 994
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   star_id    10233 non-null  object
 1   star_name  10233 non-null  object
dtypes: object(2)
memory usage: 239.8+ KB


**Same as above, we will investigate this error.**

In [60]:
stars_table[pd.to_numeric(stars_table.star_id, errors='coerce').isna()]

Unnamed: 0,star_id,star_name
299,ork/genre/1,Documentary
300,ork/genre/,Musical


**We will add the correct name and id of BTS in one row and drop the other.**

In [61]:
stars_table.loc[299, 'star_id'] = 2162144
stars_table.loc[299, 'star_name'] = 'BTS'
stars_table.drop(300, inplace=True)
stars_table.star_id = stars_table.star_id.astype(int)

In [62]:
stars_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10223 entries, 0 to 994
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   star_id    10223 non-null  int32 
 1   star_name  10223 non-null  object
dtypes: int32(1), object(1)
memory usage: 199.7+ KB


In [63]:
stars_table.drop_duplicates(inplace=True)

**We are all sit, let's export the tables as csv files.**

In [64]:
movies_table.to_csv('movies_table.csv', index=False)
box_office_table.to_csv('box_office_table.csv', index=False)
movie_writers_table.to_csv('movie_writers_table.csv', index=False)
writers_table.to_csv('writers_table.csv', index=False)
movie_stars_table.to_csv('movie_stars_table.csv', index=False)
stars_table.to_csv('stars_table.csv', index=False)