# Setting Up The SQLite Database
Below is the code to import the sqlite3 module, which is included in the Python standard library.

In [1]:
import sqlite3

Next is the code to connect to the database (the database file is created if it does not already exist).

In [2]:
# Open a connection to the SQLite database file 'movies.db' (relative path,
# so it is resolved against the current working directory).
connection = sqlite3.connect('movies.db')

The next cell creates a cursor object so we can execute SQL statements.

In [3]:
# Cursor obtained from `connection`; the cells below run their SQL through it.
cursor = connection.cursor()

## Drop Tables
Drop all the tables from the database to start fresh.

In [4]:
# Remove any existing movies_box_office table; IF EXISTS makes this a no-op
# when the table is absent, so the cell is safe to re-run.
cursor.execute('DROP TABLE IF EXISTS movies_box_office')

<sqlite3.Cursor at 0x1d5d48c17c0>

## Create Tables 
### Main Table From imdb dataset (movies_box_office)

In [5]:
# Create the movies_box_office table unless it already exists.
# Columns: an integer primary key, a required movie name, and nullable
# worldwide/domestic/foreign amounts (INTEGER) plus nullable domestic/foreign
# percentages (FLOAT).
cursor.execute('''CREATE TABLE IF NOT EXISTS movies_box_office
               (
                    movies_box_office_id INTEGER PRIMARY KEY NOT NULL,
                    movie_name VARCHAR(100) NOT NULL,
                    world_wide_bo INTEGER,
                    domestic_amount INTEGER,
                    domestic_percentage FLOAT,
                    foreign_amount INTEGER,
                    foreign_percentage FLOAT
                )''')

<sqlite3.Cursor at 0x1d5d48c17c0>

In [6]:
import pandas as pd

# Yearly box-office ranking summaries to load, one CSV per year.
filenames = [
    './datasets/box-office/ranking_summary_2018.csv',
    './datasets/box-office/ranking_summary_2019.csv',
    './datasets/box-office/ranking_summary_2020.csv',
    './datasets/box-office/ranking_summary_2021.csv',
    './datasets/box-office/ranking_summary_2022.csv',
]

# Parse each CSV into its own DataFrame, keeping the order of `filenames`.
dataframes = list(map(pd.read_csv, filenames))

# Stack the yearly frames; ignore_index=True already produces a fresh
# 0-based index, so no separate reset is needed.
combined_dataframe = pd.concat(dataframes, ignore_index=True)

# Shift the index so that it starts at 1 instead of 0.
combined_dataframe.index += 1

In [7]:
# Bare expression: the notebook renders the combined frame as this cell's output.
combined_dataframe

Unnamed: 0,rank,title,worldwide,domestic,domestic_pct,foreign,foreign_pct
1,1,Avengers: Infinity War,"$2,048,359,754","$678,815,482",33.1%,"$1,369,544,272",66.9%
2,2,Black Panther,"$1,346,913,161","$700,059,566",52%,"$646,853,595",48%
3,3,Jurassic World: Fallen Kingdom,"$1,308,467,944","$417,719,760",31.9%,"$890,748,184",68.1%
4,4,Incredibles 2,"$1,242,805,359","$608,581,744",49%,"$634,223,615",51%
5,5,Aquaman,"$1,151,961,807","$335,061,807",29.1%,"$816,900,000",70.9%
...,...,...,...,...,...,...,...
996,196,Lesson in Murder,"$7,348,964",-,-,"$7,348,964",100%
997,197,Family Affairs,"$7,330,755",-,-,"$7,330,755",100%
998,198,Listy do M. 5,"$7,328,061",-,-,"$7,328,061",100%
999,199,Laid-Back Camp Movie,"$7,317,913",-,-,"$7,317,913",100%


### Add the Movies to the movies_box_office table
Take the data from the combined_dataframe and place it in rows in the movies_box_office table

In [8]:
import re
import numpy as np

def _parse_amount(raw):
    """Convert a dollar string such as "$1,234" to an int.

    Returns None when the cell holds the "-" placeholder so the column is
    stored as NULL instead of reusing a value from an earlier row.
    """
    if raw == '-':
        return None
    return int(re.sub(r'[^A-Za-z0-9]+', '', raw))


def _parse_percentage(raw):
    """Convert a percent string such as "33.1%" to a float.

    Returns None when the cell holds the "-" placeholder so the column is
    stored as NULL instead of reusing a value from an earlier row.
    """
    if raw == '-':
        return None
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    return float(re.sub(r'[^A-Za-z0-9\.]+', '', raw))


# The statement text is the same for every row, so build it once.
insert_movie_sql = '''INSERT INTO movies_box_office (movies_box_office_id, movie_name, world_wide_bo, domestic_amount, domestic_percentage, foreign_amount, foreign_percentage) 
                     VALUES (?, ?, ?, ?, ?, ?, ?)'''

for index, row in combined_dataframe.iterrows():
    # The DataFrame index doubles as the primary key.
    movie_bo_id = index
    # Strip punctuation from the title and upper-case it.
    movie_name = re.sub(r'[^\w\s]', '', row["title"]).upper()
    # Parsed unconditionally: a non-numeric worldwide value raises instead of
    # being stored as NULL.
    world_wide_bo = int(re.sub(r'[^A-Za-z0-9]+', '', row["worldwide"]))
    # Optional fields: "-" becomes None (NULL). Every field is assigned on every
    # iteration, so nothing leaks in from the previous row.
    domestic_amount = _parse_amount(row["domestic"])
    domestic_percentage = _parse_percentage(row["domestic_pct"])
    foreign_amount = _parse_amount(row["foreign"])
    foreign_percentage = _parse_percentage(row["foreign_pct"])
    print(f"{movie_bo_id}, {movie_name}, {world_wide_bo}, {domestic_amount}, {domestic_percentage}, {foreign_amount}, {foreign_percentage}")
    cursor.execute(insert_movie_sql, (movie_bo_id, movie_name, world_wide_bo, domestic_amount, domestic_percentage, foreign_amount, foreign_percentage))

# Persist the inserted rows; without a commit they are discarded when the
# connection is closed.
connection.commit()

1, AVENGERS INFINITY WAR, 2048359754, 678815482, 33.1, 1369544272, 66.9
2, BLACK PANTHER, 1346913161, 700059566, 52.0, 646853595, 48.0
3, JURASSIC WORLD FALLEN KINGDOM, 1308467944, 417719760, 31.9, 890748184, 68.1
4, INCREDIBLES 2, 1242805359, 608581744, 49.0, 634223615, 51.0
5, AQUAMAN, 1151961807, 335061807, 29.1, 816900000, 70.9
6, BOHEMIAN RHAPSODY, 903655259, 216428042, 24.0, 687227217, 76.0
7, VENOM, 856085151, 213515506, 24.9, 642569645, 75.1
8, MISSION IMPOSSIBLE  FALLOUT, 791115104, 220159104, 27.8, 570956000, 72.2
9, DEADPOOL 2, 734546611, 318491426, 43.4, 416055185, 56.6
10, FANTASTIC BEASTS THE CRIMES OF GRINDELWALD, 655755901, 159555901, 24.3, 496200000, 75.7
11, ANTMAN AND THE WASP, 622674139, 216648740, 34.8, 406025399, 65.2
12, READY PLAYER ONE, 583490172, 137690172, 23.6, 445800000, 76.4
13, OPERATION RED SEA, 579330426, 1543547, 0.3, 577786879, 99.7
14, DETECTIVE CHINATOWN 2, 544185156, 1983984, 0.4, 542201172, 99.6
15, RALPH BREAKS THE INTERNET, 529323962, 201091711,

In [10]:
# Read back every row of movies_box_office through the shared cursor.
rows = cursor.execute("SELECT * FROM movies_box_office").fetchall()

# Echo each row tuple on its own line.
for row in rows:
    print(row)

(1, 'AVENGERS INFINITY WAR', 2048359754, 678815482, 33.1, 1369544272, 66.9)
(2, 'BLACK PANTHER', 1346913161, 700059566, 52.0, 646853595, 48.0)
(3, 'JURASSIC WORLD FALLEN KINGDOM', 1308467944, 417719760, 31.9, 890748184, 68.1)
(4, 'INCREDIBLES 2', 1242805359, 608581744, 49.0, 634223615, 51.0)
(5, 'AQUAMAN', 1151961807, 335061807, 29.1, 816900000, 70.9)
(6, 'BOHEMIAN RHAPSODY', 903655259, 216428042, 24.0, 687227217, 76.0)
(7, 'VENOM', 856085151, 213515506, 24.9, 642569645, 75.1)
(8, 'MISSION IMPOSSIBLE  FALLOUT', 791115104, 220159104, 27.8, 570956000, 72.2)
(9, 'DEADPOOL 2', 734546611, 318491426, 43.4, 416055185, 56.6)
(10, 'FANTASTIC BEASTS THE CRIMES OF GRINDELWALD', 655755901, 159555901, 24.3, 496200000, 75.7)
(11, 'ANTMAN AND THE WASP', 622674139, 216648740, 34.8, 406025399, 65.2)
(12, 'READY PLAYER ONE', 583490172, 137690172, 23.6, 445800000, 76.4)
(13, 'OPERATION RED SEA', 579330426, 1543547, 0.3, 577786879, 99.7)
(14, 'DETECTIVE CHINATOWN 2', 544185156, 1983984, 0.4, 542201172, 99