# Setting Up the SQLite Database
The code below imports the sqlite3 module, which is included in the Python standard library.

In [None]:
import sqlite3

Next, connect to the database (the database file is created if it does not already exist).

In [None]:
# Connect to the movies.db database file; the handle is shared by every cell below.
connection = sqlite3.connect('movies.db')

Now create a cursor object so that we can execute SQL statements.

In [None]:
# Cursor used by the following cells to run SQL statements on this connection.
cursor = connection.cursor()

## Drop Tables
Drop the movies_box_office table, if it exists, so the notebook starts from a clean state.

In [None]:
# IF EXISTS makes this a no-op on the first run, when the table has not been created yet.
cursor.execute('DROP TABLE IF EXISTS movies_box_office')

## Create Tables 
### Main Table From imdb dataset (movies_box_office)

In [None]:
# Schema for movies_box_office: an integer primary key, the movie name, and
# the worldwide / domestic / foreign figures (amounts as INTEGER, percentages
# as FLOAT). Only the id and the name are declared NOT NULL.
create_movies_box_office_sql = '''CREATE TABLE IF NOT EXISTS movies_box_office
               (
                    movies_box_office_id INTEGER PRIMARY KEY NOT NULL,
                    movie_name VARCHAR(100) NOT NULL,
                    world_wide_bo INTEGER,
                    domestic_amount INTEGER,
                    domestic_percentage FLOAT,
                    foreign_amount INTEGER,
                    foreign_percentage FLOAT
                )'''
cursor.execute(create_movies_box_office_sql)

In [None]:
import pandas as pd

# Yearly box-office ranking summaries, one CSV file per year.
filenames = [
    './datasets/box-office/ranking_summary_2018.csv',
    './datasets/box-office/ranking_summary_2019.csv',
    './datasets/box-office/ranking_summary_2020.csv',
    './datasets/box-office/ranking_summary_2021.csv',
    './datasets/box-office/ranking_summary_2022.csv',
]

# Load every file into its own DataFrame, in the order listed above.
dataframes = list(map(pd.read_csv, filenames))

# Stack the yearly frames into one, replacing the per-file indexes with a
# single continuous one.
combined_dataframe = pd.concat(dataframes, ignore_index=True)
combined_dataframe = combined_dataframe.reset_index(drop=True)

# Shift the index so that it starts at 1 instead of 0.
combined_dataframe.index += 1

In [None]:
# Bare expression: shown as this notebook cell's output for a visual check.
combined_dataframe

### Add the Movies to the movies_box_office table
Insert each row of `combined_dataframe` as a row in the movies_box_office table.

In [None]:
import re
import numpy as np

def _parse_amount(raw):
    """Return the integer held in a formatted amount string, or None if missing.

    The dataset marks a missing value with '-'. Any other value has every
    non-alphanumeric character (separators, symbols) stripped before it is
    converted with int().
    """
    if raw == '-':
        return None
    return int(re.sub(r'[^A-Za-z0-9]+', '', raw))


def _parse_percentage(raw):
    """Return the float held in a formatted percentage string, or None if missing.

    Works like _parse_amount, but keeps the decimal point.
    """
    if raw == '-':
        return None
    return float(re.sub(r'[^A-Za-z0-9.]+', '', raw))


# Built once, outside the loop; values are bound through ? placeholders.
insert_movie_sql = '''INSERT INTO movies_box_office
    (movies_box_office_id, movie_name, world_wide_bo, domestic_amount,
     domestic_percentage, foreign_amount, foreign_percentage)
    VALUES (?, ?, ?, ?, ?, ?, ?)'''

for index, row in combined_dataframe.iterrows():
    # The DataFrame index doubles as the primary key.
    movie_bo_id = index
    # Strip punctuation from the title and upper-case it.
    movie_name = re.sub(r'[^\w\s]', '', row["title"]).upper()

    # Every numeric field is recomputed for each row. A '-' cell becomes None
    # (stored as NULL) instead of silently reusing the previous row's value,
    # or raising NameError when the very first row has a missing field.
    world_wide_bo = _parse_amount(row["worldwide"])
    domestic_amount = _parse_amount(row["domestic"])
    domestic_percentage = _parse_percentage(row["domestic_pct"])
    foreign_amount = _parse_amount(row["foreign"])
    foreign_percentage = _parse_percentage(row["foreign_pct"])

    print(f"{movie_bo_id}, {movie_name}, {world_wide_bo}, {domestic_amount}, {domestic_percentage}, {foreign_amount}, {foreign_percentage}")
    cursor.execute(
        insert_movie_sql,
        (movie_bo_id, movie_name, world_wide_bo, domestic_amount,
         domestic_percentage, foreign_amount, foreign_percentage),
    )

# Persist all inserted rows in one commit.
connection.commit()

In [None]:
# Read back every stored row and print it, to confirm what the table contains.
rows = cursor.execute("SELECT * FROM movies_box_office").fetchall()

for record in rows:
    print(record)

In [None]:
# Close the database connection now that the notebook is done with it.
connection.close()