# ETL
This notebook extracts and transforms the data from the CSV files in the `Datasets/` folder and loads it into the SQLite database.

In [1]:
# Dependencies
import pandas as pd
from pathlib import Path
import requests

# Import SQL Alchemy
from sqlalchemy import create_engine

# Import and establish Base for which classes will be constructed 
import sqlalchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from sqlalchemy import desc

# Import modules to declare columns and column data types
from sqlalchemy import Column, Integer, String, Float, Boolean

# Local modules
from config import omdb_api_key

## Import Movies CSV
- Source: `Datasets/movies.csv`
- Table: `movies` in `movies_db.sqlite`

In [2]:
# Read the movies CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's current
# working directory when the cell runs.
movies_csv = Path('../Datasets/movies.csv')
movies_df = pd.read_csv(movies_csv)

# Display the dtype pandas inferred for each column; the ORM column types
# declared for the `movies` table are chosen with these in mind.
movies_df.dtypes

movieid           int64
title            object
mpaa_rating      object
budget           object
gross            object
release_date     object
genre            object
runtime          object
rating          float64
rating_count    float64
summary          object
dtype: object

In [3]:
# Declarative base class: ORM model classes in this notebook inherit from it,
# and their tables are collected on `Base.metadata`.
Base = declarative_base()

In [4]:
# ORM model for the `movies` table
class Movies(Base):
    """One row of the `movies` table, mirroring the columns of movies.csv.

    `movieid` is the primary key. The remaining attributes map one-to-one to
    the CSV columns of the same name.
    """
    __tablename__ = 'movies'
    movieid = Column(Integer, primary_key=True)
    title = Column(String)
    mpaa_rating = Column(String)
    # budget / gross / release_date load from the CSV as object (text) columns
    budget = Column(String)
    gross = Column(String)
    release_date = Column(String)
    genre = Column(String)
    # NOTE(review): the CSV's `runtime` column is read by pandas as object
    # dtype, yet it is declared Integer here - confirm the values are
    # integer-like before relying on numeric queries against this column.
    runtime = Column(Integer)
    rating = Column(Float)
    rating_count = Column(Float)
    summary = Column(String)

In [5]:
# Create an engine for the SQLite database file `movies_db.sqlite`.
# The URL holds a relative path, so the file lives in the kernel's current
# working directory.
engine = create_engine('sqlite:///movies_db.sqlite')

# Create the tables registered on Base.metadata (the ORM models declared
# above). By default create_all skips tables that already exist, so rows
# from a previous run are left in place.
Base.metadata.create_all(engine)
# Open a session bound to the engine; rows are staged on it and written when
# the session is committed.
session = Session(bind=engine)

In [6]:
# Stage every movie from the DataFrame on the session. Nothing is made
# permanent until the session is committed.
#
# - `session.merge` is used rather than `session.add` so that re-running this
#   cell (or the whole notebook) against an existing `movies_db.sqlite`
#   updates rows by primary key instead of failing at commit with a UNIQUE
#   constraint error on `movieid`.
# - Each row is read once with `itertuples` instead of one `.loc` lookup per
#   column per row.
# - Only the columns that exist on the `Movies` model are selected, so extra
#   CSV columns cannot leak into the constructor.
movie_fields = [
    'movieid', 'title', 'mpaa_rating', 'budget', 'gross', 'release_date',
    'genre', 'runtime', 'rating', 'rating_count', 'summary',
]

for record in movies_df[movie_fields].itertuples(index=False, name=None):
    # Missing CSV cells arrive from pandas as float NaN; map them to None so
    # they are bound as SQL NULL rather than as a float in a text column.
    values = {
        field: None if pd.isna(value) else value
        for field, value in zip(movie_fields, record)
    }
    # Keep the explicit cast so the primary key is always a built-in int.
    values['movieid'] = int(values['movieid'])

    # Insert the row, or update the existing row with the same movieid.
    session.merge(Movies(**values))

print(f"{len(movies_df)} rows ready for commit.")


636 rows ready for commit.


In [8]:
# Commit the staged rows. The session is closed whether or not the commit
# succeeds, so a failed commit does not leave a half-finished transaction
# open on the kernel's session object.
try:
    session.commit()
except Exception:
    # Undo the failed transaction, then surface the original error
    session.rollback()
    raise
finally:
    # Close session
    session.close()