Importing libraries

In [54]:
import pandas as pd

import psycopg2
import os

Defining functions.

In [55]:
def create_database():
    """Recreate the ``data_modeling`` PostgreSQL database and its empty tables.

    The function connects to the default ``postgres`` database, drops
    ``data_modeling`` if it exists, creates it again, reconnects to the new
    database and creates the tables ``customer``, ``geolocation``,
    ``order_items``, ``payments``, ``reviews``, ``orders``, ``products``,
    ``sellers`` and ``product_category``.

    Connection settings are taken from the environment variables ``PGHOST``,
    ``PGUSER`` and ``PGPASSWORD`` when they are set; otherwise the notebook's
    original values (``localhost`` / ``postgres`` / ``root``) are used.

    Returns:
        tuple: ``(conn, cur)`` - an open autocommit connection to
        ``data_modeling`` and a cursor created from it.

    Raises:
        psycopg2.Error: if connecting, (re)creating the database or creating
            a table fails. The error is printed, any connection that is still
            open is closed, and the exception is re-raised so the caller never
            receives an unusable connection/cursor pair.
    """
    # Credentials are read from the environment so they do not have to live in
    # the notebook; the defaults keep the original behaviour.
    credentials = {
        'host': os.environ.get('PGHOST', 'localhost'),
        'user': os.environ.get('PGUSER', 'postgres'),
        'password': os.environ.get('PGPASSWORD', 'root'),
    }

    # One CREATE TABLE statement per dataset, executed in this order.
    table_statements = (
        '''CREATE TABLE IF NOT EXISTS customer(
            customer_id VARCHAR PRIMARY KEY,
            customer_unique_id VARCHAR,
            customer_zip_code_prefix INT,
            customer_city VARCHAR,
            customer_state VARCHAR
        )''',
        '''CREATE TABLE IF NOT EXISTS geolocation(
            geolocation_zip_code_prefix VARCHAR,
            geolocation_lat FLOAT,
            geolocation_lng FLOAT,
            geolocation_city VARCHAR,
            geolocation_state VARCHAR
        )''',
        # NOTE(review): shipping_limit_date is DATE while the CSV preview shows
        # a full timestamp - confirm that dropping the time of day is intended.
        '''CREATE TABLE IF NOT EXISTS order_items(
            order_id VARCHAR,
            order_item_id INT,
            product_id VARCHAR,
            seller_id VARCHAR,
            shipping_limit_date DATE,
            price FLOAT,
            freight_value FLOAT
        )''',
        '''CREATE TABLE IF NOT EXISTS payments(
            order_id VARCHAR,
            payment_sequential int,
            payment_type VARCHAR,
            payment_installments INT,
            payment_value FLOAT
        )''',
        '''CREATE TABLE IF NOT EXISTS reviews(
            review_id VARCHAR,
            order_id VARCHAR,
            review_score INT,
            review_comment_title VARCHAR,
            review_comment_message VARCHAR,
            review_creation_date VARCHAR,
            review_answer_timestamp VARCHAR
        )''',
        '''CREATE TABLE IF NOT EXISTS orders(
            order_id VARCHAR PRIMARY KEY,
            customer_id VARCHAR,
            order_status VARCHAR,
            order_purchase_timestamp VARCHAR,
            order_approved_at VARCHAR,
            order_delivered_carrier_date VARCHAR,
            order_delivered_customer_date VARCHAR,
            order_estimated_delivery_date VARCHAR
        )''',
        '''CREATE TABLE IF NOT EXISTS products(
            product_id VARCHAR PRIMARY KEY,
            product_category_name VARCHAR,
            product_name_lenght FLOAT,
            product_description_lenght FLOAT,
            product_photos_qty FLOAT,
            product_weight_g FLOAT,
            product_length_cm FLOAT,
            product_height_cm FLOAT,
            product_width_cm FLOAT
        )''',
        '''CREATE TABLE IF NOT EXISTS sellers(
            seller_id VARCHAR PRIMARY KEY,
            seller_zip_code_prefix int,
            seller_city VARCHAR,
            seller_state VARCHAR
        )''',
        '''CREATE TABLE IF NOT EXISTS product_category(
            product_category_name VARCHAR PRIMARY KEY,
            product_category_name_english VARCHAR
        )''',
    )

    conn = None
    try:
        # Connect to the default database; autocommit is enabled before the
        # DROP/CREATE DATABASE statements are issued.
        conn = psycopg2.connect(dbname='postgres', **credentials)
        conn.set_session(autocommit=True)
        cur = conn.cursor()

        # Start from a clean database on every run.
        cur.execute('DROP DATABASE IF EXISTS data_modeling')
        cur.execute('CREATE DATABASE data_modeling')

        # Switch from the default database to the freshly created one.
        conn.close()
        conn = psycopg2.connect(dbname='data_modeling', **credentials)
        conn.set_session(autocommit=True)
        cur = conn.cursor()

        print('DataBase created name - "data_modeling"')

        # Creating tables.
        for statement in table_statements:
            cur.execute(statement)

    except psycopg2.Error as e:
        print('Error creating DataBase')
        print(e)
        # Do not leak a half-initialised connection, and do not fall through
        # to the return statement with unbound or closed objects.
        if conn is not None and not conn.closed:
            conn.close()
        raise

    return conn, cur

Now we proceed to read all the datasets into DataFrames.

In [56]:
# Read the customers CSV and keep only its first 10,000 rows.
data_customers = pd.read_csv('./data/olist_customers_dataset.csv').head(10000)
print(data_customers.shape)
# Preview three randomly selected rows.
data_customers.sample(3)

(10000, 5)


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
4115,6e6dc62a9abb771cb96614a627d5cf4c,b35290b39d5d7004aa3cd32858dc0053,7240,guarulhos,SP
5514,5ffc28dba523459f96579e4af297055b,5a2dabee1b75af89efb14fe554b6044b,5798,sao paulo,SP
239,9559221419cf73261de39b024545b1b3,9aff27e32725e7f7c395e0a79a5d173d,22620,rio de janeiro,RJ


In [57]:
# Read the geolocation CSV and keep only its first 10,000 rows.
data_geolocation = pd.read_csv('./data/olist_geolocation_dataset.csv').head(10000)
print(data_geolocation.shape)
# Preview three randomly selected rows.
data_geolocation.sample(3)

(10000, 5)


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
2835,1128,-23.524942,-46.64582,sao paulo,SP
2810,1153,-23.530316,-46.653287,sao paulo,SP
4539,1232,-23.536767,-46.655073,sao paulo,SP


In [58]:
# Read the order-items CSV and keep only its first 10,000 rows.
data_order_items = pd.read_csv('./data/olist_order_items_dataset.csv').head(10000)
print(data_order_items.shape)
# Preview three randomly selected rows.
data_order_items.sample(3)

(10000, 7)


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
3723,0872d0faafe0cb56e6f2c594cca1522c,1,fb783e3e545937820b57fe539b2c5a6c,da8622b14eb17ae2831f4ac5b9dab84a,2017-10-05 12:28:17,109.9,18.02
3098,06fc554a38ed6385ac2b121049b5cad5,1,7eb96ea641252f20b26fd6878d009a1d,955fee9216a65b617aa5c0531780ce60,2018-06-28 13:56:03,35.0,8.41
5458,0c57cbfd927d9e62af8beee6d48920b7,5,1b0e9a414e59a00b13262ffb248ed9b4,bbaff50f3b708fda865918715276cd87,2018-08-12 21:50:13,26.0,7.44


In [59]:
# Read the order-payments CSV and keep only its first 10,000 rows.
data_payments = pd.read_csv('./data/olist_order_payments_dataset.csv').head(10000)
print(data_payments.shape)
# Preview three randomly selected rows.
data_payments.sample(3)

(10000, 5)


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
9224,7ffb2f75de37b73b297a0923735af96a,1,credit_card,1,53.48
7356,18898a06fcd1e440c28bd18943ea0e92,1,credit_card,2,132.46
8820,591083bc42b589c7052118aa83118e76,3,voucher,1,20.0


In [60]:
# Read the order-reviews CSV and keep only its first 10,000 rows.
data_reviews = pd.read_csv('./data/olist_order_reviews_dataset.csv').head(10000)
print(data_reviews.shape)
# Preview three randomly selected rows.
data_reviews.sample(3)

(10000, 7)


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
4919,19d1973c6952f3934124f06fb57fff34,1fde32a510f950891c57d6e69c9071bd,5,Excelente,Satisfeita com o produto,2018-07-24 00:00:00,2018-07-27 20:15:42
6965,eb0a5e66d996050334014587b1c8699c,04bc36cd51141e11c2cbbd188b6e47ca,5,,,2016-10-21 00:00:00,2016-10-22 00:14:57
2103,3975077f30c1f99bd699ff192885378b,a4763f72a6a9ff012d96a0e2b4d4b1ea,5,,,2018-05-20 00:00:00,2018-05-20 13:56:22


In [61]:
# Read the orders CSV and keep only its first 10,000 rows.
data_orders = pd.read_csv('./data/olist_orders_dataset.csv').head(10000)
print(data_orders.shape)
# Preview three randomly selected rows.
data_orders.sample(3)

(10000, 8)


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
9794,09023a853a37b3ccb3ca46372068748b,2c197ea326b24a3dd7ae5cd7febd4599,delivered,2018-01-30 15:40:06,2018-01-30 15:55:31,2018-02-05 16:32:08,2018-02-21 17:38:05,2018-02-26 00:00:00
2497,a472cc850b21b27f04a2b80546d3910e,eafcc7bf051b9d56a2b9e428a43b6ce6,delivered,2017-12-06 17:39:09,2017-12-06 17:52:56,2017-12-07 17:27:49,2017-12-11 23:10:00,2018-01-03 00:00:00
2330,f1170c260d0d021017645fd94d65d78b,a86b68dd9da561c8b4cf830e2f0f47a3,delivered,2017-02-24 11:24:17,2017-02-24 15:10:17,2017-03-08 14:52:04,2017-03-16 14:33:08,2017-03-27 00:00:00


In [62]:
# Read the products CSV and keep only its first 10,000 rows.
data_products = pd.read_csv('./data/olist_products_dataset.csv').head(10000)
print(data_products.shape)
# Preview three randomly selected rows.
data_products.sample(3)

(10000, 9)


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
8403,ddb282389fb7c7594479b90a76e4d884,moveis_decoracao,46.0,2025.0,2.0,600.0,65.0,11.0,13.0
9830,fdad129d8e902058f98862f23dcf6978,informatica_acessorios,35.0,1101.0,2.0,250.0,42.0,2.0,30.0
5802,b5500cdcf54793ca1dfbd654bdfbc20a,beleza_saude,50.0,826.0,1.0,350.0,25.0,12.0,19.0


In [63]:
# Read the sellers CSV; unlike the larger datasets above, it is not truncated.
data_sellers = pd.read_csv('./data/olist_sellers_dataset.csv')
print(data_sellers.shape)
# Preview three randomly selected rows.
data_sellers.sample(3)

(3095, 4)


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
1723,8dfbc5ff27df12f6cef751d3b9554222,80220,curitiba,PR
1814,da4d149c0ddbac90557103ac0a0ec356,9861,sbc,SP
953,bcd2d7510d58e293f20fad6438c1b314,81540,curitiba,PR


In [64]:
# Read the category-name translation CSV; it is loaded in full (no truncation).
data_product_category = pd.read_csv('./data/product_category_name_translation.csv')
print(data_product_category.shape)
# Preview three randomly selected rows.
data_product_category.sample(3)

(71, 2)


Unnamed: 0,product_category_name,product_category_name_english
61,musica,music
33,eletrodomesticos,home_appliances
39,livros_tecnicos,books_technical


The relationships between the tables are shown in the following ERD (Entity Relationship Diagram).

<img src="./img/entity_relationship_diagram.png">

In [65]:

# Create the database with the function defined above and keep the returned
# connection and cursor so that the following cells can execute queries.
conn, cur = create_database()

DataBase created name - "data_modeling"


<img src="./img/tables.png">

In [66]:
# Populate the tables created in the previous step, starting with `customer`:
# one parameterized INSERT is executed per DataFrame row.
insert_query = '''INSERT INTO customer(
    customer_id,
    customer_unique_id,
    customer_zip_code_prefix,
    customer_city,
    customer_state)
    VALUES (%s, %s, %s, %s, %s)'''
for customer_values in data_customers.values.tolist():
    cur.execute(insert_query, customer_values)

<img src="./img/customer_table.png">

In [67]:
# Populate `geolocation`: one parameterized INSERT per DataFrame row.
insert_query = '''INSERT INTO geolocation(
    geolocation_zip_code_prefix,
    geolocation_lat,
    geolocation_lng,
    geolocation_city,
    geolocation_state)
    VALUES (%s, %s, %s, %s, %s)'''
for geolocation_values in data_geolocation.values.tolist():
    cur.execute(insert_query, geolocation_values)

<img src="./img/geolocation_table.png">

In [68]:
# Populate `order_items`: one parameterized INSERT per DataFrame row.
insert_query = '''INSERT INTO order_items(
    order_id,
    order_item_id,
    product_id,
    seller_id,
    shipping_limit_date,
    price,
    freight_value)
    VALUES (%s, %s, %s, %s, %s, %s, %s)'''
for order_item_values in data_order_items.values.tolist():
    cur.execute(insert_query, order_item_values)

<img src="./img/order_items_table.png">

In [69]:
# Populate `payments`: one parameterized INSERT per DataFrame row.
insert_query = '''INSERT INTO payments(
    order_id,
    payment_sequential,
    payment_type,
    payment_installments,
    payment_value)
    VALUES (%s, %s, %s, %s, %s)'''
for payment_values in data_payments.values.tolist():
    cur.execute(insert_query, payment_values)

<img src="./img/payments_table.png">

In [70]:
# Populate `reviews`: one parameterized INSERT per DataFrame row.
# Empty CSV fields (e.g. reviews without a comment title or message) are read
# by pandas as NaN; they are converted to None so that PostgreSQL stores NULL
# instead of a NaN value.
insert_query = ('''INSERT INTO reviews(
    review_id,
    order_id,
    review_score,
    review_comment_title,
    review_comment_message,
    review_creation_date,
    review_answer_timestamp)
    VALUES (%s, %s, %s, %s, %s, %s, %s)''')
for i, row in data_reviews.iterrows():
    cur.execute(insert_query, [None if pd.isna(value) else value for value in row])

<img src="./img/reviews_table.png">

In [71]:
# Populate `orders`: one parameterized INSERT per DataFrame row.
# Missing values (presumably the approval/delivery timestamps of orders that
# were not completed - verify against the data) are read by pandas as NaN;
# they are converted to None so that PostgreSQL stores NULL instead of NaN.
insert_query = ('''INSERT INTO orders(
    order_id,
    customer_id,
    order_status,
    order_purchase_timestamp,
    order_approved_at,
    order_delivered_carrier_date,
    order_delivered_customer_date,
    order_estimated_delivery_date)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)''')
for i, row in data_orders.iterrows():
    cur.execute(insert_query, [None if pd.isna(value) else value for value in row])

<img src="./img/orders_table.png">

In [72]:
# Populate `products`: one parameterized INSERT per DataFrame row.
# Missing values are read by pandas as NaN (the count columns are floats in the
# preview, which suggests gaps in the data - verify); they are converted to
# None so that PostgreSQL stores NULL instead of NaN.
insert_query = ('''INSERT INTO products(
    product_id,
    product_category_name,
    product_name_lenght,
    product_description_lenght,
    product_photos_qty,
    product_weight_g,
    product_length_cm,
    product_height_cm,
    product_width_cm)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)''')
for i, row in data_products.iterrows():
    cur.execute(insert_query, [None if pd.isna(value) else value for value in row])

<img src="./img/products_table.png">

In [73]:
# Populate `sellers`: one parameterized INSERT per DataFrame row.
insert_query = '''INSERT INTO sellers(
    seller_id,
    seller_zip_code_prefix,
    seller_city,
    seller_state)
    VALUES (%s, %s, %s, %s)'''
for seller_values in data_sellers.values.tolist():
    cur.execute(insert_query, seller_values)

<img src="./img/sellers_table.png">

In [74]:
# Populate `product_category`: one parameterized INSERT per DataFrame row.
insert_query = '''INSERT INTO product_category(
    product_category_name,
    product_category_name_english)
    VALUES (%s, %s)'''
for category_values in data_product_category.values.tolist():
    cur.execute(insert_query, category_values)

<img src="./img/product_category_table.png">

In [75]:
# Release database resources: close the cursor first, then the connection
# it was created from.
cur.close()
conn.close()

# Bibliography

- https://www.postgresqltutorial.com/
- https://launchschool.com/books/sql_first_edition
- https://launchschool.com/books/sql