Importing libraries

In [52]:
import pandas as pd

import psycopg2
import os

Defining functions.

In [53]:
def _create_tables(cur):
    """Create every table of the data_modeling schema with the given cursor.

    Each statement uses CREATE TABLE IF NOT EXISTS, so calling this against a
    database that already holds the tables is harmless.
    """
    # NOTE(review): shipping_limit_date is declared DATE while the CSV values
    # shown in this notebook carry a time of day, and the other date-like
    # columns are kept as VARCHAR - confirm these types are intended.
    table_statements = (
        '''CREATE TABLE IF NOT EXISTS customer(
            customer_id VARCHAR PRIMARY KEY,
            customer_unique_id VARCHAR,
            customer_zip_code_prefix INT,
            customer_city VARCHAR,
            customer_state VARCHAR
        )''',
        '''CREATE TABLE IF NOT EXISTS geolocation(
            geolocation_zip_code_prefix VARCHAR,
            geolocation_lat FLOAT,
            geolocation_lng FLOAT,
            geolocation_city VARCHAR,
            geolocation_state VARCHAR
        )''',
        '''CREATE TABLE IF NOT EXISTS order_items(
            order_id VARCHAR,
            order_item_id INT,
            product_id VARCHAR,
            seller_id VARCHAR,
            shipping_limit_date DATE,
            price FLOAT,
            freight_value FLOAT
        )''',
        '''CREATE TABLE IF NOT EXISTS payments(
            order_id VARCHAR,
            payment_sequential int,
            payment_type VARCHAR,
            payment_installments INT,
            payment_value FLOAT
        )''',
        '''CREATE TABLE IF NOT EXISTS reviews(
            review_id VARCHAR,
            order_id VARCHAR,
            review_score INT,
            review_comment_title VARCHAR,
            review_comment_message VARCHAR,
            review_creation_date VARCHAR,
            review_answer_timestamp VARCHAR
        )''',
        '''CREATE TABLE IF NOT EXISTS orders(
            order_id VARCHAR PRIMARY KEY,
            customer_id VARCHAR,
            order_status VARCHAR,
            order_purchase_timestamp VARCHAR,
            order_approved_at VARCHAR,
            order_delivered_carrier_date VARCHAR,
            order_delivered_customer_date VARCHAR,
            order_estimated_delivery_date VARCHAR
        )''',
        '''CREATE TABLE IF NOT EXISTS products(
            product_id VARCHAR PRIMARY KEY,
            product_category_name VARCHAR,
            product_name_lenght FLOAT,
            product_description_lenght FLOAT,
            product_photos_qty FLOAT,
            product_weight_g FLOAT,
            product_length_cm FLOAT,
            product_height_cm FLOAT,
            product_width_cm FLOAT
        )''',
        '''CREATE TABLE IF NOT EXISTS sellers(
            seller_id VARCHAR PRIMARY KEY,
            seller_zip_code_prefix int,
            seller_city VARCHAR,
            seller_state VARCHAR
        )''',
        '''CREATE TABLE IF NOT EXISTS product_category(
            product_category_name VARCHAR PRIMARY KEY,
            product_category_name_english VARCHAR
        )''',
    )
    for statement in table_statements:
        cur.execute(statement)


def create_database():
    """Recreate the "data_modeling" database and its tables.

    Connects to the default "postgres" database, drops and recreates
    "data_modeling", then connects to the new database and creates all tables.

    Returns:
        tuple: ``(conn, cur)`` - an autocommit connection to "data_modeling"
        and a cursor on it. If no connection to "data_modeling" could be
        opened, both are ``None``; the error is printed, not raised.
        If only table creation fails, the open connection and cursor are
        still returned after the error is printed.
    """
    # Pre-bind the results so a failed connect cannot leave them undefined
    # (which would surface as an UnboundLocalError hiding the real error).
    conn = None
    cur = None
    try:
        # Connect to default database
        admin_conn = psycopg2.connect('host = localhost dbname = postgres user = postgres password = root')
        try:
            admin_conn.set_session(autocommit=True)
            admin_cur = admin_conn.cursor()

            # Creating database
            admin_cur.execute('DROP DATABASE IF EXISTS data_modeling')
            admin_cur.execute('CREATE DATABASE data_modeling')
        finally:
            # Always release the administrative connection, even on failure.
            admin_conn.close()

        # Connecting to created database.
        conn = psycopg2.connect('host = localhost dbname = data_modeling user = postgres password = root')
        conn.set_session(autocommit=True)
        cur = conn.cursor()

        print('DataBase created name - "data_modeling"')

        # Creating tables.
        _create_tables(cur)

    except psycopg2.Error as e:
        print('Error creating DataBase')
        print(e)

    return conn, cur

Now we proceed to read and import all tables.

In [54]:
# Customers: read the CSV and keep only its first 10,000 rows.
data_customers = pd.read_csv('./data/olist_customers_dataset.csv').head(10000)
print(data_customers.shape)
data_customers.sample(n=3)

(10000, 5)


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
7014,87fa2df087845b8953a52f114686488b,4c14835dd913b9ed86fb3a1179f171e8,54762,camaragibe,PE
7736,6e44eda9d6ef9a3b9252f5848a9da93f,e767f34a12740d2fc014485f4dfe2d5b,86800,apucarana,PR
6217,c675a2376ab75226f62b515ef498ba9b,f498e66a64908ad6fce1e90a0c1515ad,31970,belo horizonte,MG


In [55]:
# Geolocation: read the CSV and keep only its first 10,000 rows.
data_geolocation = pd.read_csv('./data/olist_geolocation_dataset.csv').head(10000)
print(data_geolocation.shape)
data_geolocation.sample(n=3)

(10000, 5)


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
6195,1259,-23.540911,-46.684967,sao paulo,SP
9491,1310,-23.563258,-46.654166,sao paulo,SP
6921,1223,-23.543805,-46.649205,são paulo,SP


In [56]:
# Order items: read the CSV and keep only its first 10,000 rows.
data_order_items = pd.read_csv('./data/olist_order_items_dataset.csv').head(10000)
print(data_order_items.shape)
data_order_items.sample(n=3)

(10000, 7)


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
1554,03952349aca310eeba691019501388de,1,47fc9ac35ea613a0fc62033750123d2f,7142540dd4c91e2237acb7e911c4eba2,2017-09-18 02:35:53,84.9,17.84
2455,05861bbf66a8cc3d5cdb6db76fece371,1,4c77d14c542fcd857dddc3df2e82329c,4c498c7345e89aebad651544829beca0,2018-04-24 05:11:27,249.88,8.61
1498,03705afa0a4aabe1a734ddb11c1df3f0,1,b40ec43bdfc6d6fdd65e882066a5c895,897060da8b9a21f655304d50fd935913,2017-10-03 08:49:34,177.0,27.78


In [57]:
# Payments: read the CSV and keep only its first 10,000 rows.
data_payments = pd.read_csv('./data/olist_order_payments_dataset.csv').head(10000)
print(data_payments.shape)
data_payments.sample(n=3)

(10000, 5)


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
6969,03f51c9adce893f63936c5146701aa2f,1,credit_card,2,370.57
7650,72074725a693246f2057f0d9e1c4398e,1,credit_card,10,426.26
6999,78edd1ffa7d6932c983c695fbaf976e0,1,credit_card,4,96.88


In [58]:
# Reviews: read the CSV and keep only its first 10,000 rows.
data_reviews = pd.read_csv('./data/olist_order_reviews_dataset.csv').head(10000)
print(data_reviews.shape)
data_reviews.sample(n=3)

(10000, 7)


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
3353,ac2722597127c9c37a5c891e07537d96,13b405c01103fcb4ab199d6a8f7825ef,3,,,2018-02-08 00:00:00,2018-02-08 02:51:41
6336,f346435cf91d5abff8d8865e18240503,25d7504420f738317dcbb29f7e565976,5,,,2018-04-24 00:00:00,2018-04-25 10:57:37
3431,2a4866490609bc97a9389a92f1e2a227,577285c10c95871f906ac5ba6d9d902b,5,,,2018-06-22 00:00:00,2018-06-22 17:01:42


In [59]:
# Orders: read the CSV and keep only its first 10,000 rows.
data_orders = pd.read_csv('./data/olist_orders_dataset.csv').head(10000)
print(data_orders.shape)
data_orders.sample(n=3)

(10000, 8)


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
283,c4d2f075fc3bf3a06d1e68e113c35cbd,aa6edca5d83d4c8e9a2262b9847febd6,delivered,2018-08-01 14:58:41,2018-08-01 15:31:17,2018-08-02 16:46:00,2018-08-15 00:18:53,2018-08-27 00:00:00
8851,61ce184c1c73abf33b18c98b0dff1229,7a704fc433bd27f103e167b4898c831d,delivered,2018-07-04 19:48:39,2018-07-05 16:21:59,2018-07-05 15:17:00,2018-07-11 20:28:44,2018-07-31 00:00:00
3478,38d7ca365470a06bd907802a6a62d83a,d3558802f902aaa861259024d95372f8,delivered,2017-11-21 18:35:19,2017-11-21 18:47:27,2017-11-22 22:48:51,2017-12-06 01:48:04,2017-12-11 00:00:00


In [60]:
# Products: read the CSV and keep only its first 10,000 rows.
data_products = pd.read_csv('./data/olist_products_dataset.csv').head(10000)
print(data_products.shape)
data_products.sample(n=3)

(10000, 9)


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
659,29ea1bc54f78267f6e85b9c99c93e12a,construcao_ferramentas_construcao,58.0,339.0,1.0,7100.0,29.0,8.0,29.0
620,e619ceb6b2a4196772ddaa5c6e0368a7,moveis_decoracao,52.0,646.0,1.0,400.0,16.0,30.0,20.0
3757,72c271a278c54c171d6d007eeac8cf24,papelaria,53.0,696.0,5.0,350.0,26.0,36.0,11.0


In [61]:
# Sellers: the CSV is loaded in full (no row cap is applied here).
data_sellers = pd.read_csv('./data/olist_sellers_dataset.csv')
print(data_sellers.shape)
data_sellers.sample(n=3)

(3095, 4)


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
351,671585f5a2af58b6e276bc01003c0d2b,80510,curitiba,PR
1469,b45d7ebfb7378630f1bcd74b6ff86ff7,30285,belo horizonte,MG
2406,eaf6d55068dea77334e8477d3878d89e,4660,sao paulo,SP


In [62]:
# Category-name translations: the CSV is loaded in full (no row cap is applied here).
data_product_category = pd.read_csv('./data/product_category_name_translation.csv')
print(data_product_category.shape)
data_product_category.sample(n=3)

(71, 2)


Unnamed: 0,product_category_name,product_category_name_english
17,fashion_bolsas_e_acessorios,fashion_bags_accessories
61,musica,music
49,construcao_ferramentas_seguranca,construction_tools_safety


The relationships between the tables are shown in the following ERD (Entity Relationship Diagram).

<img src="./img/entity_relationship_diagram.png">

In [63]:

# Create the database with the function defined above and keep the returned
# connection and cursor so the following cells can execute queries.
conn, cur = create_database()

DataBase created name - "data_modeling"


<img src="./img/tables.png">

In [64]:
# Load each dataset into the table created for it in the previous step,
# starting with the customers: one parameterized INSERT per DataFrame row.
insert_query = ('''INSERT INTO customer(
    customer_id,
    customer_unique_id,
    customer_zip_code_prefix,
    customer_city,
    customer_state)
    VALUES (%s, %s, %s, %s, %s)''')
for customer_values in data_customers.itertuples(index=False, name=None):
    cur.execute(insert_query, customer_values)

<img src="./img/customer_table.png">

In [65]:
# Geolocation rows: one parameterized INSERT per DataFrame row.
insert_query = ('''INSERT INTO geolocation(
    geolocation_zip_code_prefix,
    geolocation_lat,
    geolocation_lng,
    geolocation_city,
    geolocation_state)
    VALUES (%s, %s, %s, %s, %s)''')
for geolocation_values in data_geolocation.itertuples(index=False, name=None):
    cur.execute(insert_query, geolocation_values)

<img src="./img/geolocation_table.png">

In [66]:
# Order-item rows: one parameterized INSERT per DataFrame row.
insert_query = ('''INSERT INTO order_items(
    order_id,
    order_item_id,
    product_id,
    seller_id,
    shipping_limit_date,
    price,
    freight_value)
    VALUES (%s, %s, %s, %s, %s, %s, %s)''')
for order_item_values in data_order_items.itertuples(index=False, name=None):
    cur.execute(insert_query, order_item_values)

<img src="./img/order_items_table.png">

In [67]:
# Payment rows: one parameterized INSERT per DataFrame row.
insert_query = ('''INSERT INTO payments(
    order_id,
    payment_sequential,
    payment_type,
    payment_installments,
    payment_value)
    VALUES (%s, %s, %s, %s, %s)''')
for payment_values in data_payments.itertuples(index=False, name=None):
    cur.execute(insert_query, payment_values)

<img src="./img/payments_table.png">

In [68]:
# Review rows: one parameterized INSERT per DataFrame row.
insert_query = ('''INSERT INTO reviews(
    review_id,
    order_id,
    review_score,
    review_comment_title,
    review_comment_message,
    review_creation_date,
    review_answer_timestamp)
    VALUES (%s, %s, %s, %s, %s, %s, %s)''')
# Empty CSV fields (e.g. missing comment title/message) are read by pandas as
# NaN. Passed as-is they would be stored as the value 'NaN' rather than SQL
# NULL, so every missing value is mapped to None, which psycopg2 sends as NULL.
for review_values in data_reviews.itertuples(index=False, name=None):
    cur.execute(
        insert_query,
        [None if pd.isna(value) else value for value in review_values],
    )

<img src="./img/reviews_table.png">

In [69]:
# Order rows: one parameterized INSERT per DataFrame row.
insert_query = ('''INSERT INTO orders(
    order_id,
    customer_id,
    order_status,
    order_purchase_timestamp,
    order_approved_at,
    order_delivered_carrier_date,
    order_delivered_customer_date,
    order_estimated_delivery_date)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)''')
# Any empty CSV field is read by pandas as NaN. Passed as-is it would be
# stored as the value 'NaN' rather than SQL NULL, so every missing value is
# mapped to None, which psycopg2 sends as NULL.
for order_values in data_orders.itertuples(index=False, name=None):
    cur.execute(
        insert_query,
        [None if pd.isna(value) else value for value in order_values],
    )

<img src="./img/orders_table.png">

In [70]:
# Product rows: one parameterized INSERT per DataFrame row.
insert_query = ('''INSERT INTO products(
    product_id,
    product_category_name,
    product_name_lenght,
    product_description_lenght,
    product_photos_qty,
    product_weight_g,
    product_length_cm,
    product_height_cm,
    product_width_cm)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)''')
# Any empty CSV field is read by pandas as NaN. Passed as-is it would be
# stored as the value 'NaN' (text in VARCHAR columns, NaN in FLOAT columns)
# rather than SQL NULL, so every missing value is mapped to None, which
# psycopg2 sends as NULL.
for product_values in data_products.itertuples(index=False, name=None):
    cur.execute(
        insert_query,
        [None if pd.isna(value) else value for value in product_values],
    )

<img src="./img/products_table.png">

In [71]:
# Seller rows: one parameterized INSERT per DataFrame row.
insert_query = ('''INSERT INTO sellers(
    seller_id,
    seller_zip_code_prefix,
    seller_city,
    seller_state)
    VALUES (%s, %s, %s, %s)''')
for seller_values in data_sellers.itertuples(index=False, name=None):
    cur.execute(insert_query, seller_values)

<img src="./img/sellers_table.png">

In [72]:
# Category-translation rows: one parameterized INSERT per DataFrame row.
insert_query = ('''INSERT INTO product_category(
    product_category_name,
    product_category_name_english)
    VALUES (%s, %s)''')
for category_values in data_product_category.itertuples(index=False, name=None):
    cur.execute(insert_query, category_values)

<img src="./img/product_category_table.png">

In [73]:
# Release the database resources: close the cursor first, then the
# connection it was created from.
cur.close()
conn.close()

In [74]:
# Dropping the database: there is no need to keep it after the demo.
# Connect to the default "postgres" database for this, since the session
# must not be connected to the database it is about to drop.
conn = psycopg2.connect('host = localhost dbname = postgres user = postgres password = root')
try:
    # DROP DATABASE cannot run inside a transaction block, hence autocommit.
    conn.set_session(autocommit=True)
    cur = conn.cursor()

    # IF EXISTS keeps this cell re-runnable once the database is already gone.
    cur.execute('DROP DATABASE IF EXISTS data_modeling;')
    cur.close()
finally:
    # Always release the connection, even if the drop fails.
    conn.close()

# Bibliography

- https://www.postgresqltutorial.com/
- https://launchschool.com/books/sql_first_edition
- https://launchschool.com/books/sql