# Proyecto X

## Lectura de los datos y exploración de los encabezados de los archivos.

In [1]:
import pandas as pd

In [2]:
geolocation = pd.read_csv('/content/Copia de ecommerce_geolocation_dataset.csv')
customers = pd.read_csv('/content/Copia de ecommerce_customers_dataset.csv')
order_items = pd.read_csv('/content/Copia de ecommerce_order_items_dataset.csv')
order_payments = pd.read_csv('/content/Copia de ecommerce_order_payments_dataset.csv')
order_reviews = pd.read_csv('/content/Copia de ecommerce_order_reviews_dataset.csv')
orders = pd.read_csv('/content/Copia de ecommerce_orders_dataset.csv')
products = pd.read_csv('/content/Copia de ecommerce_products_dataset.csv')
sellers = pd.read_csv('/content/Copia de ecommerce_sellers_dataset.csv')
category_names = pd.read_csv('/content/Copia de product_category_name_translation.csv')

In [3]:
print(geolocation.columns.tolist())

['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng', 'geolocation_city', 'geolocation_state']


In [4]:
print(customers.columns.tolist())

['customer_id', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state']


In [5]:
print(order_items.columns.tolist())

['order_id', 'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date', 'price', 'freight_value']


In [6]:
print(order_payments.columns.tolist())

['order_id', 'payment_sequential', 'payment_type', 'payment_installments', 'payment_value']


In [7]:
print(order_reviews.columns.tolist())

['review_id', 'order_id', 'review_score', 'review_comment_title', 'review_comment_message', 'review_creation_date', 'review_answer_timestamp']


In [8]:
print(orders.columns.tolist())

['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']


In [9]:
print(products.columns.tolist())

['product_id', 'product_category_name', 'product_name_lenght', 'product_description_lenght', 'product_photos_qty', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']


In [10]:
print(sellers.columns.tolist())

['seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state']


In [11]:
print(category_names.columns.tolist())

['product_category_name', 'product_category_name_english']


## BBT1-14 Creación de la base de datos y carga de datos

**Descripción**
Crear una conexión a la base de datos de SQLite llamada ecommerce.db
Importar los archivos y Guardar los DataFrames en la base de datos como tablas:

ecommerce_customers_dataset.csv

ecommerce_order_items_dataset.csv

ecommerce_order_payments_dataset.csv

ecommerce_order_reviews_dataset.csv

ecommerce_orders_dataset.csv

ecommerce_products_dataset.csv

ecommerce_sellers_dataset.csv

product_category_name_translation.csv


In [12]:
import sqlite3 as sql


#Conexion a la base de datos
conn = sql.connect('ecommerce.db')
cursor = conn.cursor()

#script de creacion de tablas
script = '''
CREATE TABLE products (
    product_id TEXT PRIMARY KEY,
    product_category_name TEXT,
    product_name_lenght REAL,
    product_description_lenght REAL,
    product_photos_qty REAL,
    product_weight_g REAL,
    product_length_cm REAL,
    product_height_cm REAL,
    product_width_cm REAL
);
CREATE TABLE order_items (
    order_id TEXT,
    order_item_id INTEGER,
    product_id TEXT,
    seller_id TEXT,
    shipping_limit_date TEXT,
    price REAL,
    freight_value REAL,
    PRIMARY KEY (order_id, order_item_id)
);
CREATE TABLE order_payments (
    order_id TEXT,
    payment_sequential INTEGER,
    payment_type TEXT,
    payment_installments INTEGER,
    payment_value REAL,
    PRIMARY KEY(order_id,payment_sequential)
);
CREATE TABLE orders (
    order_id TEXT PRIMARY KEY,
    customer_id TEXT,
    order_status TEXT,
    order_purchase_timestamp TEXT,
    order_approved_at TEXT,
    order_delivered_carrier_date TEXT,
    order_delivered_customer_date TEXT,
    order_estimated_delivery_date TEXT
);
CREATE TABLE customers (
    customer_id TEXT PRIMARY KEY,
    customer_unique_id TEXT,
    customer_zip_code_prefix INTEGER,
    customer_city TEXT,
    customer_state TEXT
);
CREATE TABLE geolocation(
  geolocation_zip_code_prefix INTEGER PRIMARY KEY ,
  geolocation_lat REAL,
  geolocation_lng REAL,
  geolocation_city TEXT,
  geolocation_state TEXT
);
CREATE TABLE order_review(
  review_id TEXT,
  order_id TEXT,
  review_score INT,
  review_comment_title TEXT,
  review_comment_message TEXT,
  review_creation_date TEXT,
  review_answer_timestamp TEXT
  PRIMARY KEY(order_id,review_id)
);
CREATE TABLE sellers(
  seller_id TEXT PRIMARY KEY,
  seller_zip_code_prefix INTEGER,
  seller_city TEXT,
  seller_state TEXT
);
CREATE TABLE category(
  product_category_name TEXT,
  product_category_name_english TEXT
);
'''
cursor.executescript(script)
conn.commit()


OperationalError: ignored

Inserción de los datos de los csv a la base de datos

In [13]:
geolocation.to_sql('geolocation',conn, index=True, if_exists='replace')
customers.to_sql('customers',conn, index=True, if_exists='replace')
order_items.to_sql('order_items',conn, index=True, if_exists = 'replace')
order_payments.to_sql('order_payments',conn, index=True, if_exists = 'replace')
order_reviews.to_sql('order_review',conn, index=True, if_exists = 'replace')
orders.to_sql('orders',conn, index=True, if_exists = 'replace')
products.to_sql('products',conn, index=True, if_exists = 'replace')
sellers.to_sql('sellers',conn, index=True, if_exists = 'replace')
category_names.to_sql('category',conn, index=True, if_exists = 'replace')

71

Creación de tabla geolocation2 con latitudes y longitudes promedio

Con el fin de evitar la duplicidad de datos usando el criterio de promediar latitudes y longitudes agrupando por código postal.

In [14]:
query_geo = '''
  SELECT geolocation_zip_code_prefix, AVG(geolocation_lat) as promedio_latitud, AVG(geolocation_lng) as promedio_longitud, geolocation_city, geolocation_state
  FROM geolocation
  GROUP BY geolocation_zip_code_prefix
  ORDER BY geolocation_zip_code_prefix;
  '''
geo_2 = pd.read_sql_query(query_geo, conn)
geo_2

geo_2.set_index('geolocation_zip_code_prefix', inplace=True)

In [15]:
geo_2.to_sql('geolocation2', conn, index=True, if_exists='replace')

19015

Creación de tabla geolocation2 con latitudes y longitudes promedio

Con el fin de evitar la duplicidad de datos usando el criterio de promediar latitudes y longitudes agrupando por código postal.

In [16]:
query_geo = '''
  SELECT geolocation_zip_code_prefix, AVG(geolocation_lat) as promedio_latitud, AVG(geolocation_lng) as promedio_longitud, geolocation_city, geolocation_state
  FROM geolocation
  GROUP BY geolocation_zip_code_prefix
  ORDER BY geolocation_zip_code_prefix;
  '''
geo_2 = pd.read_sql_query(query_geo, conn)
print(geo_2.head(5))

geo_2.set_index('geolocation_zip_code_prefix', inplace=True)

   geolocation_zip_code_prefix  promedio_latitud  promedio_longitud  \
0                         1001        -23.550190         -46.634024   
1                         1002        -23.548146         -46.634979   
2                         1003        -23.548994         -46.635731   
3                         1004        -23.549799         -46.634757   
4                         1005        -23.549456         -46.636733   

  geolocation_city geolocation_state  
0        sao paulo                SP  
1        sao paulo                SP  
2        sao paulo                SP  
3        sao paulo                SP  
4        sao paulo                SP  


In [17]:
geo_2.to_sql('geolocation2', conn, index=True, if_exists='replace')

19015

## BBT1-18 - Extracción de datos desde las tablas customers y orders

**Descripción**

COMO: desarrollador

QUIERO: Crear un DataFrame que contenga el JOIN de la tabla customers y tabla orders.

PARA: Extraer los datos de clientes con sus ordenes

In [18]:
query_1 = '''
SELECT c.customer_unique_id,c.customer_zip_code_prefix,c.customer_city, c.customer_state,
      o.customer_id,o.order_status,o.order_purchase_timestamp,o.order_approved_at,o.order_delivered_carrier_date ,o.order_delivered_customer_date ,o.order_estimated_delivery_date
FROM customers AS c
JOIN orders as o ON c.customer_id = o.customer_id;

'''
df_customers_orders = pd.read_sql_query(query_1, conn)

df_customers_orders.head(5)

Unnamed: 0,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,06b8999e2fba1a1fbc88172c00ba8bc7,delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-23 10:47:57,2017-05-25 10:35:35,2017-06-05 00:00:00
1,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,18955e83d337fd6b2def6b18a428ac77,delivered,2018-01-12 20:48:24,2018-01-12 20:58:32,2018-01-15 17:14:59,2018-01-29 12:41:19,2018-02-06 00:00:00
2,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,4e7b3e00288586ebd08712fdd0374a03,delivered,2018-05-19 16:07:45,2018-05-20 16:19:10,2018-06-11 14:31:00,2018-06-14 17:58:51,2018-06-13 00:00:00
3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,b2b6027bc5c5109e529d4dc6358b12c3,delivered,2018-03-13 16:06:38,2018-03-13 17:29:19,2018-03-27 23:22:42,2018-03-28 16:04:25,2018-04-10 00:00:00
4,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,4f2d8ab171c80ec8364f7c12e35b23ad,delivered,2018-07-29 09:51:30,2018-07-29 10:10:09,2018-07-30 15:16:00,2018-08-09 20:55:48,2018-08-15 00:00:00


## BBT1-19  Extracción de datos desde las tablas orders y orders_items

**Descripción**

COMO: desarrollador

QUIERO: Crear un DataFrame que contenga el JOIN de la tabla orders y tabla orders_items.

PARA: Extraer los datos de ordenes con sus items

In [19]:
query_2 = '''
SELECT o.customer_id,o.order_status,o.order_purchase_timestamp,o.order_approved_at,o.order_delivered_carrier_date ,o.order_delivered_customer_date ,o.order_estimated_delivery_date,
       oi.order_id, oi.order_item_id , oi.product_id , oi.seller_id , oi.shipping_limit_date , oi.price  , oi.freight_value
FROM orders AS o
JOIN order_items AS oi ON o.order_id = oi.order_id;
'''
df_orders_ordersitems = pd.read_sql_query(query_2,conn)
df_orders_ordersitems.head(5)

Unnamed: 0,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,e481f51cbdc54678b7cc49136f2d6af7,1,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,2017-10-06 11:07:15,29.99,8.72
1,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,53cdb2fc8bc7dce0b6741e2150273451,1,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,2018-07-30 03:24:27,118.7,22.76
2,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,47770eb9100c2d0c44946d9cf07ec65d,1,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,2018-08-13 08:55:23,159.9,19.22
3,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00,949d5b44dbf5de918fe9c16f97b45f8a,1,d0b61bfb1de832b15ba9d266ca96e5b0,66922902710d126a0e7d26b0e3805106,2017-11-23 19:45:59,45.0,27.2
4,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00,ad21c59c0840e6cb83a9ceb5573f8159,1,65266b2da20d04dbe00c5c2d3bb7859e,2c9e548be18521d1c43cde1c582c6de8,2018-02-19 20:31:37,19.9,8.72


## BBT1-20  Extracción de datos desde las tablas orders_items y sellers

**Descripción**

COMO: desarrollador

QUIERO: Crear un DataFrame que contenga el JOIN de la tabla orders_items y tabla sellers.

PARA: Extraer los datos de items con por sellers

In [20]:
query_3 = '''
SELECT oi.order_id, oi.order_item_id , oi.product_id , oi.seller_id , oi.shipping_limit_date , oi.price  , oi.freight_value,
        s.seller_zip_code_prefix, s.seller_city, s.seller_state
FROM order_items AS oi
JOIN sellers AS s ON oi.seller_id = s.seller_id;
'''
df_orderitems_sellers = pd.read_sql_query(query_3, conn)
df_orderitems_sellers.head(5)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,seller_zip_code_prefix,seller_city,seller_state
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,27277,volta redonda,SP
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93,3471,sao paulo,SP
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87,37564,borda da mata,MG
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79,14403,franca,SP
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14,87900,loanda,PR


## BBT1-21  Extracción de datos desde las tablas orders_items y products

**Descripción**
COMO: desarrollador

QUIERO: Crear un DataFrame que contenga el JOIN de la tabla orders_items y tabla products.

PARA: Extraer los datos de items con por categorias

In [21]:
BBT1_21_query = '''
SELECT oi.*, p.*
FROM order_items oi
JOIN products p ON oi.product_id = p.product_id
ORDER BY p.product_category_name ASC;
'''

BBT1_21 = pd.read_sql_query(BBT1_21_query, conn)
BBT1_21.head(5)

Unnamed: 0,index,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,index.1,product_id.1,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,123,0046e1d57f4c07c8c92ab26be8c3dfc0,1,ff6caf9340512b8bf6d2a2a6df032cfa,38e6dada03429a47197d5d584d793b41,2017-10-02 15:49:17,7.79,7.78,12613,ff6caf9340512b8bf6d2a2a6df032cfa,,,,,200.0,16.0,5.0,12.0
1,125,00482f2670787292280e0a8153d82467,1,a9c404971d1a5b1cbc2e4070e02731fd,702835e4b785b67a084280efca355756,2017-02-17 16:18:07,7.6,10.96,25314,a9c404971d1a5b1cbc2e4070e02731fd,,,,,700.0,35.0,14.0,11.0
2,132,004f5d8f238e8908e6864b874eda3391,1,5a848e4ab52fd5445cdc07aab1c40e48,c826c40d7b19f62a09e2d7c5e7295ee2,2018-03-06 09:29:25,122.99,15.61,29568,5a848e4ab52fd5445cdc07aab1c40e48,,,,,400.0,20.0,12.0,15.0
3,142,0057199db02d1a5ef41bacbf41f8f63b,1,41eee23c25f7a574dfaf8d5c151dbb12,e5a3438891c0bfdb9394643f95273d8e,2018-01-25 09:07:51,20.3,16.79,30674,41eee23c25f7a574dfaf8d5c151dbb12,,,,,200.0,16.0,2.0,11.0
4,171,006cb7cafc99b29548d4f412c7f9f493,1,e10758160da97891c2fdcbc35f0f031d,323ce52b5b81df2cd804b017b7f09aa7,2018-02-22 13:35:28,56.0,14.14,244,e10758160da97891c2fdcbc35f0f031d,,,,,2200.0,16.0,2.0,11.0


## BBT1-22 Extracción de datos desde las tablas order y payments

**Descripción**
COMO: desarrollador

QUIERO: Crear un DataFrame que contenga el JOIN de la tabla orders y tabla payments.  

PARA: Extraer los datos de órdenes con sus pagos asociados

In [22]:
BBT1_22_query = '''
  SELECT o.*, op.*
  FROM orders o
  JOIN order_payments op ON o.order_id = op.order_id;
'''

BBT1_22 = pd.read_sql_query(BBT1_22_query, conn)
BBT1_22.head(5)

Unnamed: 0,index,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,index.1,order_id.1,payment_sequential,payment_type,payment_installments,payment_value
0,0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,10770,e481f51cbdc54678b7cc49136f2d6af7,1,credit_card,1,18.12
1,0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,44246,e481f51cbdc54678b7cc49136f2d6af7,3,voucher,1,2.0
2,0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,91130,e481f51cbdc54678b7cc49136f2d6af7,2,voucher,1,18.59
3,1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,7652,53cdb2fc8bc7dce0b6741e2150273451,1,boleto,1,141.46
4,2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,11176,47770eb9100c2d0c44946d9cf07ec65d,1,credit_card,3,179.12


## BBT1-23 Extracción de datos desde las tablas customers y geolocation2

**Descripción**
COMO: desarrollador

QUIERO: Crear un DataFrame que contenga el JOIN de la tabla customers y tabla geolocation.
  
PARA: Extraer los datos de customers con sus latitudes y longitudes

In [23]:
query_bbt1_23 = '''
  SELECT customer_unique_id, geolocation_zip_code_prefix, promedio_latitud, promedio_longitud
  FROM customers c
  JOIN geolocation2 g
  ON g.geolocation_zip_code_prefix = c.customer_zip_code_prefix
  GROUP BY customer_unique_id
  ORDER BY customer_zip_code_prefix
  '''
cust_geo = pd.read_sql_query(query_bbt1_23, conn)
cust_geo.head(5)

Unnamed: 0,customer_unique_id,geolocation_zip_code_prefix,promedio_latitud,promedio_longitud
0,0c1a20644f0dc126c3eaff8dbc1bd12c,1003,-23.548994,-46.635731
1,095e7c124c5c1ccb1eb9f731152eae6a,1004,-23.549799,-46.634757
2,968f6d2f674977d88a4b445a5117ccd8,1004,-23.549799,-46.634757
3,57f0ea1c7f6b9ef8615c0a0b8f06fe57,1005,-23.549456,-46.636733
4,84a7776f914ff19505e44effba86455f,1005,-23.549456,-46.636733


## BBT1-24 Extracción de datos desde las tablas orders y geolocation2

**Descripción**
COMO: desarrollador

QUIERO: Crear un DataFrame que contenga el JOIN de la tabla orders y tabla geolocation.
  
PARA: Extraer los datos de órdenes con sus latitudes y longitudes

In [24]:
query_bbt1_24 = '''
  SELECT order_id, geolocation_zip_code_prefix, promedio_latitud, promedio_longitud
  FROM customers c
  JOIN orders o
  ON c.customer_id = o.customer_id
  JOIN geolocation2 g
  ON g.geolocation_zip_code_prefix = c.customer_zip_code_prefix
  ORDER BY geolocation_zip_code_prefix
  LIMIT 10
  '''
orders_geo = pd.read_sql_query(query_bbt1_24, conn)
orders_geo.head(5)

Unnamed: 0,order_id,geolocation_zip_code_prefix,promedio_latitud,promedio_longitud
0,d454d6650d375ebc3f9667a4d2fe161c,1003,-23.548994,-46.635731
1,3fe4ba391eeff167bfdda2c590013b02,1004,-23.549799,-46.634757
2,ed415dd934c5bb15a4a2fd223bb0a43e,1004,-23.549799,-46.634757
3,08cae26133a5501cdf7b13db6bcf39e6,1005,-23.549456,-46.636733
4,ae28255c7b837ba319c9cbf1dfeeea0d,1005,-23.549456,-46.636733


## BBT1-25 Extracción de datos desde las tablas orders y reviews

**Descripción**

COMO: desarrollador

QUIERO: Crear un DataFrame que contenga el JOIN de la tabla orders y tabla reviews.

PARA: Extraer los datos de órdenes con sus scores

In [25]:
query_bbt1_25 = '''
  SELECT o.order_id, review_score
  FROM orders o
  JOIN order_review r
  ON o.order_id = r.order_id
  ORDER BY review_score
  LIMIT 10
  '''
orders_score = pd.read_sql_query(query_bbt1_25, conn)
orders_score.head(5)

Unnamed: 0,order_id,review_score
0,76c6e866289321a7c93b82b54852dc33,1
1,e6ce16cb79ec1d90b1da9085a6118aeb,1
2,acce194856392f074dbf9dada14d8d82,1
3,1790eea0b567cf50911c057cf20f90f9,1
4,6ea2f835b4556291ffdc53fa0b3b95e8,1


## BT1-26 Extracción de datos desde las tablas orders_items, products y products_category_translation

**Descripción**

COMO: desarrollador

QUIERO: Crear un DataFrame que contenga el JOIN de la tabla orders_items, tabla products y product_category_name_translation.  

PARA: Extraer los datos de items con por categorias pero con su nombre traducido en inglés

In [26]:
bt1_26_query = '''
    SELECT oi.*, c.product_category_name_english AS product_name
    FROM order_items AS oi
    INNER JOIN products AS p
        ON oi.product_id = p.product_id
    INNER JOIN category AS c
        ON p.product_category_name = c.product_category_name_english
;
'''
BT1_26 = pd.read_sql_query(bt1_26_query, conn)
BT1_26.head(5)

Unnamed: 0,index,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,product_name
0,0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,cool_stuff
1,1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93,pet_shop
2,16,0009c9a17f916a706d71784483a5d643,1,3f27ac8e699df3d300ec4a5d8c5cf0b2,fcb5ace8bcc92f75707dc0f01a27d269,2018-05-02 09:31:53,639.0,11.34,consoles_games
3,19,000e562887b1f2006d75e0be9558292e,1,5ed9eaf534f6936b51d0b6c5e4d5c2e9,8cbac7e12637ed9cffa18c7875207478,2018-02-28 12:08:37,25.0,16.11,cool_stuff
4,24,0010b2e5201cc5f1ae7e9c6cc8f5bd00,1,5a419dbf24a8c9718fe522b81c69f61a,3504c0cb71d7fa48d967e0e4c94d59d9,2017-09-15 18:04:37,48.9,16.6,cool_stuff


## BT1_27  Extracción de datos desde las tablas orders, payments y customers

**Descripción**

COMO: desarrollador

QUIERO: Crear un DataFrame que contenga el JOIN de la tabla orders, tabla payments y customers.
  
PARA: Extraer los datos de clientes con sus métodos de pagos por ciudad.

In [27]:
bt1_27_query = '''
    SELECT c.customer_unique_id, op.payment_type, c.customer_city, c.customer_state
    FROM customers AS c
    INNER JOIN orders AS o
        ON c.customer_id = o.customer_id
    INNER JOIN order_payments AS op
        ON o.order_id = op.order_id

;
'''
BT1_27 = pd.read_sql_query(bt1_27_query, conn)

BT1_27.groupby(['payment_type'])[['customer_unique_id']].count().sort_values(by='customer_unique_id', ascending=False)

BT1_27.head(5)

Unnamed: 0,customer_unique_id,payment_type,customer_city,customer_state
0,861eff4711a542e4b93843c6dd7febb0,credit_card,franca,SP
1,290c77bc529b7ac935b93aa66c333dc3,credit_card,sao bernardo do campo,SP
2,060e732b5b29e8181a18229c7b0b2b5e,credit_card,sao paulo,SP
3,259dac757896d24d7702b9acbbff3f3c,credit_card,mogi das cruzes,SP
4,345ecd01c38d18a9036ed96c73b8d066,credit_card,campinas,SP


## BBT1-28 Filtrar datos, reducir volumen de customers y orders.

**Descripción**

COMO: desarrollador

QUIERO: Filtrar el DataFrame de clientes y órdenes para resguardar las 3 órdenes más actuales de cada cliente.

PARA: Reducir el volumen de datos y trabajar solo con las últimas órdenes de los clientes.

In [28]:
# Renombrar la columna 'index' a 'customer_order_index' en el DataFrame
df_customers_orders.rename(columns={'index': 'customer_order_index'}, inplace=True)

# Guardar el DataFrame en la base de datos sin incluir el índice
df_customers_orders.to_sql('customers_orders', conn, if_exists='replace', index=False)


99441

In [29]:
query_2 = '''
WITH ComprasClientes AS (
    SELECT
        c.customer_unique_id,
        o.order_id,
        ROW_NUMBER() OVER (PARTITION BY c.customer_unique_id ORDER BY o.order_purchase_timestamp DESC) AS rn
    FROM
        customers AS c
    JOIN
        orders AS o ON c.customer_id = o.customer_id
    JOIN
        order_payments AS op ON o.order_id = op.order_id
    WHERE
        o.order_status != 'canceled'
)
SELECT
    'cliente ' || cc.customer_unique_id || ' compra ' || cc.rn AS cliente_compra,
    cc.order_id
FROM
    ComprasClientes AS cc
WHERE
    cc.rn <= 3
GROUP BY
    cc.customer_unique_id
HAVING
    COUNT(cc.order_id) = 3;


'''

# Ejecutar la consulta y obtener los resultados en un DataFrame
BBT1_31 = pd.read_sql_query(query_2, conn)
BBT1_31.head(5)

Unnamed: 0,cliente_compra,order_id
0,cliente 00a9fd000ff87bb48b989df819c418f5 compra 1,7290f24ffc8f84166426d8727c15af13
1,cliente 00face5c8f7dbb7eefd4112722f26903 compra 1,fefbe15ebcd87ab3fb8577e635a8b31c
2,cliente 012452d40dafae4df401bced74cdb490 compra 1,e49d0607832db7c3b1343764225d8df6
3,cliente 012a218df8995d3ec3bb221828360c86 compra 1,207258d556f9b9784b54de837ae49286
4,cliente 01399b72331afe80cc0c3a982cafb177 compra 1,0ccb5b7bf95c5b642f3717b08c633361


In [31]:

num_filas, num_columnas = BBT1_31.shape

# Imprime el número de filas
print(f'Número de filas: {num_filas}')

Número de filas: 909


## BBT1-29 Filtrar datos, reducir volumen de orders y orders_items

**Descripción**

COMO: desarrollador

QUIERO: Filtrar el DataFrame de órdenes e ítems de órdenes para resguardar los 3 items de mayor importe de compra.

PARA: Reducir el volumen de datos y trabajar solo con los ítems que mas ingreso generan.

In [32]:
# Renombrar la columna 'index' a 'orders_items_index' en el DataFrame
df_orders_ordersitems.rename(columns={'index': 'orders_items_index'}, inplace=True)
df_orders_ordersitems.to_sql('orders_items', conn, if_exists='replace', index=False)

112650

In [33]:
BBT1_29 = '''
WITH TopItems AS (
  SELECT
    oi.order_id,
    oi.order_item_id,
    oi.product_id,
    oi.seller_id,
    oi.shipping_limit_date,
    oi.price,
    oi.freight_value,
    ROW_NUMBER() OVER (PARTITION BY oi.order_id ORDER BY oi.price DESC) AS row_num
  FROM orders_items AS oi
)
SELECT
  o.customer_id,
  o.order_status,
  o.order_purchase_timestamp,
  o.order_approved_at,
  o.order_delivered_carrier_date,
  o.order_delivered_customer_date,
  o.order_estimated_delivery_date,
  ti.order_id,
  ti.order_item_id,
  ti.product_id,
  ti.seller_id,
  ti.shipping_limit_date,
  ti.price,
  ti.freight_value
FROM TopItems AS ti
JOIN orders AS o ON ti.order_id = o.order_id
WHERE ti.row_num <= 3
ORDER BY ti.price DESC;

'''
bt_29 = pd.read_sql_query(BBT1_29, conn)
bt_29.head(5)


Unnamed: 0,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,c6e2731c5b391845f6800c97401a43a9,delivered,2017-02-12 20:37:36,2017-02-12 20:45:12,2017-02-16 09:23:13,2017-03-03 14:23:18,2017-03-09 00:00:00,0812eb902a67711a1cb742b3cdaa65ae,1,489ae2aa008f021502940f251d4cce7f,e3b4998c7a498169dc7bce44e6bb6277,2017-02-16 20:37:36,6735.0,194.31
1,f48d464a0baaea338cb25f816991ab1f,delivered,2018-07-25 18:10:17,2018-07-27 04:05:13,2018-08-03 14:42:00,2018-08-15 14:57:50,2018-08-10 00:00:00,fefacc66af859508bf1a7934eab1e97f,1,69c590f7ffc7bf8db97190b6cb6ed62e,80ceebb4ee9b31afb6c6a916a574a1e2,2018-08-02 04:05:13,6729.0,193.21
2,3fd6777bbce08a352fddd04e4a7cc8f6,delivered,2017-05-24 18:14:34,2017-05-26 02:45:17,2017-05-26 11:20:47,2017-06-05 17:09:48,2017-06-28 00:00:00,f5136e38d1a14a4dbd87dff67da82701,1,1bdf5e6731585cf01aa8169c7028d6ad,ee27a8f15b1dded4d213a468ba4eb391,2017-06-15 02:45:17,6499.0,227.66
3,df55c14d1476a9a3467f131269c2477f,delivered,2017-04-01 15:58:40,2017-04-03 13:25:18,2017-04-11 15:10:34,2017-04-17 11:04:45,2017-05-02 00:00:00,a96610ab360d42a2e5335a3998b4718a,1,a6492cc69376c469ab6f61d8f44de961,59417c56835dd8e2e72f91f809cd4092,2017-04-18 13:25:18,4799.0,151.34
4,24bbf5fd2f2e1b359ee7de94defc4a15,delivered,2017-04-18 18:50:13,2017-04-19 15:50:15,2017-04-25 13:39:59,2017-05-07 10:42:39,2017-05-15 00:00:00,199af31afc78c699f0dbf71fb178d4d4,1,c3ed642d592594bb648ff4a04cee2747,59417c56835dd8e2e72f91f809cd4092,2017-05-09 15:50:15,4690.0,74.34


In [34]:
resultados = bt_29[bt_29['order_id'] == '199af31afc78c699f0dbf71fb178d4d4']
resultados

Unnamed: 0,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
4,24bbf5fd2f2e1b359ee7de94defc4a15,delivered,2017-04-18 18:50:13,2017-04-19 15:50:15,2017-04-25 13:39:59,2017-05-07 10:42:39,2017-05-15 00:00:00,199af31afc78c699f0dbf71fb178d4d4,1,c3ed642d592594bb648ff4a04cee2747,59417c56835dd8e2e72f91f809cd4092,2017-05-09 15:50:15,4690.0,74.34


## BBT1- 30 Filtrar datos, reducir volumen de orders_items y sellers

**Descripción**

COMO: desarrollador

QUIERO: Filtrar el DataFrame de los vendedores e ítems para resguardar los 3 ítems de mayor importe.
  
PARA: Reducir el volumen de datos y trabajar solo con los ítems que mayor ingreso generan por vendedor.

In [35]:
# Renombrar la columna 'index' a 'orders_items_index' en el DataFrame
df_orderitems_sellers.rename(columns={'index': 'orderitems_sellers_index'}, inplace=True)
df_orderitems_sellers.to_sql('orderitems_sellers', conn, if_exists='replace', index=False)

112650

In [36]:
BBT1_30= '''
WITH TopItemsPerSeller AS (
  SELECT
    s.seller_id,
    oi.order_item_id,
    oi.product_id,
    oi.shipping_limit_date,
    oi.price,
    oi.freight_value,
    ROW_NUMBER() OVER (PARTITION BY s.seller_id ORDER BY oi.price DESC) AS row_num
  FROM orderitems_sellers AS oi
  JOIN sellers AS s ON oi.seller_id = s.seller_id
)
SELECT
  ti.seller_id,
--  oi.order_id,
  ti.order_item_id,
  ti.product_id,
  ti.shipping_limit_date,
  ti.price,
  ti.freight_value,
  s.seller_zip_code_prefix,
  s.seller_city,
  s.seller_state
FROM TopItemsPerSeller AS ti
--JOIN orderitems_sellers AS oi ON ti.order_item_id = oi.order_item_id
JOIN sellers AS s ON ti.seller_id = s.seller_id
WHERE ti.row_num <= 3
ORDER BY ti.seller_id, ti.price DESC;

'''
query_30 = pd.read_sql_query(BBT1_30, conn)
query_30.head(5)

Unnamed: 0,seller_id,order_item_id,product_id,shipping_limit_date,price,freight_value,seller_zip_code_prefix,seller_city,seller_state
0,0015a82c2db000af6aaaf3ae2ecb0532,1,a2ff5a97bf95719e38ea2e3b4105bce8,2017-10-12 22:24:16,895.0,21.02,9080,santo andre,SP
1,0015a82c2db000af6aaaf3ae2ecb0532,1,a2ff5a97bf95719e38ea2e3b4105bce8,2017-10-18 14:49:22,895.0,21.02,9080,santo andre,SP
2,0015a82c2db000af6aaaf3ae2ecb0532,1,a2ff5a97bf95719e38ea2e3b4105bce8,2017-10-24 23:56:20,895.0,21.02,9080,santo andre,SP
3,001cca7ae9ae17fb1caed9dfb1094831,1,e251ebd2858be1aa7d9b2087a6992580,2017-05-05 15:10:21,199.0,17.09,29156,cariacica,ES
4,001cca7ae9ae17fb1caed9dfb1094831,1,98a8c2fa16d7239c606640f5555768e4,2018-04-17 11:10:23,169.0,114.62,29156,cariacica,ES


## BBT1-38 Carga de datos, generacion de archivos .csv de customers y orders

**Descripción**

COMO: desarrollador

QUIERO: Generar archivos .csv a partir del Dataframe generado de customers y orders.

Los archivos deben cumplir los siguientes requerimientos:

*     Contener cabecera.
*   Separador “,”.



Ser guardados en un directorio llamado Results\Analysis_A.


In [41]:
#Nombre del archivo.csv
csv_filename = 'BBT1_38.csv'

# Ruta completa del archivo CSV
csv_path = 'Results/Analysis_A/' + csv_filename

BBT1_31.to_csv('BBT1_38.csv', index=False, sep=',', header=True)

print(f'Archivo CSV guardado en: {csv_path}')

Archivo CSV guardado en: Results/Analysis_A/BBT1_38.csv


## BBT1-39 Carga de datos, generacion de archivos .csv de orders y orders_items

**Descripción**

COMO: desarrollador

QUIERO: Generar archivos .csv a partir del Dataframe generado de orders y orders_items.

Los archivos deben cumplir los siguientes requerimientos:
Contener cabecera.

Separador “,”.

Ser guardados en un directorio llamado Results\Analysis_B.



In [38]:
#Nombre del archivo.csv
csv_filename = 'BBT1_39csv'

# Ruta completa del archivo CSV
csv_path = 'Results/Analysis_B/' + csv_filename

bt_29.to_csv('BBT1_39.csv', index=False, sep=',', header=True)

print(f'Archivo CSV guardado en: {csv_path}')

Archivo CSV guardado en: Results/Analysis_B/BBT1_39csv


## BBT1-40 Carga de datos, generacion de archivos .csv de orders_items y sellers

**Descripción**
COMO: desarrollador

QUIERO: Generar archivos .csv a partir del Dataframe generado de orders_items y sellers.

Los archivos deben cumplir los siguientes requerimientos:

*     Contener cabecera.
*   Separador “,”.



Ser guardados en un directorio llamado Results\Analysis_A.

In [39]:
#Nombre del archivo.csv
csv_filename = 'BBT1_40csv'

# Ruta completa del archivo CSV
csv_path = 'Results/Analysis_C/' + csv_filename

query_30.to_csv('BBT1_40.csv', index=False, sep=',', header=True)

print(f'Archivo CSV guardado en: {csv_path}')

Archivo CSV guardado en: Results/Analysis_C/BBT1_40csv


## BBT1-48 Extraccion de datos desde .csv, análisis A

**Descripción**

COMO: desarrollador

QUIERO: Cargar los datos del análisis A generados en un DataFrame.

PARA: Poder analizar el conjunto de datos.

In [42]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=f8f760fb01ff9c07d444b885c2ff67890f8872e6c532ec1eed68acf1dba142ce
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [43]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [45]:
path_bbt1_38 = '/content/BBT1_38.csv'
bbt1_48_spark = spark.read.load(path_bbt1_38, format='csv', sep=',', inferSchema='true', header='true')
bbt1_48_spark.show(5)

+--------------------+--------------------+
|      cliente_compra|            order_id|
+--------------------+--------------------+
|cliente 00a9fd000...|7290f24ffc8f84166...|
|cliente 00face5c8...|fefbe15ebcd87ab3f...|
|cliente 012452d40...|e49d0607832db7c3b...|
|cliente 012a218df...|207258d556f9b9784...|
|cliente 01399b723...|0ccb5b7bf95c5b642...|
+--------------------+--------------------+
only showing top 5 rows



## BBT1-49 Extraccion de datos desde .csv, análisis B

**Descripción**

COMO: desarrollador

QUIERO: Cargar los datos del análisis B generados en un DataFrame.

PARA: Poder analizar el conjunto de datos.

In [46]:
path_bbt1_49 = '/content/BBT1_39.csv'
bbt1_49_spark = spark.read.load(path_bbt1_49, format='csv', sep=',', inferSchema='true', header='true')
bbt1_49_spark.show(5)

+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date| price|freight_value|
+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|c6e2731c5b391845f...|   delivered|     2017-02-12 20:37:36|2017-02-12 20:45:12|         2017-02-16 09:23:13|          2017-03-03 14:23:

## BBT1-50 Extraccion de datos desde .csv, análisis C

**Descripción**

COMO: desarrollador

QUIERO: Cargar los datos del análisis C generados en un DataFrame.

PARA: Poder analizar el conjunto de datos.

In [47]:
path_bbt1_50 = '/content/BBT1_40.csv'
bbt1_50_spark = spark.read.load(path_bbt1_50, format='csv', sep=',', inferSchema='true', header='true')
bbt1_50_spark.show(5)

+--------------------+-------------+--------------------+-------------------+-----+-------------+----------------------+-----------+------------+
|           seller_id|order_item_id|          product_id|shipping_limit_date|price|freight_value|seller_zip_code_prefix|seller_city|seller_state|
+--------------------+-------------+--------------------+-------------------+-----+-------------+----------------------+-----------+------------+
|0015a82c2db000af6...|            1|a2ff5a97bf95719e3...|2017-10-12 22:24:16|895.0|        21.02|                  9080|santo andre|          SP|
|0015a82c2db000af6...|            1|a2ff5a97bf95719e3...|2017-10-18 14:49:22|895.0|        21.02|                  9080|santo andre|          SP|
|0015a82c2db000af6...|            1|a2ff5a97bf95719e3...|2017-10-24 23:56:20|895.0|        21.02|                  9080|santo andre|          SP|
|001cca7ae9ae17fb1...|            1|e251ebd2858be1aa7...|2017-05-05 15:10:21|199.0|        17.09|                 29156|  ca