In [1]:
import io
import pandas as pd
import numpy as np
import boto3
import psycopg2
import configparser
import mysql.connector as mysqlC
import pymysql

In [2]:
config = configparser.ConfigParser()
config.read('escec.cfg')
RDS_HOST = 'db-rent-cars.chf84lio5m7c.us-east-1.rds.amazonaws.com'

In [3]:
mysql_driver = f"""mysql+pymysql://{config.get('RDS_MYSQL', 'DB_USER')}:{config.get('RDS_MYSQL', 'DB_PASSWORD')}@{RDS_HOST}:{config.get('RDS_MYSQL', 'DB_PORT')}/{config.get('RDS_MYSQL', 'DB_NAME')}"""  

#### Customer dim

In [48]:
sql_query = 'SELECT * FROM customer;'
df_customer = pd.read_sql(sql_query, mysql_driver)
df_customer.head()

Unnamed: 0,id,first_name,last_name,dob,driver_license_number,email,phone
0,1,Kelby,Matterdace,1974-05-22,V435899293,kmatterdace0@oracle.com,181-441-7828
1,2,Orion,De Hooge,1992-08-07,Z140530509,odehooge1@quantcast.com,948-294-5458
2,3,Sheena,Macias,1981-03-10,W045654959,smacias3@amazonaws.com,
3,4,Irving,Packe,1994-12-19,O232196823,ipacke4@cbc.ca,157-815-8064
4,5,Kass,Humphris,1993-12-16,G055017319,khumphris5@xrea.com,510-624-4189


In [49]:
sql_query = 'SELECT * FROM rental;'
df_rental = pd.read_sql(sql_query, mysql_driver)
df_rental.head()

Unnamed: 0,id,start_date,end_date,customer_id,vehicle_type_id,fuel_option_id,pickup_location_id,drop_off_location_id
0,1,2018-07-14,2018-07-23,1,2,1,3,5
1,2,2018-07-10,2018-07-12,2,1,2,1,2
2,3,2018-06-15,2018-07-20,3,1,3,4,6
3,4,2018-06-09,2018-07-02,4,4,2,2,7
4,5,2018-07-24,2018-07-27,5,3,3,5,3


In [50]:
#Agrega date keys
df_rental['start_date_key'] = pd.to_datetime(df_rental['start_date'], format='%Y-%m-%d').dt.strftime('%Y%m%d').astype(int)
df_rental['end_date_key'] = pd.to_datetime(df_rental['end_date'], format='%Y-%m-%d').dt.strftime('%Y%m%d').astype(int)

In [51]:
df_ren_cus = df_rental.merge(df_customer, 
                                                                    left_on='customer_id',
                                                                    right_on='id',
                                                                    how='left',
                                                                    suffixes=('_ren','_cus'))

df_ren_cus.head()

Unnamed: 0,id_ren,start_date,end_date,customer_id,vehicle_type_id,fuel_option_id,pickup_location_id,drop_off_location_id,start_date_key,end_date_key,id_cus,first_name,last_name,dob,driver_license_number,email,phone
0,1,2018-07-14,2018-07-23,1,2,1,3,5,20180714,20180723,1,Kelby,Matterdace,1974-05-22,V435899293,kmatterdace0@oracle.com,181-441-7828
1,2,2018-07-10,2018-07-12,2,1,2,1,2,20180710,20180712,2,Orion,De Hooge,1992-08-07,Z140530509,odehooge1@quantcast.com,948-294-5458
2,3,2018-06-15,2018-07-20,3,1,3,4,6,20180615,20180720,3,Sheena,Macias,1981-03-10,W045654959,smacias3@amazonaws.com,
3,4,2018-06-09,2018-07-02,4,4,2,2,7,20180609,20180702,4,Irving,Packe,1994-12-19,O232196823,ipacke4@cbc.ca,157-815-8064
4,5,2018-07-24,2018-07-27,5,3,3,5,3,20180724,20180727,5,Kass,Humphris,1993-12-16,G055017319,khumphris5@xrea.com,510-624-4189


In [7]:
s3 = boto3.resource(
    service_name = 's3',
    region_name = 'us-east-1',
    aws_access_key_id = config.get('IAM', 'ACCESS_KEY'),
    aws_secret_access_key = config.get('IAM', 'SECRET_ACCESS_KEY')
)

In [8]:
S3_BUCKET_NAME = 'bucket-23000712'
fileList = []
for objt in s3.Bucket(S3_BUCKET_NAME).objects.all():
    fileList.append(objt.key)

fileList

['conversion_divisas.xlsx', 'dim_date.xlsx', 'dim_state.xlsx']

In [52]:
try:
    file = s3.Bucket(S3_BUCKET_NAME).Object('dim_state.xlsx').get()
    data = file['Body'].read()
    df_dim_state = pd.read_excel(io.BytesIO(data), engine='openpyxl')
except Exception as ex:
    print("No es un archivo.")
    print(ex)

df_dim_state.head()

Unnamed: 0,id,state
0,AL,Alabama
1,AK,Alaska
2,AS,American Samoa
3,AZ,Arizona
4,AR,Arkansas


In [53]:
try:
    file = s3.Bucket(S3_BUCKET_NAME).Object('dim_date.xlsx').get()
    data = file['Body'].read()
    df_dim_date = pd.read_excel(io.BytesIO(data), engine='openpyxl')
except Exception as ex:
    print("No es un archivo.")
    print(ex)


# obtiene solo las columnas listadas
df_dim_date = df_dim_date[['date key','full date','day of week','day num in month',	'day name','quarter','year']]

# renombra las columnas
df_dim_date = df_dim_date.rename(columns={'date key':'date_key','full date':'full_date','day of week':'day_of_week','day num in month':'day_num_in_month',	'day name':'day_name','quarter':'quarter','year':'year'})

df_dim_date.head()

Unnamed: 0,date_key,full_date,day_of_week,day_num_in_month,day_name,quarter,year
0,20150101,2015-01-01,4,1,Thursday,1,2015
1,20150102,2015-01-02,5,2,Friday,1,2015
2,20150103,2015-01-03,6,3,Saturday,1,2015
3,20150104,2015-01-04,7,4,Sunday,1,2015
4,20150105,2015-01-05,1,5,Monday,1,2015


In [54]:
sql_query = 'SELECT * FROM location;'
df_location = pd.read_sql(sql_query, mysql_driver)
df_location.head()

Unnamed: 0,id,street_address,city,state,zipcode
0,1,1001 Henderson St,Fort Worth,TX,76102
1,2,300 Reunion Blvd,Dallas,TX,75207
2,3,5911 Blair Rd NW,Washington,DC,20011
3,4,9217 Airport Blvd,Los Angeles,CA,90045
4,5,310 E 64th St,New York,NY,10021


#### Location dim

In [55]:
df_dim_location = df_location.merge(df_dim_state, 
                                                                    left_on='state',
                                                                    right_on='id',
                                                                    how='left',
                                                                    suffixes=('_loc','_sta'))

df_dim_location.head()

Unnamed: 0,id_loc,street_address,city,state_loc,zipcode,id_sta,state_sta
0,1,1001 Henderson St,Fort Worth,TX,76102,TX,Texas
1,2,300 Reunion Blvd,Dallas,TX,75207,TX,Texas
2,3,5911 Blair Rd NW,Washington,DC,20011,DC,Disctrict of Columbia
3,4,9217 Airport Blvd,Los Angeles,CA,90045,CA,California
4,5,310 E 64th St,New York,NY,10021,NY,New York


In [57]:
df_ren_loc = df_ren_cus.merge(df_dim_location, 
                                                                    left_on='pickup_location_id',
                                                                    right_on='id_loc',
                                                                    how='left',
                                                                    suffixes=('_ren','_loc'))

df_ren_loc.head()

Unnamed: 0,id_ren,start_date,end_date,customer_id,vehicle_type_id,fuel_option_id,pickup_location_id,drop_off_location_id,start_date_key,end_date_key,...,driver_license_number,email,phone,id_loc,street_address,city,state_loc,zipcode,id_sta,state_sta
0,1,2018-07-14,2018-07-23,1,2,1,3,5,20180714,20180723,...,V435899293,kmatterdace0@oracle.com,181-441-7828,3,5911 Blair Rd NW,Washington,DC,20011,DC,Disctrict of Columbia
1,2,2018-07-10,2018-07-12,2,1,2,1,2,20180710,20180712,...,Z140530509,odehooge1@quantcast.com,948-294-5458,1,1001 Henderson St,Fort Worth,TX,76102,TX,Texas
2,3,2018-06-15,2018-07-20,3,1,3,4,6,20180615,20180720,...,W045654959,smacias3@amazonaws.com,,4,9217 Airport Blvd,Los Angeles,CA,90045,CA,California
3,4,2018-06-09,2018-07-02,4,4,2,2,7,20180609,20180702,...,O232196823,ipacke4@cbc.ca,157-815-8064,2,300 Reunion Blvd,Dallas,TX,75207,TX,Texas
4,5,2018-07-24,2018-07-27,5,3,3,5,3,20180724,20180727,...,G055017319,khumphris5@xrea.com,510-624-4189,5,310 E 64th St,New York,NY,10021,NY,New York


#### Vehicles dim

In [58]:
sql_query = 'SELECT * FROM vehicle;'
df_vehicle = pd.read_sql(sql_query, mysql_driver)
df_vehicle.head()

Unnamed: 0,id,brand,model,model_year,mileage,color,vehicle_type_id,current_location_id
0,1,Nissan,Versa,2016,65956,white,1,1
1,2,Mitsubishi,Mirage,2017,55864,light blue,1,6
2,3,Chevrolet,Cruze,2017,45796,dark gray,2,5
3,4,Hyundai,Elantra,2018,35479,black,2,1
4,5,Volkswagen,Jetta,2019,2032,light gray,3,3


In [59]:
sql_query = 'SELECT * FROM vehicle_type;'
df_vehicle_type = pd.read_sql(sql_query, mysql_driver)
df_vehicle_type.head()

Unnamed: 0,id,name,rental_value
0,1,Economy,26.77
1,2,Intermediate,29.45
2,3,Standard,34.81
3,4,Economy SUV,37.48


In [62]:
df_dim_vehicle = df_vehicle.merge(df_vehicle_type, 
                                                                    left_on='vehicle_type_id',
                                                                    right_on='id',
                                                                    how='left',
                                                                    suffixes=('_vehicle','_type'))

df_dim_vehicle.head()

Unnamed: 0,id_vehicle,brand,model,model_year,mileage,color,vehicle_type_id,current_location_id,id_type,name,rental_value
0,1,Nissan,Versa,2016,65956,white,1,1,1,Economy,26.77
1,2,Mitsubishi,Mirage,2017,55864,light blue,1,6,1,Economy,26.77
2,3,Chevrolet,Cruze,2017,45796,dark gray,2,5,2,Intermediate,29.45
3,4,Hyundai,Elantra,2018,35479,black,2,1,2,Intermediate,29.45
4,5,Volkswagen,Jetta,2019,2032,light gray,3,3,3,Standard,34.81


In [64]:
df_ren_vehicle = df_ren_loc.merge(df_dim_vehicle, 
                                                                    left_on='vehicle_type_id',
                                                                    right_on='vehicle_type_id',
                                                                    how='left',
                                                                    suffixes=('_ren','_vehicle'))

df_ren_vehicle.head()

Unnamed: 0,id_ren,start_date,end_date,customer_id,vehicle_type_id,fuel_option_id,pickup_location_id,drop_off_location_id,start_date_key,end_date_key,...,id_vehicle,brand,model,model_year,mileage,color,current_location_id,id_type,name,rental_value
0,1,2018-07-14,2018-07-23,1,2,1,3,5,20180714,20180723,...,3,Chevrolet,Cruze,2017,45796,dark gray,5,2,Intermediate,29.45
1,1,2018-07-14,2018-07-23,1,2,1,3,5,20180714,20180723,...,4,Hyundai,Elantra,2018,35479,black,1,2,Intermediate,29.45
2,2,2018-07-10,2018-07-12,2,1,2,1,2,20180710,20180712,...,1,Nissan,Versa,2016,65956,white,1,1,Economy,26.77
3,2,2018-07-10,2018-07-12,2,1,2,1,2,20180710,20180712,...,2,Mitsubishi,Mirage,2017,55864,light blue,6,1,Economy,26.77
4,3,2018-06-15,2018-07-20,3,1,3,4,6,20180615,20180720,...,1,Nissan,Versa,2016,65956,white,1,1,Economy,26.77


#### Date dim

In [65]:
df_ren_date1 = df_ren_vehicle.merge(df_dim_date, 
                                                                    left_on='start_date_key',
                                                                    right_on='date_key',
                                                                    how='left',
                                                                    suffixes=('_ren','_date'))

df_ren_date1.head()

Unnamed: 0,id_ren,start_date,end_date,customer_id,vehicle_type_id,fuel_option_id,pickup_location_id,drop_off_location_id,start_date_key,end_date_key,...,id_type,name,rental_value,date_key,full_date,day_of_week,day_num_in_month,day_name,quarter,year
0,1,2018-07-14,2018-07-23,1,2,1,3,5,20180714,20180723,...,2,Intermediate,29.45,20180714,2018-07-14,6,14,Saturday,3,2018
1,1,2018-07-14,2018-07-23,1,2,1,3,5,20180714,20180723,...,2,Intermediate,29.45,20180714,2018-07-14,6,14,Saturday,3,2018
2,2,2018-07-10,2018-07-12,2,1,2,1,2,20180710,20180712,...,1,Economy,26.77,20180710,2018-07-10,2,10,Tuesday,3,2018
3,2,2018-07-10,2018-07-12,2,1,2,1,2,20180710,20180712,...,1,Economy,26.77,20180710,2018-07-10,2,10,Tuesday,3,2018
4,3,2018-06-15,2018-07-20,3,1,3,4,6,20180615,20180720,...,1,Economy,26.77,20180615,2018-06-15,5,15,Friday,2,2018


In [67]:
df_ren_date2 = df_ren_date1.merge(df_dim_date, 
                                                                    left_on='start_date_key',
                                                                    right_on='date_key',
                                                                    how='left',
                                                                    suffixes=('_ren','_date'))

df_ren_date2.head()

Unnamed: 0,id_ren,start_date,end_date,customer_id,vehicle_type_id,fuel_option_id,pickup_location_id,drop_off_location_id,start_date_key,end_date_key,...,day_name_ren,quarter_ren,year_ren,date_key_date,full_date_date,day_of_week_date,day_num_in_month_date,day_name_date,quarter_date,year_date
0,1,2018-07-14,2018-07-23,1,2,1,3,5,20180714,20180723,...,Saturday,3,2018,20180714,2018-07-14,6,14,Saturday,3,2018
1,1,2018-07-14,2018-07-23,1,2,1,3,5,20180714,20180723,...,Saturday,3,2018,20180714,2018-07-14,6,14,Saturday,3,2018
2,2,2018-07-10,2018-07-12,2,1,2,1,2,20180710,20180712,...,Tuesday,3,2018,20180710,2018-07-10,2,10,Tuesday,3,2018
3,2,2018-07-10,2018-07-12,2,1,2,1,2,20180710,20180712,...,Tuesday,3,2018,20180710,2018-07-10,2,10,Tuesday,3,2018
4,3,2018-06-15,2018-07-20,3,1,3,4,6,20180615,20180720,...,Friday,2,2018,20180615,2018-06-15,5,15,Friday,2,2018
