In [None]:
import get_data_api as gda
import data_enrichment as de
import dim_tables as dim

import clean as cl
import transformation as tr
import create_bbdd as db
import pandas as pd

## -Create db-

In [None]:
# Create the database and the (empty) tables that the later cells fill.
# NOTE(review): what exactly gets created is defined in create_bbdd.main;
# presumably it targets the same ./dd_test.db file used by the insert cells — confirm.
db.main()

## -Get data-

In [None]:
# Fetch the users payload from the JSONPlaceholder demo API.
# NOTE(review): the next cells call .copy() on the result and treat it as a
# DataFrame — confirm that get_all_data returns one.
raw_users = gda.get_all_data('https://jsonplaceholder.typicode.com/users')

In [None]:
# Load the bookings export from disk; the file uses ';' as field delimiter.
raw_bookings = pd.read_csv('raw_bookings.csv', delimiter=';')

In [None]:
# Work on a copy so that raw_users keeps the untouched API response.
users_complete = raw_users.copy()

## -Data enrichment-

### Users data

In [None]:
# Enrich the users table so that it can be matched against the bookings table.
# NOTE(review): 43562 is a hard-coded number whose meaning is not visible here —
# presumably the number of users to generate; confirm against de.random_users
# and consider deriving it from the data instead of hard-coding it.
users_complete = de.random_users(43562, users_complete)

In [None]:
# Add a 'company_id' column to the users frame.
# The return value is discarded, so this cell relies on de.create_id modifying
# users_complete in place — TODO confirm.
de.create_id(users_complete, 'company_id')

In [None]:
# Keep company data ('company_id' / 'company') for only part of the users.
# NOTE(review): the original comment said "keep only 60%" while the value passed
# is 0.4 — presumably 0.4 is the fraction that gets blanked out; confirm against
# de.only_some_records.
# The return value is discarded, so the helper is expected to edit
# users_complete in place — TODO confirm.
de.only_some_records(0.4, users_complete, 'company_id', 'company')

In [None]:
# Snapshot users_complete under the name raw_users_complete. The snapshot is
# what gets cast and loaded into the raw_users table, while users_complete
# itself feeds the users/companies dimension tables later in the notebook.
raw_users_complete = users_complete.copy()

In [None]:
# Object-typed columns of the raw users frame; each one is cast to str so the
# frame can be loaded into the SQL table.
list_columns = [
    'name', 'username', 'email', 'address',
    'phone', 'website', 'company',
]
for column_name in list_columns:
    raw_users_complete = tr.cast_column_to_dtype(raw_users_complete, column_name, str)
    

In [None]:
# Load raw_users_complete into the raw_users table of the SQLite file ./dd_test.db.
db.insert_table_sql(raw_users_complete, 'raw_users', "./dd_test.db")

### Bookings data

In [None]:
# Add a booking code to every booking.
# NOTE(review): whether create_booking_id returns a new frame or the same
# raw_bookings object (modified) is not visible here — confirm before relying
# on raw_bookings staying untouched.
raw_bookings_complete = de.create_booking_id(raw_bookings)

In [None]:
# Add a 'user_id' column to every booking. The third argument is the number of
# rows in raw_users_complete — presumably the upper bound for the generated ids,
# so that every booking points at an existing user (TODO confirm).
# The return value is discarded, so de.create_id is expected to modify
# raw_bookings_complete in place.
de.create_id(raw_bookings_complete, 'user_id', len(raw_users_complete))

In [None]:
# Object-typed columns of the raw bookings frame; each one is cast to str so
# the frame can be loaded into the SQL table.
list_columns = [
    'booking_code',
    'hotel',
    'arrival_date_month',
    'meal',
    'country',
    'reserved_room_type',
    'assigned_room_type',
    'reservation_status',
    'reservation_status_date',
]
for column_name in list_columns:
    raw_bookings_complete = tr.cast_column_to_dtype(raw_bookings_complete, column_name, str)

In [None]:
# Load raw_bookings_complete into the raw_bookings table of the SQLite file ./dd_test.db.
db.insert_table_sql(raw_bookings_complete, 'raw_bookings', "./dd_test.db")

## -Save raw data-

In [None]:
# Persist both enriched raw frames as CSV files (index column omitted),
# bookings first, then users.
for frame, csv_name in (
    (raw_bookings_complete, 'raw_bookings_extended.csv'),
    (raw_users_complete, 'raw_users_extended.csv'),
):
    frame.to_csv(csv_name, index=False)

In [None]:
# Display the enriched raw users frame for a visual check.
# NOTE(review): the frame holds names/emails/phones — clear this output before sharing.
raw_users_complete

## -Create dimension tables- 

In [None]:
# Build the agents dimension table. No inputs are passed, so its contents are
# defined entirely inside dim_tables.create_dim_agent_table.
DIM_AGENTS = dim.create_dim_agent_table()

In [None]:
# Load DIM_AGENTS into the DIM_AGENTS table of the SQLite file ./dd_test.db.
db.insert_table_sql(DIM_AGENTS, 'DIM_AGENTS', "./dd_test.db")

In [None]:
# Build the countries dimension table. No inputs are passed, so its contents
# are defined entirely inside dim_tables.create_dim_country_table.
DIM_COUNTRIES = dim.create_dim_country_table()

In [None]:
# Load DIM_COUNTRIES into the DIM_COUNTRIES table of the SQLite file ./dd_test.db.
db.insert_table_sql(DIM_COUNTRIES, 'DIM_COUNTRIES', "./dd_test.db")

In [None]:
# Build the meals dimension table. No inputs are passed, so its contents are
# defined entirely inside dim_tables.create_dim_meal_table.
DIM_MEALS = dim.create_dim_meal_table()

In [None]:
# Load DIM_MEALS into the DIM_MEALS table of the SQLite file ./dd_test.db.
db.insert_table_sql(DIM_MEALS, 'DIM_MEALS', "./dd_test.db")

In [None]:
# Build the hotels dimension table. No inputs are passed, so its contents are
# defined entirely inside dim_tables.create_dim_hotel_table. This frame is also
# used further down to replace hotel names with ids in the bookings.
DIM_HOTELS = dim.create_dim_hotel_table()

In [None]:
# Load DIM_HOTELS into the DIM_HOTELS table of the SQLite file ./dd_test.db.
db.insert_table_sql(DIM_HOTELS, 'DIM_HOTELS', "./dd_test.db")

In [None]:
# Build the users dimension from users_complete. The "_WT" suffix stands for
# "without transformation": the Transformation section derives DIM_USERS from it.
DIM_USERS_WT = dim.create_dim_users_table(users_complete)

In [None]:
# Build the companies dimension from users_complete. The "_WT" suffix stands
# for "without transformation": the Transformation section derives
# DIM_COMPANIES from it.
DIM_COMPANIES_WT = dim.create_dim_companies_table(users_complete)

## -Transformation-

### Users

In [None]:
# Drop the 'username' column, which is not needed in the users dimension.
DIM_USERS = tr.delete_columns(DIM_USERS_WT, ['username'])

In [None]:
# Lowercase the 'email' and 'website' columns.
DIM_USERS = tr.lowercase(DIM_USERS, ['email', 'website'])

In [None]:
# Replace the column names with the standardized upper-case names listed here.
# NOTE(review): presumably the names are applied positionally, so the list order
# must match the current column order of DIM_USERS — confirm against
# tr.capitalize_rename_columns.
DIM_USERS =  tr.capitalize_rename_columns(DIM_USERS, ['USER_ID', 'USER_NAME', 'USER_EMAIL','USER_ADDRESS', 'USER_PHONE', 'USER_WEBSITE', 'COMPANY_ID'])

In [None]:
# Cast the COMPANY_ID column to str (the previous comment said "int", but the
# dtype passed is str).
# NOTE(review): DIM_COMPANIES casts the same key to 'Int64' — confirm that the
# differing dtypes between the two tables are intended.
DIM_USERS = tr.cast_column_to_dtype(DIM_USERS,'COMPANY_ID', str)

### Companies

In [None]:
# Remove rows containing NaN values from the untransformed companies dimension.
DIM_COMPANIES = tr.drop_nan_rows(DIM_COMPANIES_WT)

In [None]:
# Normalize the 'company' column.
# NOTE(review): what "normalize" does is defined in tr.normalize_column; since a
# later cell drops 'catchPhrase' and 'bs', it presumably expands the nested
# company record into separate columns — confirm.
DIM_COMPANIES = tr.normalize_column(DIM_COMPANIES, ['company'])

In [None]:
# Drop the 'catchPhrase' and 'bs' columns, which are not needed for the analysis.
DIM_COMPANIES = tr.delete_columns(DIM_COMPANIES, ['catchPhrase', 'bs'])

In [None]:
# Replace the column names with the standardized upper-case names listed here.
# NOTE(review): presumably the names are applied positionally, so the list order
# must match the current column order of DIM_COMPANIES — confirm against
# tr.capitalize_rename_columns.
DIM_COMPANIES = tr.capitalize_rename_columns(DIM_COMPANIES, columns = ['COMPANY_ID', 'USER_ID','COMPANY_NAME'])

In [None]:
# Lowercase the values of the COMPANY_NAME column.
DIM_COMPANIES = tr.lowercase(DIM_COMPANIES, ['COMPANY_NAME'])

In [None]:
# Cast COMPANY_ID to an integer column; 'Int64' (capital I) is the name of
# pandas' nullable integer dtype.
DIM_COMPANIES = tr.cast_column_to_dtype(DIM_COMPANIES,'COMPANY_ID', 'Int64')

In [None]:
# Order the companies by id, then renumber the rows from 0 so the index
# follows the sorted order.
companies_by_id = DIM_COMPANIES.sort_values(by='COMPANY_ID')
DIM_COMPANIES = companies_by_id.reset_index(drop=True)

In [None]:
# Load DIM_COMPANIES into the DIM_COMPANIES table of the SQLite file ./dd_test.db.
db.insert_table_sql(DIM_COMPANIES, 'DIM_COMPANIES', "./dd_test.db")

### Bookings

In [None]:
# Replace the hotel names in the bookings with their ids from DIM_HOTELS.
# NOTE(review): argument roles are inferred from their names ('hotel' = bookings
# column, 'HOTEL_ID' / 'HOTEL_NAME' = DIM_HOTELS columns) — confirm against
# tr.map_dimension_table. It is also not visible whether BOOKINGS is a new frame
# or the same object as raw_bookings_complete.
BOOKINGS = tr.map_dimension_table(raw_bookings_complete,DIM_HOTELS,'hotel','HOTEL_ID','HOTEL_NAME')

In [None]:
# Cast the 'agent' column to str (the previous comment said "int", but the
# dtype passed is str).
BOOKINGS = tr.cast_column_to_dtype(BOOKINGS,'agent', str)

In [None]:
# Create an arrival date column from the day / month / year columns.
# The return value is discarded, so this relies on tr.create_arrival_date adding
# the column to BOOKINGS in place — TODO confirm.
tr.create_arrival_date(BOOKINGS, 'arrival_date_day_of_month', 'arrival_date_month', 'arrival_date_year')

In [None]:
# Create a departure date column.
# The return value is discarded, so this relies on tr.get_departure_date adding
# the column to BOOKINGS in place — TODO confirm. Which input columns it reads
# is not visible here.
tr.get_departure_date(BOOKINGS)

In [None]:
# Create a reservation date column.
# The return value is discarded, so this relies on tr.reservation_date adding
# the column to BOOKINGS in place — TODO confirm. Which input columns it reads
# is not visible here.
tr.reservation_date(BOOKINGS)

In [None]:
# Columns that are not needed for the analysis. The date-part and stay-length
# columns are dropped only here, after the arrival / departure / reservation
# date cells above have run.
drop_columns = [
    'is_canceled',
    'lead_time',
    'arrival_date_year',
    'arrival_date_month',
    'arrival_date_day_of_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
]
BOOKINGS = tr.delete_columns(BOOKINGS, drop_columns)

In [None]:
# Standardized upper-case names for the BOOKINGS columns.
# NOTE(review): presumably applied positionally, so the order must match the
# current column order of BOOKINGS — confirm against tr.capitalize_rename_columns.
# NOTE(review): only 'hotel' is mapped through a dimension table in this
# notebook; meal / country / agent become *_ID columns without a visible
# mapping step — confirm that this is intended.
rename_columns = [
    'BOOKING_CODE',
    'HOTEL_ID',
    'ADULTS',
    'CHILDREN',
    'MEAL_ID',
    'COUNTRY_ID',
    'RESERVED_ROOM_TYPE_ID',
    'ASSIGNED_ROOM_TYPE_ID',
    'AGENT_ID',
    'STATUS',
    'LAST_UPDATED_AT',
    'USER_ID',
    'ARRIVAL_DATE',
    'DEPARTURE_DATE',
    'RESERVATION_DATE',
]
BOOKINGS = tr.capitalize_rename_columns(BOOKINGS, rename_columns)

In [None]:
# Parse LAST_UPDATED_AT into datetimes; the values are expected in
# day/month/year form (e.g. 31/12/2017), as given by the explicit format.
BOOKINGS['LAST_UPDATED_AT'] = pd.to_datetime(BOOKINGS['LAST_UPDATED_AT'], format='%d/%m/%Y')

In [None]:
# Final column order of BOOKINGS: booking code and foreign keys first, then the
# booking attributes, then status and dates.
reorder_columns = [
    'BOOKING_CODE', 'USER_ID', 'COUNTRY_ID', 'AGENT_ID', 'HOTEL_ID',
    'ADULTS', 'CHILDREN', 'MEAL_ID',
    'RESERVED_ROOM_TYPE_ID', 'ASSIGNED_ROOM_TYPE_ID',
    'STATUS', 'LAST_UPDATED_AT',
    'ARRIVAL_DATE', 'DEPARTURE_DATE', 'RESERVATION_DATE',
]

# Select every row and exactly these columns, in this order.
BOOKINGS = BOOKINGS.loc[:, reorder_columns]

In [None]:
# Load BOOKINGS into the BOOKINGS table of the SQLite file ./dd_test.db.
db.insert_table_sql(BOOKINGS, 'BOOKINGS', "./dd_test.db")

## -Clean data- 

In [None]:
# Add a URL scheme prefix to the websites that do not have one, by applying
# cl.add_https to every value of USER_WEBSITE.
# NOTE(review): the previous comment said 'http://' while the helper is named
# add_https — confirm which prefix clean.add_https actually adds.
DIM_USERS['USER_WEBSITE'] = DIM_USERS['USER_WEBSITE'].apply(cl.add_https)

In [None]:
# Remove the literal substring 'mrs. ' (lowercase, with trailing space) from
# USER_NAME; regex=False makes the '.' a plain dot rather than a wildcard.
# NOTE(review): the match is case-sensitive and USER_NAME is not lowercased in
# this notebook (only email / website are), so a capitalized 'Mrs. ' would not
# be removed — confirm the casing of the names.
DIM_USERS['USER_NAME'] = DIM_USERS['USER_NAME'].str.replace('mrs. ','', regex = False)

In [None]:
# Cast the text columns of DIM_USERS to str so the frame can be loaded into the
# SQL table.
# The result is assigned back to DIM_USERS — the frame that the next cell
# inserts. Previously it was assigned to raw_users_complete, which (a) left
# DIM_USERS without the casts unless the helper happens to mutate in place, and
# (b) overwrote the raw users frame with dimension data.
list_columns = ['USER_NAME', 'USER_EMAIL', 'USER_ADDRESS', 'USER_PHONE', 'USER_WEBSITE', 'COMPANY_ID']
for column_name in list_columns:
    DIM_USERS = tr.cast_column_to_dtype(DIM_USERS, column_name, str)

In [None]:
# Load DIM_USERS into the DIM_USERS table of the SQLite file ./dd_test.db.
db.insert_table_sql(DIM_USERS, 'DIM_USERS', "./dd_test.db")