# Project 1 - Data Warehousing | Marco Rossini (s291482)

## Data import and connection with the database

In [None]:
import pandas as pd

### Import the input tables

Read from URL

In [None]:
# Download the three dimension tables and the fact table as DataFrames.
# All four CSV files live under the same remote directory.
_CSV_ROOT = "https://marcorossini.altervista.org/projects/master/advanced-databases/project-work-1/"

dim_restaurant = pd.read_csv(_CSV_ROOT + "dim_restaurant.csv")
dim_time = pd.read_csv(_CSV_ROOT + "dim_time.csv")
dim_category = pd.read_csv(_CSV_ROOT + "dim_category.csv")
fact_delivery = pd.read_csv(_CSV_ROOT + "fact_delivery.csv")

### Create a local SQLite database.

The `create_engine()` function takes the database URL as its first positional argument, optionally followed by keyword arguments that configure the connection. The code below creates an engine bound to a local SQLite database file.


In [None]:
from sqlalchemy import create_engine
# "sqlite:///<name>" (three slashes) is a relative file path, so the database
# file deliveries.db is resolved against the current working directory.
engine = create_engine(url="sqlite:///deliveries.db")

### Write records stored in the dataframe to the SQL database.


After creating the engine, we create the tables and load the data in one step by calling the pandas `DataFrame.to_sql` method on each dataframe.

In [None]:
# Store each DataFrame as a table with the same name as its variable.
# if_exists="replace" drops a table left over from a previous run, and
# index=False keeps the DataFrame index out of the stored columns.
_write_opts = dict(con=engine, if_exists="replace", index=False)

dim_restaurant.to_sql("dim_restaurant", **_write_opts)
dim_time.to_sql("dim_time", **_write_opts)
dim_category.to_sql("dim_category", **_write_opts)
fact_delivery.to_sql("fact_delivery", **_write_opts)

In [None]:
# Preview the first five rows of the restaurant dimension.
dim_restaurant.head(n=5)

Unnamed: 0,RestaurantID,Restaurant,Address,City,Province,Region
0,0,Locanda La Lina,Canale Giovanna 386,Samo,Reggio Calabria (RC),Calabria
1,1,Ristorante I Fondi,Canale Giacinto 78,Diano d'Alba,Cuneo (CN),Piemonte
2,2,Bellezza,Via Gaspare 4,Mortara,Pavia (PV),Lombardia
3,3,Ciccio Marina,Via Corradi 10,Ciriè,Torino (TO),Piemonte
4,4,Enoteca Velia,Viale Sabatino 7,San Giacomo delle Segnate,Mantova (MN),Lombardia


In [None]:
# Preview the first five rows of the time dimension.
dim_time.head(n=5)

Unnamed: 0,TimeID,Date,Weekday,Holiday,Month,Semester,Year
0,0,2020-05-30,Saturday,False,2020-05,1,2020
1,1,2019-12-30,Monday,False,2019-12,2,2019
2,2,2020-03-03,Tuesday,False,2020-03,1,2020
3,3,2021-01-02,Saturday,False,2021-01,1,2021
4,4,2020-07-13,Monday,False,2020-07,2,2020


In [None]:
# Preview the first five rows of the category dimension.
dim_category.head(n=5)

Unnamed: 0,CategoryID,Category
0,0,Indian
1,1,Italian
2,2,Pizzeria
3,3,Chinese/Japanese
4,4,Other


In [None]:
# Preview the first five rows of the delivery fact table.
fact_delivery.head(n=5)

Unnamed: 0,RestaurantID,CategoryID,TimeID,PaymentMethod,TransportMode,TotalRevenue,TotalDeliveryTime,NumberOfDeliveries
0,4,1,349,Satispay,Scooter,30.3,12,2
1,0,4,143,Cash,Scooter,67.9,24,6
2,4,1,699,Bancomat,Car,107.5,31,9
3,2,1,958,Satispay,Bike,47.7,17,8
4,4,1,920,Satispay,Car,88.7,34,11



## Querying the Data Warehouse

### Query A

*\"For each day, select the total revenue and the average revenue per delivery. Sort the result by date.\"*

In [None]:
# Query A: one row per date with the summed revenue and the revenue per
# delivery (summed revenue divided by the summed number of deliveries),
# ordered by date. Adjacent string literals are concatenated by Python, so
# the SQL text is the same as a single long literal.
query_a = (
    "SELECT Date, SUM(TotalRevenue) as TotalRevenue, SUM(TotalRevenue) / SUM(NumberOfDeliveries) as AverageRevenuePerDelivery "
    "          FROM fact_delivery F, dim_time T "
    "          WHERE F.TimeID = T.TimeID "
    "          GROUP BY Date "
    "          ORDER BY Date"
)

# Execute Query A on the SQLite engine and show the result as a DataFrame.
pd.read_sql_query(sql=query_a, con=engine)

Unnamed: 0,Date,TotalRevenue,AverageRevenuePerDelivery
0,2019-10-25,15572.5,8.542238
1,2019-10-26,4603.8,8.686415
2,2019-10-27,8153.7,8.882026
3,2019-10-29,3656.5,8.235360
4,2019-10-30,12040.1,8.686941
...,...,...,...
416,2021-02-28,12791.8,8.773525
417,2021-03-01,8874.3,8.443673
418,2021-03-02,3803.7,8.949882
419,2021-03-04,8248.3,8.646017


### Query B


*\"Select the yearly revenue and the total number of deliveries for each restaurant. Sort the results by descending yearly revenue.\"*

In [None]:
# Query B: for every (restaurant name, year) pair, the summed revenue and the
# summed number of deliveries, with the highest yearly revenue first.
# Adjacent string literals are concatenated by Python into one SQL statement.
query_b = (
    "SELECT Restaurant, Year, SUM(TotalRevenue) as YearlyRevenue, SUM(NumberOfDeliveries) as TotalNumberOfDeliveries "
    "          FROM fact_delivery F, dim_time T, dim_restaurant R "
    "          WHERE F.RestaurantID = R.RestaurantID AND F.TimeID = T.TimeID "
    "          GROUP BY Restaurant, Year "
    "          ORDER BY YearlyRevenue DESC"
)

# Execute Query B on the SQLite engine and show the result as a DataFrame.
pd.read_sql_query(sql=query_b, con=engine)

Unnamed: 0,Restaurant,Year,YearlyRevenue,TotalNumberOfDeliveries
0,Locanda La Lina,2020,619678.9,69868
1,Ciccio Marina,2020,616698.9,70120
2,Enoteca Velia,2020,613680.8,69160
3,Bellezza,2020,612936.5,69648
4,Ristorante I Fondi,2020,608939.0,68726
5,Locanda La Lina,2019,121695.3,13701
6,Bellezza,2019,120332.0,13698
7,Enoteca Velia,2019,119796.6,13518
8,Ciccio Marina,2019,119220.5,13421
9,Ristorante I Fondi,2019,118782.7,13175


### Query C


*\"Separately for each transport mode and year, select the total number of deliveries and the average time for delivery.\"*

In [None]:
# Query C: for every (transport mode, year) pair, the summed number of
# deliveries and the average time per delivery (summed delivery time divided
# by the summed number of deliveries).
#
# Both sums are integers, and SQLite's "/" truncates when both operands are
# integers, which collapsed every average to a whole number. Casting the
# numerator to REAL keeps the fractional part.
query_c = (
    "SELECT TransportMode, Year, "
    "SUM(NumberOfDeliveries) as TotalNumberOfDeliveries, "
    "CAST(SUM(TotalDeliveryTime) AS REAL) / SUM(NumberOfDeliveries) as AverageDeliveryTime "
    "FROM fact_delivery F, dim_time T "
    "WHERE F.TimeID = T.TimeID "
    "GROUP BY TransportMode, Year"
)

# Execute Query C on the SQLite engine and show the result as a DataFrame.
pd.read_sql_query(sql=query_c, con=engine)

Unnamed: 0,TransportMode,Year,TotalNumberOfDeliveries,AverageDeliveryTime
0,Bike,2019,28225,2
1,Bike,2020,145404,2
2,Bike,2021,24967,2
3,Car,2019,12921,2
4,Car,2020,66158,2
5,Car,2021,10881,2
6,Scooter,2019,26367,2
7,Scooter,2020,135960,2
8,Scooter,2021,23525,2


### Query D


*\"Consider only the deliveries with “bike” as transport mode. Separately for each month and restaurant, select the total revenue and the average delivery time.\"*

In [None]:
# Query D: restricted to fact rows whose TransportMode is 'Bike'; for every
# (month, restaurant name) pair, the summed revenue and the average time per
# delivery (summed delivery time divided by the summed number of deliveries).
#
# Both sums in the average are integers, and SQLite's "/" truncates when both
# operands are integers, which collapsed every average to a whole number.
# Casting the numerator to REAL keeps the fractional part.
query_d = (
    "SELECT Month, Restaurant, "
    "SUM(TotalRevenue) as TotalRevenue, "
    "CAST(SUM(TotalDeliveryTime) AS REAL) / SUM(NumberOfDeliveries) as AverageDeliveryTime "
    "FROM fact_delivery F, dim_time T, dim_restaurant R "
    "WHERE F.TimeID = T.TimeID AND F.RestaurantID = R.RestaurantID AND F.TransportMode = 'Bike' "
    "GROUP BY Month, Restaurant"
)

# Execute Query D on the SQLite engine and show the result as a DataFrame.
pd.read_sql_query(sql=query_d, con=engine)

Unnamed: 0,Month,Restaurant,TotalRevenue,AverageDeliveryTime
0,2019-10,Bellezza,5465.2,2
1,2019-10,Ciccio Marina,5049.2,2
2,2019-10,Enoteca Velia,4717.5,2
3,2019-10,Locanda La Lina,4939.9,2
4,2019-10,Ristorante I Fondi,5063.9,2
...,...,...,...,...
85,2021-03,Bellezza,3321.1,2
86,2021-03,Ciccio Marina,2234.9,2
87,2021-03,Enoteca Velia,1936.1,2
88,2021-03,Locanda La Lina,2571.3,2


### Query E


*\"Separately for date and transport mode, select the total revenue and the maximum delivery time.\"*

In [None]:
# Query E: for every (date, transport mode) pair, the summed revenue and the
# largest TotalDeliveryTime value among the matching fact rows.
# NOTE(review): fact rows carry a NumberOfDeliveries count, so each row
# presumably aggregates several deliveries; MAX(TotalDeliveryTime) would then
# be the largest per-row total rather than one delivery's time. Confirm this
# is the intended measure.
query_e = (
    "SELECT Date, TransportMode, SUM(TotalRevenue) as TotalRevenue, MAX(TotalDeliveryTime) as MaxDeliveryTime "
    "          FROM fact_delivery F, dim_time T "
    "          WHERE F.TimeID = T.TimeID "
    "          GROUP BY Date, TransportMode"
)

# Execute Query E on the SQLite engine and show the result as a DataFrame.
pd.read_sql_query(sql=query_e, con=engine)

Unnamed: 0,Date,TransportMode,TotalRevenue,MaxDeliveryTime
0,2019-10-25,Bike,6228.5,65
1,2019-10-25,Car,2673.0,67
2,2019-10-25,Scooter,6671.0,64
3,2019-10-26,Bike,2098.6,58
4,2019-10-26,Car,1037.6,55
...,...,...,...,...
1258,2021-03-04,Car,1407.0,59
1259,2021-03-04,Scooter,3569.6,67
1260,2021-03-06,Bike,3634.1,62
1261,2021-03-06,Car,1997.7,62


### Query F


*\"Separately for each month, select the total revenue and the average daily revenue.\"*

In [None]:
# Query F: for every month, the summed revenue and the average daily revenue,
# computed as the summed revenue divided by the number of distinct dates that
# have at least one fact row in that month.
query_f = (
    "SELECT Month, SUM(TotalRevenue) as TotalRevenue, SUM(TotalRevenue) / COUNT(DISTINCT Date) as AverageDailyRevenue "
    "          FROM fact_delivery F, dim_time T "
    "          WHERE F.TimeID = T.TimeID "
    "          GROUP BY Month"
)

# Execute Query F on the SQLite engine and show the result as a DataFrame.
pd.read_sql_query(sql=query_f, con=engine)

Unnamed: 0,Month,TotalRevenue,AverageDailyRevenue
0,2019-10,59999.6,9999.933333
1,2019-11,235066.2,9794.425
2,2019-12,304761.3,11287.455556
3,2020-01,284280.8,10933.876923
4,2020-02,173726.1,8272.671429
5,2020-03,301780.7,10406.231034
6,2020-04,277653.2,9574.248276
7,2020-05,230276.7,8856.796154
8,2020-06,267351.6,9219.02069
9,2020-07,212320.1,10110.480952


## Join for Google Data Studio

In [None]:
# Flatten the schema into one wide table: every fact row joined to its
# restaurant, time and category rows, keeping the descriptive columns and the
# measures while leaving out the surrogate ID columns.
query_join = (
    "SELECT Restaurant, Address, City, Province, Region, Date, Weekday, Holiday, Month, Semester, Year, Category, PaymentMethod, TransportMode, TotalRevenue, TotalDeliveryTime, NumberOfDeliveries "
    "              FROM fact_delivery F, dim_restaurant R, dim_time T, dim_category C "
    "              WHERE F.RestaurantID = R.RestaurantID AND F.CategoryID = C.CategoryID AND F.TimeID = T.TimeID"
)

In [None]:
# Materialise the joined table as a DataFrame.
df = pd.read_sql_query(sql=query_join, con=engine)

In [None]:
# Preview the first five rows of the joined table.
df.head(n=5)

Unnamed: 0,Restaurant,Address,City,Province,Region,Date,Weekday,Holiday,Month,Semester,Year,Category,PaymentMethod,TransportMode,TotalRevenue,TotalDeliveryTime,NumberOfDeliveries
0,Enoteca Velia,Viale Sabatino 7,San Giacomo delle Segnate,Mantova (MN),Lombardia,2020-09-08,Tuesday,0,2020-09,2,2020,Italian,Satispay,Scooter,30.3,12,2
1,Locanda La Lina,Canale Giovanna 386,Samo,Reggio Calabria (RC),Calabria,2020-03-17,Tuesday,0,2020-03,1,2020,Other,Cash,Scooter,67.9,24,6
2,Enoteca Velia,Viale Sabatino 7,San Giacomo delle Segnate,Mantova (MN),Lombardia,2020-03-19,Thursday,0,2020-03,1,2020,Italian,Bancomat,Car,107.5,31,9
3,Bellezza,Via Gaspare 4,Mortara,Pavia (PV),Lombardia,2020-07-22,Wednesday,0,2020-07,2,2020,Italian,Satispay,Bike,47.7,17,8
4,Enoteca Velia,Viale Sabatino 7,San Giacomo delle Segnate,Mantova (MN),Lombardia,2021-01-28,Thursday,0,2021-01,1,2021,Italian,Satispay,Car,88.7,34,11


In [None]:
# Export the joined table to a CSV file in the working directory;
# index=False leaves the DataFrame index out of the file.
df.to_csv(path_or_buf="./deliveries_joined.csv", index=False)

The joined table is used in Google Data Studio for visualization purposes. The related report is available at this [link](https://datastudio.google.com/reporting/e0fd2c8b-c713-4cf3-a9fe-705bc2f8ca6a).