# Project 1 - Data Warehousing (Data Generation) | Marco Rossini (s291482)

## Install the required packages

In [None]:
pip install italian_holidays

In [None]:
pip install faker

## Import the required packages

In [None]:
import random

import faker
import italian_holidays
import numpy as np
import pandas as pd
import urllib.request

from faker import Faker
from italian_holidays import italian_holidays

## Define the required variables

In [None]:
# Holiday calendar used to flag the dates of the time dimension.
holidays = italian_holidays()

# Sizes of the generated tables.
restaurants_number = 5
dates_number = 1000
deliveries_number = 100000

# Faker instance configured with the Italian locale.
f = Faker('it_IT')


def _read_remote_lines(url, encoding='cp1252'):
    """Download *url* and return its decoded content split into lines.

    The response is used as a context manager so the underlying connection
    is closed as soon as the body has been read.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode(encoding).splitlines()


names_list = _read_remote_lines("https://marcorossini.altervista.org/projects/master/advanced-databases/project-work-1/names.txt")
addresses_list = _read_remote_lines("https://marcorossini.altervista.org/projects/master/advanced-databases/project-work-1/addresses.txt")
# One randomly chosen address record per restaurant (the same record may be
# picked more than once).
addresses = [random.choice(addresses_list) for _ in range(restaurants_number)]

# Random dates between 500 days ago and today; repeated dates are possible.
dates = [f.date_between(start_date='-500d', end_date='today') for _ in range(dates_number)]
categories = ['Indian', 'Italian', 'Pizzeria', 'Chinese/Japanese', 'Other']
payment_methods = ['Bancomat', 'Credit card', 'Cash', 'Satispay']
transport_modes = ['Bike', 'Scooter', 'Car']

## Define the required functions

In [None]:
def flatten(lst):
    """Yield the items of *lst*, expanding nested lists by one level.

    Only elements that are ``list`` instances are expanded; any other
    element (including tuples and deeper nesting) is yielded unchanged.
    """
    for item in lst:
        if not isinstance(item, list):
            yield item
            continue
        for inner in item:
            yield inner


def get_semester(date):
    """Return the semester of *date* as a string.

    Args:
        date: Any object exposing a ``month`` attribute in the range 1-12
            (for example ``datetime.date``).

    Returns:
        ``"1"`` for January through June, ``"2"`` for July through December.
    """
    # June (month 6) belongs to the first half of the year, so the upper
    # bound must be inclusive.
    if 1 <= date.month <= 6:
        return "1"
    return "2"


def run():
    """Generate the dimension and fact tables and write them as CSV files.

    Reads the module-level configuration (``restaurants_number``,
    ``dates_number``, ``deliveries_number``, ``names_list``, ``addresses``,
    ``dates``, ``categories``, ``payment_methods``, ``transport_modes``,
    ``f`` and ``holidays``) and writes these files to the current working
    directory:

    * ``dim_restaurant.csv``
    * ``dim_time.csv``
    * ``dim_category.csv``
    * ``restaurants_numbers.csv`` (restaurants plus their category number)
    * ``fact_delivery.csv``

    Returns:
        None.
    """
    # Restaurant
    # Assumes every address record has at least three ";"-separated fields,
    # used here as City (0), Region (1) and Province (2) -- TODO confirm
    # against the layout of addresses.txt.
    location_fields = [record.split(";") for record in addresses[:restaurants_number]]
    dim_restaurant = pd.DataFrame({
        'RestaurantID': list(range(restaurants_number)),
        'Restaurant': names_list[:restaurants_number],
        # Keep only the first three whitespace-separated tokens of the
        # generated address.
        'Address': [" ".join(f.address().split()[:3]) for _ in range(restaurants_number)],
        'City': [fields[0] for fields in location_fields],
        'Province': [fields[2] for fields in location_fields],
        'Region': [fields[1] for fields in location_fields],
    })
    dim_restaurant.to_csv("dim_restaurant.csv", index=False)

    # Time
    # NOTE(review): `dates` is drawn at random, so the same calendar day can
    # appear under several TimeIDs -- confirm this is intended.
    time_dates = dates[:dates_number]
    dim_time = pd.DataFrame({
        'TimeID': list(range(dates_number)),
        'Date': time_dates,
        'Weekday': [day.strftime("%A") for day in time_dates],
        'Holiday': [holidays.is_holiday(str(day)) for day in time_dates],
        'Month': [day.strftime('%Y-%m') for day in time_dates],
        'Semester': [get_semester(day) for day in time_dates],
        'Year': [day.year for day in time_dates],
    })
    dim_time.to_csv("dim_time.csv", index=False)

    # Category
    dim_category = pd.DataFrame({
        'CategoryID': list(range(len(categories))),
        'Category': list(categories),
    })
    dim_category.to_csv("dim_category.csv", index=False)

    # Assign every restaurant a random category. The upper bound follows the
    # actual number of categories so the IDs stay valid if the list changes.
    restaurants_numbers = dim_restaurant.copy()
    restaurants_numbers['CategoryNumber'] = np.random.randint(
        0, len(categories), restaurants_numbers.shape[0])
    restaurants_numbers.to_csv("restaurants_numbers.csv", index=False)

    # Draw the restaurant of each delivery row (with replacement); the
    # category of the row is the one assigned to that restaurant.
    restaurants_sample = restaurants_numbers.sample(n=deliveries_number, replace=True)
    restaurant_ids = restaurants_sample['RestaurantID'].tolist()
    category_ids = restaurants_sample['CategoryNumber'].tolist()
    # Revenue between 10.0 and 200.0 in steps of 0.1.
    revenue = [random.randint(100, 2000) / 10 for _ in range(deliveries_number)]

    # Delivery
    fact_delivery = pd.DataFrame({
        'RestaurantID': restaurant_ids,
        'CategoryID': category_ids,
        'TimeID': dim_time.sample(n=deliveries_number, replace=True)['TimeID'].tolist(),
        'PaymentMethod': random.choices(
            payment_methods, [0.2, 0.5, 0.2, 0.1], k=deliveries_number),
        'TransportMode': random.choices(
            transport_modes, [0.5, 0.4, 0.1], k=deliveries_number),
        'TotalRevenue': revenue,
        # Delivery time and delivery count are drawn from ranges that grow
        # with the revenue of the row.
        'TotalDeliveryTime': [random.randint(int(r / 5), int(r / 3)) + 5 for r in revenue],
        'NumberOfDeliveries': [random.randint(int(r / 5 / 4), int(r / 3 / 2)) + 1 for r in revenue],
    # Keep a single row (the last generated one) per combination of
    # restaurant, time, payment method and transport mode.
    }).drop_duplicates(['RestaurantID', 'TimeID', 'PaymentMethod', 'TransportMode'], keep='last')
    fact_delivery.to_csv("fact_delivery.csv", index=False)

## Run the data generation

In [None]:
# Build every table and write the CSV files to the current working directory.
run()