In [1]:
import gc
import warnings
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm

from pyhive import presto

In [2]:
warnings.filterwarnings('ignore')

## Riyadh (MOP, RH intents)

### Create Shape

In [None]:
from geo_tool import create_polygons
from geo_tool.preprocess import GeoAggregator

In [None]:
# Bounding box used for the Riyadh grid.
# NOTE: these are not the settings that were used initially; the values kept
# from the earlier version are lat_min=24.322995193443617 and
# lon_min=46.31937651439429.
setup = dict(
    lat_min=24.4,
    lon_min=46.4,
    lat_max=25.102463220881223,
    lon_max=47.08501151812489,
)

In [None]:
# Build the square grid starting from the bounding box's (lat_min, lon_min) corner.
# NOTE(review): 4000 is presumably the square size and 18 (written as 36 // 2
# originally) the number of squares per side — confirm against
# geo_tool.create_polygons.
squares = create_polygons(setup['lat_min'], setup['lon_min'], 4000, 18)

In [None]:
# Inspect the geometry of the last square as WKT text.
squares.geometry.iloc[-1].wkt

In [None]:
# The grid's starting point is its lower-left corner.

# Visual check of the generated squares.
squares.explore()

In [None]:
# Persist the Riyadh square grid to Parquet.
squares.to_parquet('data/riyadh_squares.pq')

In [None]:
# Side length of the grid; this relies on the number of squares being a
# perfect square (the reshape below fails otherwise).
n = int(np.sqrt(squares.shape[0]))

# Lay the square ids out as an n x n matrix, then rotate it by 90 degrees
# counter-clockwise.
ids_mat = np.rot90(squares['square_id'].values.reshape((n, n)), k=1)

In [None]:
# Display the rotated square-id matrix.
ids_mat

In [None]:
# Store the id matrix in NumPy's binary .npy format.
np.save('data/riyadh_squares.npy', ids_mat)

In [None]:
# Plain pandas table holding only the square ids (no geometry column).
pd_squares = pd.DataFrame(dict(square_id=squares['square_id'].values))

In [None]:
# Persist the id-only table to Parquet.
pd_squares.to_parquet('data/pd_squares.pq')

### Load Data

**Warning:** loading the data runs one Presto query per day in `date_range`, so this step can take a long time.

In [None]:
from queries import get_locations_data

In [None]:
# Open the Presto connection that the data-loading queries below run on.
conn = presto.connect(
    host='presto-python-r-script-cluster.careem-engineering.com',
    port=8080,
    username='presto_python_r',
)

In [None]:
# One 'YYYY-MM-DD' string per day to load (start and end are both included).
# Formatting goes through strftime rather than splitting str(Timestamp), so the
# result does not depend on how a Timestamp happens to print.
# Alternative start kept from the original cell: start='2022-09-01'.
date_range = (
    pd.date_range(start='2023-05-01', end='2023-05-01', freq='D')
    .strftime('%Y-%m-%d')
    .tolist()
)

len(date_range)

In [None]:
# For every day, keep both the frame returned by the query and the result of
# running it through the geo aggregator.
orders_geo_agg = GeoAggregator(squares, setup)
raw_frames = []
agg_frames = []

for date in tqdm(date_range):
    # NOTE(review): the second argument (3) is presumably an area/service
    # identifier — confirm against queries.get_locations_data.
    day_raw = pd.read_sql(sql=get_locations_data(date, 3), con=conn)
    raw_frames.append(day_raw)
    agg_frames.append(orders_geo_agg.preprocess(day_raw, resampling_window='1H'))

# Stack the per-day frames; snapshots are ordered by timestamp, then square.
intents_df = pd.concat(raw_frames, ignore_index=True)
snapshots = (
    pd.concat(agg_frames, ignore_index=True)
    .sort_values(['ts', 'square_id'])
    .reset_index(drop=True)
)

# Drop the per-day intermediates and ask the collector to reclaim them.
del day_raw, agg_frames, raw_frames

_ = gc.collect()

In [None]:
# Number of rows in the aggregated snapshots table.
len(snapshots)

In [None]:
# Preview the first 20 snapshot rows.
snapshots.head(20)

In [None]:
# Persist both outputs: the concatenated per-day query results and the
# frames produced by GeoAggregator.preprocess.
intents_df.to_parquet('data/riyadh_data_med.pq')
snapshots.to_parquet('data/riyadh_snap_med.pq')

## Amman (MOT, Food orders)

In [3]:
from geo_tool import create_polygons
from geo_tool.preprocess import GeoAggregator

In [4]:
# Bounding box used for the Amman grid.
setup = dict(
    lat_min=31.79,
    lon_min=35.77,
    lat_max=32.1,
    lon_max=36.05,
)

In [5]:
# Build the Amman grid starting from the bounding box's (lat_min, lon_min) corner.
# NOTE(review): 2000 is presumably the square size and 18 the number of
# squares per side — confirm against geo_tool.create_polygons.
squares = create_polygons(
    setup['lat_min'],
    setup['lon_min'],
    2000,
    18,
)

In [6]:
# Visual check of the generated Amman squares.
squares.explore()

In [7]:
# Inspect the geometry of the last square as WKT text.
squares.geometry.iloc[-1].wkt

'POLYGON ((36.07542552865018 32.04924091963701, 36.09339183433257 32.04924091963701, 36.09339183433257 32.064467756048046, 36.07542552865018 32.064467756048046, 36.07542552865018 32.04924091963701))'

In [8]:
# Persist the Amman square grid to Parquet.
squares.to_parquet('data/amman_squares.pq')

In [9]:
# Side length of the grid; this relies on the number of squares being a
# perfect square (the reshape below fails otherwise).
n = int(np.sqrt(squares.shape[0]))

# Square ids as an n x n matrix, rotated by 90 degrees counter-clockwise.
ids_mat = np.rot90(squares['square_id'].values.reshape((n, n)), k=1)

# Store the id matrix in NumPy's binary .npy format.
np.save('data/amman_squares.npy', ids_mat)

### Load Data

In [10]:
from queries import get_food_demand

In [11]:
# Open the Presto connection used by the food-demand query below.
conn = presto.connect(
    host='presto-python-r-script-cluster.careem-engineering.com',
    port=8080,
    username='presto_python_r',
)

In [13]:
# Run the food-demand query over the Presto connection and load the result.
# NOTE(review): the two strings are presumably the start and end dates, and 47
# presumably an area/city identifier for Amman — confirm against
# queries.get_food_demand.
data = pd.read_sql(sql=get_food_demand('2022-11-01', '2023-06-10', 47), con=conn)

In [14]:
# Keep only rows whose pickup and drop-off coordinates are all strictly
# positive; a missing (NaN) coordinate fails the comparison, so those rows
# are dropped as well.
data = data.loc[
    data['pickup_latitude'].gt(0)
    & data['pickup_longitude'].gt(0)
    & data['drop_off_latitude'].gt(0)
    & data['drop_off_longitude'].gt(0)
]

In [15]:
# Persist the coordinate-filtered orders to Parquet.
data.to_parquet('data/amman.pq')

In [16]:
# Delivered food orders only, reduced to timestamp, customer and pickup
# location. Each row gets a constant `orders` value of 1, and rows with any
# missing value are dropped.
df = (
    data.loc[
        (data['order_type'] == 'food') & (data['order_status'] == 'delivered'),
        ['ts', 'customer_id', 'pickup_latitude', 'pickup_longitude'],
    ]
    .rename(columns={'pickup_latitude': 'latitude', 'pickup_longitude': 'longitude'})
    .assign(orders=1)
    .dropna(how='any')
)

In [17]:
# Run the order events through the geo aggregator built on the Amman grid.
# NOTE(review): 'customer_id' and 'orders' are presumably the id and value
# column names GeoAggregator should use — confirm against geo_tool.preprocess.
orders_geo_agg = GeoAggregator(squares, setup, 'customer_id', 'orders')
df_agg = orders_geo_agg.preprocess(df, resampling_window='1H')

In [18]:
# Rename `orders` to `intents`, order the rows by timestamp and square id,
# and write the result to Parquet.
(
    df_agg
    .rename(columns={'orders': 'intents'})
    .sort_values(by=['ts', 'square_id'])
    .to_parquet('data/amman_data.pq')
)