In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sshtunnel import SSHTunnelForwarder
import psycopg2 as psy

In [2]:
import math

In [3]:
def get_conn(SSH_required,key_path):   #for getting a connection as a result

    db='datawarehouse'
    DB_HOST='datawarehouse.cdgpvetprks3.ap-south-1.rds.amazonaws.com'
    conn = []
    if SSH_required == 'Yes':
        SSH_HOST='ec2-15-206-161-154.ap-south-1.compute.amazonaws.com'
        #LOCALHOST="0.0.0.0"
        ssh_tunnel= SSHTunnelForwarder(
                (SSH_HOST, 22),
                ssh_username="ec2-user",
                ssh_private_key= key_path,
                ssh_private_key_password= "",
                remote_bind_address=(DB_HOST, 5432),
                local_bind_address=('127.0.0.1', 0)
        )
        print('Tunnel Started')
        ssh_tunnel.start()
        conn = psy.connect(
            host=ssh_tunnel.local_bind_host,
            port=ssh_tunnel.local_bind_port,
            user='postgres',
            password= "Simply1234",
            database='postgres')
        print('Connection Made')
        return conn
    else:
        conn = psy.connect(
            host = DB_HOST,
            port = 5432,
            user = 'postgres',
            password= "Simply1234",
            database='postgres')
        print('Connection Made')
        return conn

In [4]:
def get_df_from_sql(SSH_required, query,key_path):   #for getting a datafarame as a result

    db='datawarehouse'
    DB_HOST='datawarehouse.cdgpvetprks3.ap-south-1.rds.amazonaws.com'
    conn = None
    if SSH_required == 'Yes':
        SSH_HOST='ec2-15-206-161-154.ap-south-1.compute.amazonaws.com'
        #LOCALHOST="0.0.0.0"
        ssh_tunnel= SSHTunnelForwarder(
                (SSH_HOST, 22),
                ssh_username="ec2-user",
                ssh_private_key= key_path,
                ssh_private_key_password= "",
                remote_bind_address=(DB_HOST, 5432),
                local_bind_address=('127.0.0.1', 0)
        )
        # ssh_tunnel._server_list[0].block_on_close = False
        ssh_tunnel.start()
        conn = psy.connect(
            host=ssh_tunnel.local_bind_host,
            port=ssh_tunnel.local_bind_port,
            user='postgres',
            password= "Simply1234",
            database='postgres')
        df_results = pd.read_sql(query, conn)
        conn.close()
        ssh_tunnel.stop()
        return df_results
    else:
        conn = psy.connect(
            host = DB_HOST,
            port = 5432,
            user = 'postgres',
            password= "Simply1234",
            database='postgres')
        df_results = pd.read_sql(query, conn)
        conn.close()
        return df_results

In [5]:
# Usage with the actual path to the private key
SSH_required = 'Yes'
key_path = '/Users/rajatsansaniwal/Documents/tunnel-ssh .cer'
query = """with ops_main as (
    select
        awb
    ,   pickuptime
    from public.ops_main
    where 1=1
    and pickuptime is not null
    and shipping_partner = 'Hyperlocal'
    and date_trunc('day', pickuptime) = date_trunc('day', now() + interval'5.5 hours')
)
,
shipment_order_details as (
    select
        awb
    ,   warehouse_id
    from public.shipment_order_details
)
,
base as (
    select
        s.warehouse_id
    ,   o.pickuptime
    ,   o.awb
    from ops_main o
    left join shipment_order_details s on o.awb = s.awb
    order by 1, 2
)

select * from base"""

# Establish a connection
conn = get_conn(SSH_required, key_path)

# Retrieve data into a DataFrame
df = get_df_from_sql(SSH_required, query, key_path)

# Now you can perform further operations with the DataFrame 'df'
print(df)

Tunnel Started
Connection Made
      warehouse_id          pickuptime           awb
0              329 2024-07-08 10:56:17  GS1120176997
1              329 2024-07-08 10:56:45  GS1080402563
2              329 2024-07-08 10:56:56  GS2023280325
3              329 2024-07-08 10:57:07  GS1633410405
4              329 2024-07-08 10:57:30  GS1509814115
...            ...                 ...           ...
7093          2147 2024-07-08 12:00:50  GS1515106055
7094          2147 2024-07-08 12:00:57  GS2112840975
7095          2147 2024-07-08 12:01:07  GS1835884817
7096          2147 2024-07-08 12:01:20  GS1685924606
7097          2147 2024-07-08 12:01:34  GS1269768514

[7098 rows x 3 columns]


In [6]:
df['pickuptime'] = pd.to_datetime(df['pickuptime'])

results = []

for warehouse, group in df.groupby('warehouse_id'):
    group = group.reset_index(drop=True)

    pickup_start_time = group.loc[0, 'pickuptime']

    for i in range(1, len(group)):
        current_time = group.loc[i, 'pickuptime']
        previous_time = group.loc[i - 1, 'pickuptime']
        
        # Check the difference between current and previous pickuptime
        if (current_time - previous_time).total_seconds() > 1800:  # More than half hour
            # End the current pickup slot and start a new one
            results.append({
                'warehouse_id': warehouse,
                'pickup_start_time': pickup_start_time,
                'pickup_end_time': previous_time
            })
            pickup_start_time = current_time  # Update the start time for the new slot
    
    # Append the last slot for the warehouse
    results.append({
        'warehouse_id': warehouse,
        'pickup_start_time': pickup_start_time,
        'pickup_end_time': group.loc[len(group) - 1, 'pickuptime']
    })

# Create DataFrame from results list
results_df = pd.DataFrame(results)

# Calculate slot_duration in minutes
results_df['slot_duration'] = ((results_df['pickup_end_time'] - results_df['pickup_start_time']).dt.total_seconds() / 60).apply(math.ceil)

# Sort DataFrame by warehouse_id and pickup_start_time (if not already sorted)
results_df.sort_values(by=['warehouse_id', 'pickup_start_time'], inplace=True)

# Calculate gaps between pickup slots in minutes
results_df['gap_to_next'] = results_df.groupby('warehouse_id')['pickup_start_time'].shift(-1) - results_df['pickup_end_time']
results_df['gap_to_next'] = (results_df['gap_to_next'].dt.total_seconds() / 60)
results_df['gap_to_next'].fillna(0, inplace=True)
results_df['gap_to_next'] = (results_df['gap_to_next'].astype(int) + 1).replace(1, np.nan)

# Keep only the time portion (hours and minutes) for pickup_start_time and pickup_end_time
results_df['pickup_start_time'] = results_df['pickup_start_time'].dt.strftime('%H:%M')
results_df['pickup_end_time'] = results_df['pickup_end_time'].dt.strftime('%H:%M')

print(results_df)

# Save DataFrame to Excel
excel_file = 'results_df.xlsx'
results_df.to_excel(excel_file, index=False)

print(f"DataFrame saved to {excel_file}")

    warehouse_id pickup_start_time pickup_end_time  slot_duration  gap_to_next
0            329             10:56           11:17             22          NaN
1            370             03:49           03:58             10          NaN
2            403             10:42           11:13             32          NaN
3            833             08:10           08:16              6        164.0
4            833             10:59           10:59              1          NaN
..           ...               ...             ...            ...          ...
60          2069             09:21           09:23              2          NaN
61          2082             00:07           00:07              0          NaN
62          2129             11:58           12:01              3          NaN
63          2140             10:18           10:39             21          NaN
64          2147             11:58           12:01              3          NaN

[65 rows x 5 columns]
DataFrame saved to results_df

In [7]:
df['pickuptime'] = pd.to_datetime(df['pickuptime'])

def round_down_to_nearest_10min(dt):
    return dt - timedelta(minutes=dt.minute, seconds=dt.second, microseconds=dt.microsecond)

df['window'] = df['pickuptime'].apply(round_down_to_nearest_10min)

# Generate all possible 10-minute windows
start_time = df['pickuptime'].min().floor('H')
end_time = df['pickuptime'].max().ceil('H')
all_windows = pd.date_range(start=start_time, end=end_time, freq='H')

# Create a DataFrame for windows and initialize as inactive
window_df = pd.DataFrame({'window': all_windows})
window_df['active'] = False

# Add warehouse_id to windows and initialize as inactive
unique_warehouses = df['warehouse_id'].unique()
window_df = window_df.merge(pd.DataFrame({'warehouse_id': unique_warehouses}), how='cross')
window_df['active'] = False

# Mark active windows based on pickups
for warehouse in unique_warehouses:
    warehouse_windows = df[df['warehouse_id'] == warehouse]['window'].unique()
    window_df.loc[(window_df['warehouse_id'] == warehouse) & (window_df['window'].isin(warehouse_windows)), 'active'] = True

# Identify pickup start and end times
window_df['pickup_start'] = False
window_df['pickup_end'] = False

for warehouse in unique_warehouses:
    warehouse_windows = window_df[window_df['warehouse_id'] == warehouse].reset_index(drop=True)
    for i in range(len(warehouse_windows) - 1):
        current_active = warehouse_windows.iloc[i]['active']
        next_active = warehouse_windows.iloc[i + 1]['active']
        if i == 0 and current_active:
            window_df.loc[(window_df['warehouse_id'] == warehouse) & (window_df['window'] == warehouse_windows.iloc[i]['window']), 'pickup_start'] = True
        if not current_active and next_active:
            window_df.loc[(window_df['warehouse_id'] == warehouse) & (window_df['window'] == warehouse_windows.iloc[i + 1]['window']), 'pickup_start'] = True
        if current_active and not next_active:
            if i > 0:  # Ensure it's not the first window of the day
                window_df.loc[(window_df['warehouse_id'] == warehouse) & (window_df['window'] == warehouse_windows.iloc[i + 1]['window']), 'pickup_end'] = True


# Filter to show only start and end times
pickup_times = window_df[(window_df['pickup_start']) | (window_df['pickup_end'])]

# Display results
print(pickup_times[pickup_times['warehouse_id'] == 1615])


                 window  active  warehouse_id  pickup_start  pickup_end
515 2024-07-08 10:00:00    True          1615          True       False
615 2024-07-08 12:00:00   False          1615         False        True


In [8]:
# Initialize lists to store results
results = []

# Iterate through each warehouse_id
for warehouse in window_df['warehouse_id'].unique():
    # Filter rows for the current warehouse_id and reset index for iteration
    warehouse_windows = window_df[window_df['warehouse_id'] == warehouse].reset_index(drop=True)
    i = 0
    while i < len(warehouse_windows):
        if warehouse_windows.iloc[i]['pickup_start']:
            # Find the corresponding pickup_end
            j = i + 1
            while j < len(warehouse_windows):
                if warehouse_windows.iloc[j]['pickup_end']:
                    # Append the result for this pair of pickup_start and pickup_end
                    results.append({
                        'warehouse_id': warehouse,
                        'pickup_start_time': warehouse_windows.iloc[i]['window'],
                        'pickup_end_time': warehouse_windows.iloc[j]['window']
                    })
                    i = j + 1  # Move to the next pickup_start
                    break
                j += 1
        else:
            i += 1

# Create DataFrame from results list
results_df = pd.DataFrame(results)

# Step 1: Calculate slot_duration in minutes
results_df['slot_duration'] = (results_df['pickup_end_time'] - results_df['pickup_start_time']).dt.total_seconds() / 60

# Sort DataFrame by warehouse_id and pickup_start_time (if not already sorted)
results_df.sort_values(by=['warehouse_id', 'pickup_start_time'], inplace=True)

# Initialize gap_to_next column with NaN
results_df['gap_to_next'] = pd.NaT

# Iterate over each unique warehouse_id
unique_warehouses = results_df['warehouse_id'].unique()
for warehouse_id in unique_warehouses:
    # Select rows for the current warehouse_id
    warehouse_df = results_df[results_df['warehouse_id'] == warehouse_id].copy()
    
    # Calculate gaps between pickup slots in minutes
    warehouse_df['gap_to_next'] = (warehouse_df['pickup_start_time'].shift(-1) - warehouse_df['pickup_end_time']).dt.total_seconds() / 60
    warehouse_df.loc[warehouse_df.index[-1], 'gap_to_next'] = pd.NaT  # Set NaN for the last row
    
    # Update results_df with calculated gaps for the current warehouse_id
    results_df.loc[results_df['warehouse_id'] == warehouse_id, 'gap_to_next'] = warehouse_df['gap_to_next']


# Print the result
print(results_df)

# Save DataFrame to Excel
excel_file = 'results_df.xlsx'
results_df.to_excel(excel_file, index=False)

print(f"DataFrame saved to {excel_file}")


KeyboardInterrupt: 

In [None]:
# Initialize lists to store results
results = []

# Group by warehouse_id and iterate over each group
for warehouse, warehouse_windows in window_df.groupby('warehouse_id'):
    # Reset index for iteration
    warehouse_windows = warehouse_windows.reset_index(drop=True)
    i = 0
    while i < len(warehouse_windows):
        if warehouse_windows.iloc[i]['pickup_start']:
            # Find the corresponding pickup_end
            j = i + 1
            while j < len(warehouse_windows):
                if warehouse_windows.iloc[j]['pickup_end']:
                    # Append the result for this pair of pickup_start and pickup_end
                    results.append({
                        'warehouse_id': warehouse,
                        'pickup_start_time': warehouse_windows.iloc[i]['window'],
                        'pickup_end_time': warehouse_windows.iloc[j]['window']
                    })
                    i = j + 1  # Move to the next pickup_start
                    break
                j += 1
        else:
            i += 1

# Create DataFrame from results list
results_df = pd.DataFrame(results)

# Step 1: Calculate slot_duration in minutes
results_df['slot_duration'] = (results_df['pickup_end_time'] - results_df['pickup_start_time']).dt.total_seconds() / 60

# Sort DataFrame by warehouse_id and pickup_start_time (if not already sorted)
results_df.sort_values(by=['warehouse_id', 'pickup_start_time'], inplace=True)

# Calculate gaps between pickup slots in minutes
results_df['gap_to_next'] = results_df.groupby('warehouse_id')['pickup_start_time'].shift(-1) - results_df['pickup_end_time']
results_df['gap_to_next'] = results_df['gap_to_next'].dt.total_seconds() / 60

# Save DataFrame to Excel
excel_file = 'results_df.xlsx'
results_df.to_excel(excel_file, index=False)

print(f"DataFrame saved to {excel_file}")


KeyboardInterrupt: 