In [1]:
from sshtunnel import SSHTunnelForwarder
import psycopg2 as psy
import pandas as pd
from IPython.display import FileLink
import geopandas as gpd
import plotly.express as px
from dash import Dash, dcc, html, Input, Output
import paramiko
from io import StringIO
from shapely.geometry import MultiPoint, MultiPolygon
from sklearn import preprocessing, cluster
import scipy
import scipy.cluster
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from shapely.ops import unary_union
import calendar
from datetime import datetime



In [2]:
def get_conn(SSH_required,key_path):   #for getting a connection as a result

    db='datawarehouse'
    DB_HOST='datawarehouse.cdgpvetprks3.ap-south-1.rds.amazonaws.com'
    conn = []
    if SSH_required == 'Yes':
        SSH_HOST='ec2-15-206-161-154.ap-south-1.compute.amazonaws.com'
        #LOCALHOST="0.0.0.0"
        ssh_tunnel= SSHTunnelForwarder(
                (SSH_HOST, 22),
                ssh_username="ec2-user",
                ssh_private_key= key_path,
                ssh_private_key_password= "",
                remote_bind_address=(DB_HOST, 5432),
                local_bind_address=('127.0.0.1', 0)
        )
        print('Tunnel Started')
        ssh_tunnel.start()
        conn = psy.connect(
            host=ssh_tunnel.local_bind_host,
            port=ssh_tunnel.local_bind_port,
            user='postgres',
            password= "Simply1234",
            database='postgres')
        print('Connection Made')
        return conn
    else:
        conn = psy.connect(
            host = DB_HOST,
            port = 5432,
            user = 'postgres',
            password= "Simply1234",
            database='postgres')
        print('Connection Made')
        return conn

In [3]:
def get_df_from_sql(SSH_required, query,key_path):   #for getting a datafarame as a result

    db='datawarehouse'
    DB_HOST='datawarehouse.cdgpvetprks3.ap-south-1.rds.amazonaws.com'
    conn = None
    if SSH_required == 'Yes':
        SSH_HOST='ec2-15-206-161-154.ap-south-1.compute.amazonaws.com'
        #LOCALHOST="0.0.0.0"
        ssh_tunnel= SSHTunnelForwarder(
                (SSH_HOST, 22),
                ssh_username="ec2-user",
                ssh_private_key= key_path,
                ssh_private_key_password= "",
                remote_bind_address=(DB_HOST, 5432),
                local_bind_address=('127.0.0.1', 0)
        )
        # ssh_tunnel._server_list[0].block_on_close = False
        ssh_tunnel.start()
        conn = psy.connect(
            host=ssh_tunnel.local_bind_host,
            port=ssh_tunnel.local_bind_port,
            user='postgres',
            password= "Simply1234",
            database='postgres')
        df_results = pd.read_sql(query, conn)
        conn.close()
        ssh_tunnel.stop()
        return df_results
    else:
        conn = psy.connect(
            host = DB_HOST,
            port = 5432,
            user = 'postgres',
            password= "Simply1234",
            database='postgres')
        df_results = pd.read_sql(query, conn)
        conn.close()
        return df_results

In [4]:
# Usage with the actual path to the private key
SSH_required = 'Yes'
key_path = '/Users/rajatsansaniwal/Documents/tunnel-ssh .cer'
query = "select case when shipping_city = 'NCR' then 'Delhi' else shipping_city end as shipping_city, shipping_pincode, count(*) as orders from public.ops_main where date_trunc('month', created_date) = date_trunc('month', now()) and shipping_partner = 'Hyperlocal' group by shipping_city, shipping_pincode;"

# Establish a connection
conn = get_conn(SSH_required, key_path)

# Retrieve data into a DataFrame
df = get_df_from_sql(SSH_required, query, key_path)

# Now you can perform further operations with the DataFrame 'df'
# print(df)

Tunnel Started
Connection Made


In [7]:
# Reading GeoJSON
gdf_geojson = gpd.read_file("/Users/rajatsansaniwal/git_geoboards/geoboards/India_Pincodes/india_pincodes.shp")


# If the current CRS is geographic, re-project to UTM (EPSG:32644)
if gdf_geojson.crs.is_geographic:
    gdf_geojson = gdf_geojson.to_crs('EPSG:32644')



In [8]:
print(gdf_geojson)

      pincode      state   district    officename officetype  \
0      321023  Rajasthan  Bharatpur       Jurhera        S.O   
1      322236  Rajasthan    Karauli      Dhindora        S.O   
2      321024  Rajasthan  Bharatpur         Sikri        S.O   
3      322252  Rajasthan    Karauli        Suroth        S.O   
4      321201  Rajasthan  Bharatpur        Kumher        S.O   
...       ...        ...        ...           ...        ...   
19923  321001  Rajasthan  Bharatpur     Bharatpur        H.O   
19924  321006  Rajasthan      Alwar  Kherli Mandi        B.O   
19925  322234  Rajasthan    Karauli         Kheda        S.O   
19926  321022  Rajasthan  Bharatpur      Kamanlsg        S.O   
19927  365450    Gujarat     Amreli      Kunkavav        S.O   

                                                geometry  
0      POLYGON ((119407.644 3076831.433, 119463.598 3...  
1      POLYGON ((109494.095 2974773.564, 109195.126 2...  
2      POLYGON ((98855.834 3060564.045, 98838.681 306.

In [9]:
# Calculating the areas of pincodes
gdf_geojson['area'] = gdf_geojson['geometry'].area / 10**6

# Calculating centroids of pincodes
gdf_geojson = gdf_geojson.to_crs(epsg=4326)

gdf_geojson['latitude'] = gdf_geojson['geometry'].centroid.y
gdf_geojson['longitude'] = gdf_geojson['geometry'].centroid.x

# print(gdf_geojson.head())

In [10]:
# Ensure the pincode column datatype is consistent
df['shipping_pincode'] = df['shipping_pincode'].astype(str)
gdf_geojson['pincode'] = gdf_geojson['pincode'].astype(str)

# Merge GeoDataFrame and DataFrame
merged_gdf = gdf_geojson.merge(df, left_on='pincode', right_on='shipping_pincode')


# Orders per area
merged_gdf['orders per sq km'] = merged_gdf['orders'] / merged_gdf['area']
merged_gdf = merged_gdf[merged_gdf['orders per sq km'] > 5]


clustered_dff = pd.DataFrame()
k = 10

for city, city_data in merged_gdf.groupby("shipping_city"):
    # Check the number of samples in the city_data
    if len(city_data) >= k:  #'k' is the desired number of clusters
        # Selecting features for clustering, including 'latitude', 'longitude', and 'orders per sq km'
#         X = city_data[["latitude", "longitude", "orders per sq km"]]
        X = city_data[["latitude", "longitude"]]
        # Standardize the features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Apply KMeans clustering
        model = cluster.KMeans(n_clusters=k, init='k-means++')
        dtf_X = X.copy()
        dtf_X["cluster"] = model.fit_predict(X_scaled) + 1  # Add 1 to make clusters start from 1

        # Find centroids
        closest, distances = scipy.cluster.vq.vq(model.cluster_centers_, X_scaled)
        dtf_X["centroids"] = 0
        for i in closest:
            dtf_X["centroids"].iloc[i] = 1

        # Update 'cluster' and 'centroids' columns in original city_data
        city_data[["cluster", "centroids"]] = dtf_X[["cluster", "centroids"]]

        # Concatenate the clustered city_data to the final DataFrame
        clustered_dff = pd.concat([clustered_dff, city_data])
        
clustered_dff['current_date'] = datetime.now()
_, last_day_of_month = calendar.monthrange(clustered_dff['current_date'].dt.year.iloc[0], clustered_dff['current_date'].dt.month.iloc[0])
clustered_dff['gone_days'] = clustered_dff['current_date'].dt.day
clustered_dff['drr'] = clustered_dff['orders'] / clustered_dff['gone_days']

print(clustered_dff)

    pincode        state   district                  officename officetype  \
21   560056    Karnataka  Bangalore     Bnagalore Viswavidalaya        S.O   
43   560011    Karnataka  Bangalore          Jayangar III Block        S.O   
44   560002    Karnataka  Bangalore  Sri Jayachamarajendra Road        S.O   
45   560017    Karnataka  Bangalore                         NAL        S.O   
46   560003    Karnataka  Bangalore                 Malleswaram        S.O   
..      ...          ...        ...                         ...        ...   
244  411017  Maharashtra       Pune               Pimpri Colony        S.O   
337  411003  Maharashtra       Pune                      Khadki        S.O   
340  411004  Maharashtra       Pune                  A.R. Shala        S.O   
480  411012  Maharashtra       Pune                Dapodi Bazar        S.O   
501  411016  Maharashtra       Pune     Shivaji Housing Society        S.O   

                                              geometry       ar

In [11]:
from shapely.ops import unary_union

cluster_boundaries_gdf = gpd.GeoDataFrame(columns=['shipping_city', 'cluster', 'orders', 'geometry'])

for (city, cluster), cluster_data in clustered_dff.groupby(['shipping_city', 'cluster']):
    if len(cluster_data) >= 1:
        cluster_multipolygon = unary_union(cluster_data['geometry'])
    
        # cluster_boundaries_gdf = cluster_boundaries_gdf.append({
        #     'shipping_city': city,
        #     'cluster': cluster,
        #     'geometry': cluster_multipolygon,
        #     'orders': cluster_data['orders'].sum(),
        #     'number of pincodes': cluster_data['shipping_pincode'].count()
        # }, ignore_index=True)
        cluster_boundaries_gdf = pd.concat([cluster_boundaries_gdf, gpd.GeoDataFrame({
            'shipping_city': city,
            'cluster': cluster,
            'geometry': [cluster_multipolygon],
            'orders': cluster_data['orders'].sum(),
            'number of pincodes': cluster_data['shipping_pincode'].count()
        })], ignore_index=True)


# Ensure the CRS of the new GeoDataFrame is set
cluster_boundaries_gdf.crs = clustered_dff.crs

# If the current CRS is geographic, re-project to UTM (EPSG:32644)
if 'EPSG:4326' in str(cluster_boundaries_gdf.crs):
    # Set the CRS to UTM (EPSG:32644)
    cluster_boundaries_gdf = cluster_boundaries_gdf.to_crs('EPSG:32644')

# Calculate the areas and add a new column 'area' to the GeoDataFrame
cluster_boundaries_gdf['area'] = cluster_boundaries_gdf['geometry'].to_crs('EPSG:32644').area / 10**6
cluster_boundaries_gdf = cluster_boundaries_gdf.to_crs('EPSG:4326')
cluster_boundaries_gdf['orders per sq km'] = cluster_boundaries_gdf['orders'] / cluster_boundaries_gdf['area']


cluster_boundaries_gdf['current_date'] = datetime.now()
_, last_day_of_month = calendar.monthrange(cluster_boundaries_gdf['current_date'].dt.year.iloc[0], cluster_boundaries_gdf['current_date'].dt.month.iloc[0])
cluster_boundaries_gdf['gone_days'] = cluster_boundaries_gdf['current_date'].dt.day
cluster_boundaries_gdf['drr per sq km'] = cluster_boundaries_gdf['orders per sq km'] / cluster_boundaries_gdf['gone_days']
cluster_boundaries_gdf['drr'] = cluster_boundaries_gdf['orders'] / cluster_boundaries_gdf['gone_days']

print(cluster_boundaries_gdf)


   shipping_city cluster orders  \
0      Bangalore       1   3178   
1      Bangalore       2   7502   
2      Bangalore       3   5133   
3      Bangalore       4  13909   
4      Bangalore       5   4462   
5      Bangalore       6   5303   
6      Bangalore       7   9286   
7      Bangalore       8   5904   
8      Bangalore       9  14697   
9      Bangalore      10   5098   
10         Delhi       1  16659   
11         Delhi       2   7653   
12         Delhi       3   5715   
13         Delhi       4  14910   
14         Delhi       5   9543   
15         Delhi       6  14645   
16         Delhi       7  15971   
17         Delhi       8   4431   
18         Delhi       9  17701   
19         Delhi      10   2707   
20     Hyderabad       1   5905   
21     Hyderabad       2   7749   
22     Hyderabad       3   1837   
23     Hyderabad       4   4491   
24     Hyderabad       5   3745   
25     Hyderabad       6  11972   
26     Hyderabad       7   1971   
27     Hyderabad    

In [12]:
# FIltering datas based on cluster drr
filter_df = cluster_boundaries_gdf[['shipping_city', 'cluster', 'drr']].rename(columns={'drr': 'cluster_drr'})
print(filter_df)

   shipping_city cluster  cluster_drr
0      Bangalore       1   211.866667
1      Bangalore       2   500.133333
2      Bangalore       3        342.2
3      Bangalore       4   927.266667
4      Bangalore       5   297.466667
5      Bangalore       6   353.533333
6      Bangalore       7   619.066667
7      Bangalore       8        393.6
8      Bangalore       9        979.8
9      Bangalore      10   339.866667
10         Delhi       1       1110.6
11         Delhi       2        510.2
12         Delhi       3        381.0
13         Delhi       4        994.0
14         Delhi       5        636.2
15         Delhi       6   976.333333
16         Delhi       7  1064.733333
17         Delhi       8        295.4
18         Delhi       9  1180.066667
19         Delhi      10   180.466667
20     Hyderabad       1   393.666667
21     Hyderabad       2        516.6
22     Hyderabad       3   122.466667
23     Hyderabad       4        299.4
24     Hyderabad       5   249.666667
25     Hyder

In [13]:
clustered_dff = pd.merge(clustered_dff, filter_df, on=['shipping_city', 'cluster'], how='left', suffixes=('', '_cluster'))

cluster_dff = clustered_dff[clustered_dff['cluster_drr'] > 300.0]
cluster_boundaries_gdf = cluster_boundaries_gdf[cluster_boundaries_gdf['drr'] > 300]


# print(cluster_boundaries_gdf.head())
print(cluster_dff)

    pincode        state       district                  officename  \
0    560056    Karnataka      Bangalore     Bnagalore Viswavidalaya   
1    560011    Karnataka      Bangalore          Jayangar III Block   
2    560002    Karnataka      Bangalore  Sri Jayachamarajendra Road   
3    560017    Karnataka      Bangalore                         NAL   
4    560003    Karnataka      Bangalore                 Malleswaram   
..      ...          ...            ...                         ...   
366  400103  Maharashtra         Mumbai                Mandapeshwar   
367  400092  Maharashtra         Mumbai               Borivali West   
369  400083  Maharashtra         Mumbai             Kannamwar Nagar   
370  400089  Maharashtra         Mumbai        Tilak Nagar (Mumbai)   
371  400066  Maharashtra  Mumbai, Thane       Daulat Nagar (Mumbai)   

    officetype                                           geometry        area  \
0          S.O  POLYGON ((77.52018 12.95239, 77.52019 12.95237... 

In [14]:
# Initialize Dash app
app = Dash(__name__)

# Define app layout
app.layout = html.Div([
    html.H1("Orders per square km by pincode", style={'text-align': 'center'}),
    dcc.Input(
        id='search_shippingcity',
        type = 'text',
        placeholder='Search Shipping City...',
        value='Bangalore',
        style={'width': "40%"}
    ),
    dcc.Dropdown(
        id="slct_cluster",
        options=[
            {"label": f"Cluster {i}", "value": i} for i in range(1, 11)
        ],
        multi=True,
        value=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        style={'width': "100%"}
    ),
    html.Div(id='output_container_city', children=[]),
    html.Div(id='output_container_cluster', children=[]),
    html.Br(),
    html.Div([
        dcc.Graph(id='map', figure={}),
        dcc.Graph(id='map2', figure={}),
    ], style={'display': 'flex', 'justify-content': 'space-between'}),
])

# Callback to update the graph
@app.callback(
    [Output(component_id='output_container_city', component_property='children'),
     Output(component_id='output_container_cluster', component_property='children'),
     Output(component_id='map', component_property='figure')],
    [Input(component_id='search_shippingcity', component_property='value'),
     Input(component_id='slct_cluster', component_property='value' )]
)
def update_graph(city_slctd, cluster_slctd):
    if not city_slctd or city_slctd not in merged_gdf['shipping_city'].unique():
        return "", "", {}
    output_container_city = "Results are shown for the Shipping City: {}".format(city_slctd)
    output_container_cluster = f"Results are shown for Cluster: {cluster_slctd}"
    dff = cluster_dff[(cluster_dff["shipping_city"] == city_slctd) & (clustered_dff["cluster"].isin(cluster_slctd))]
    dff['cluster'] = dff['cluster'].astype(str)
    

    # Generate centroid
    centroid = dff.geometry.centroid
    centroid_wgs84 = centroid.to_crs(epsg=4326)
    dff['geometry'] = dff['geometry'].simplify(tolerance=0.001)
    
    fig = px.choropleth_mapbox(
        dff,  # Use filtered data from merged_gdf
        geojson=dff.geometry,  # Pass geometry from filtered data
        locations=dff.index,  # Use index from filtered data
        color='cluster',
        hover_data=['pincode', 'cluster'],
        mapbox_style="carto-positron",
        center={"lat": centroid_wgs84.y.mean(), "lon": centroid_wgs84.x.mean()},
        zoom=8.5,
        opacity=0.9,
        template='plotly_dark'
    )
    fig.update_traces(
        hovertemplate='<b>Pincode:</b> %{customdata[0]}<br><b>Place:</b> %{customdata[1]}<br><b>drr:</b> %{customdata[2]}',
        customdata=dff[['pincode', 'officename', 'drr']],  # Use 'pincode' and 'officename' as customdata
        text=dff['pincode'] # Set 'pincode' as text
    )
    return output_container_city, output_container_cluster, fig

@app.callback(
    Output(component_id='map2', component_property='figure'),
    [Input(component_id='search_shippingcity', component_property='value'),
     Input(component_id='slct_cluster', component_property='value')]
)
def update_map2(city_slctd, cluster_slctd):
    if not city_slctd or city_slctd not in merged_gdf['shipping_city'].unique():
        return {}
    
    # Filter data for the selected city and cluster
    dff2 = cluster_boundaries_gdf[(cluster_boundaries_gdf["shipping_city"] == city_slctd) & (cluster_boundaries_gdf["cluster"].isin(cluster_slctd))]
    dff2['drr per sq km'] = pd.to_numeric(dff2['drr per sq km'], errors='coerce')
    
    # Generate centroid
    centroid = dff2.geometry.centroid
    centroid_wgs84 = centroid.to_crs(epsg=4326)
    dff2['geometry'] = dff2['geometry'].simplify(tolerance=0.001)
    
    # Create the second map
    fig2 = px.choropleth_mapbox(
        dff2,
        geojson=dff2.geometry,
        locations=dff2.index,
        color='drr per sq km',
        hover_data=['cluster', 'drr per sq km'],
        mapbox_style="carto-positron",
        center={"lat": centroid_wgs84.y.mean(), "lon": centroid_wgs84.x.mean()},
        color_continuous_scale="RdYlGn",
        range_color=[dff2['drr per sq km'].min(), dff2['drr per sq km'].max()],
        zoom=8.5,
        opacity=0.9,
        template='plotly_dark'
    )
    
    return fig2



if __name__ == '__main__':
    app.run(debug=True, port=8053, mode='external')
#     app.run()
# print(grouped_dff[grouped_dff['shipping_city'] == 'Bangalore'])

In [15]:

cluster_level_data = cluster_boundaries_gdf[['shipping_city', 'cluster', 'number of pincodes', 'drr']].rename(columns={'shipping_city': 'city'}).astype({'drr': 'float'}).round({'drr': 0})

# Bangalore
bangalore = cluster_level_data[cluster_level_data['city'] == 'Bangalore'].sort_values(by='cluster')
bangalore.to_excel('cluster_bangalore.xlsx' , index=False)

# Mumbai
mumbai = cluster_level_data[cluster_level_data['city'] == 'Mumbai'].sort_values(by='cluster')
mumbai.to_excel('cluster_mumbai.xlsx', index=False)

# Delhi NCR
delhi_ncr = cluster_level_data[cluster_level_data['city'].isin(['Delhi', 'NCR'])].sort_values(by='cluster')
delhi_ncr.to_excel('cluster_delhi_ncr.xlsx', index=False)

# Hyderabad
hyderabad = cluster_level_data[cluster_level_data['city'] == 'Hyderabad'].sort_values(by='cluster')
hyderabad.to_excel('cluster_hyderabad.xlsx', index=False)

# Jaipur
jaipur = cluster_level_data[cluster_level_data['city'] == 'Jaipur'].sort_values(by='cluster')
jaipur.to_excel('cluster_jaipur.xlsx', index=False)

print(mumbai)

      city cluster  number of pincodes    drr
40  Mumbai       1                 5.0  312.0
41  Mumbai       2                14.0  384.0
42  Mumbai       3                 9.0  488.0
43  Mumbai       4                12.0  384.0
44  Mumbai       5                11.0  478.0
46  Mumbai       7                 7.0  417.0
47  Mumbai       8                 8.0  371.0


In [16]:
pincode_level_data = clustered_dff[['shipping_city', 'cluster', 'pincode', 'drr']].rename(columns={'shipping_city': 'city'}).round({'drr': 0})
print(pincode_level_data.head())

# Bangalore
bangalore = pincode_level_data[pincode_level_data['city'] == 'Bangalore'].sort_values(by='cluster')
bangalore.to_excel('pincode_bangalore.xlsx' , index=False)

# Mumbai
mumbai = pincode_level_data[pincode_level_data['city'] == 'Mumbai'].sort_values(by='cluster')
mumbai.to_excel('pincode_mumbai.xlsx' , index=False)

# Hyderabad
hyderabad = pincode_level_data[pincode_level_data['city'] == 'Hyderabad'].sort_values(by='cluster') 
hyderabad.to_excel('pincode_hyderabad.xlsx' , index=False)

# Delhi and NCR
delhi_ncr = pincode_level_data[pincode_level_data['city'].isin(['Delhi', 'NCR'])].sort_values(by='cluster')
delhi_ncr.to_excel('pincode_delhi_ncr.xlsx' , index=False)

# Jaipur
jaipur = pincode_level_data[pincode_level_data['city'] == 'Jaipur'].sort_values(by='cluster')
jaipur.to_excel('pincode_jaipur.xlsx' , index=False)



        city cluster pincode   drr
0  Bangalore       3  560056  47.0
1  Bangalore       2  560011  35.0
2  Bangalore       2  560002  13.0
3  Bangalore       9  560017  57.0
4  Bangalore      10  560003  34.0
