
### Data Transformation:
- The process of converting raw data into a more usable structure or format.
- Ensures data is in a consistent and structured state for analysis or processing.

In [7]:
pip install psycopg2

Note: you may need to restart the kernel to use updated packages.


In [3]:
import psycopg2
import pandas as pd
import numpy as np
import re


### Function `connectToPgsql()` : Connect to a PostgreSQL Database

In [6]:

def connectToPgsql() :
    conn = psycopg2.connect(database = "osm", 
                        user = "adityadaharwal", 
                        host= 'localhost',
                        password = "1234",
                        port = 5432)
    return conn
    


### Show All Data Types in Database

In [9]:
conn = connectToPgsql()

# show all type of dat in database
cur = conn.cursor()
cur.execute('SELECT way_id,tags,ST_X((ST_DumpPoints(geom)).geom) AS lon, ST_Y((ST_DumpPoints(geom)).geom) AS lat, geom FROM roads limit 5;')

# Fetch results
data = cur.fetchall()

# Get column names
colnames = [desc[0] for desc in cur.description]

# Create a DataFrame
df_roads = pd.DataFrame(data, columns=colnames)

conn.commit()
conn.close()

print(df_roads)

       way_id                  tags        lon        lat  \
0  1029686101  {'highway': 'track'}  68.262955  24.272271   
1  1029686101  {'highway': 'track'}  68.263811  24.269557   
2  1029686101  {'highway': 'track'}  68.264682  24.266833   
3  1029686101  {'highway': 'track'}  68.264598  24.264390   
4  1029686101  {'highway': 'track'}  68.265001  24.262094   

                                                geom  
0  0102000020E61000002B00000076543541D410514033A6...  
1  0102000020E61000002B00000076543541D410514033A6...  
2  0102000020E61000002B00000076543541D410514033A6...  
3  0102000020E61000002B00000076543541D410514033A6...  
4  0102000020E61000002B00000076543541D410514033A6...  


This code queries highway data with geographical coordinates, fetches the results, and displays them as a DataFrame.

In [12]:
connectToPgsql()

# get highway type
conn = connectToPgsql()

cur = conn.cursor()
cur.execute("SELECT way_id,tags->>'highway' AS road_type,ST_X((ST_DumpPoints(geom)).geom) AS lon, ST_Y((ST_DumpPoints(geom)).geom) AS lat FROM roads limit 10;")

# Fetch results
data = cur.fetchall()

# Get column names
colnames = [desc[0] for desc in cur.description]

# Create a DataFrame
df_highway = pd.DataFrame(data, columns=colnames)

conn.commit()
conn.close()

print(df_highway)

       way_id road_type        lon        lat
0  1029686101     track  68.262955  24.272271
1  1029686101     track  68.263811  24.269557
2  1029686101     track  68.264682  24.266833
3  1029686101     track  68.264598  24.264390
4  1029686101     track  68.265001  24.262094
5  1029686101     track  68.265265  24.260707
6  1029686101     track  68.266208  24.254792
7  1029686101     track  68.266758  24.251710
8  1029686101     track  68.267211  24.244441
9  1029686101     track  68.267728  24.241639


In [14]:
connectToPgsql()

# get highway type
conn = connectToPgsql()

# show all type of dat in database
cur = conn.cursor()
cur.execute("SELECT distinct tags->>'highway' AS road_type FROM roads;")

# Fetch results
data = cur.fetchall()

# Get column names
colnames = [desc[0] for desc in cur.description]

# Create a DataFrame
df_highway = pd.DataFrame(data, columns=colnames)

conn.commit()
conn.close()

print(df_highway)

         road_type
0        bridleway
1     bus_guideway
2         bus_stop
3           busway
4     construction
5         corridor
6         cycleway
7         elevator
8          footway
9    living_street
10        motorway
11   motorway_link
12            path
13      pedestrian
14        platform
15         primary
16    primary_link
17        proposed
18         raceway
19           razed
20     residential
21       rest_area
22            road
23       secondary
24  secondary_link
25         service
26        services
27           steps
28        tertiary
29   tertiary_link
30           track
31           trunk
32      trunk_link
33    unclassified


## Road type considered in this study

In [17]:
# divide the in these categories : 
# living_street : A living street is a street designed with the interests of pedestrians and cyclists , so we not considering that further
# residential
# tertiary
# tertiary_link
# secondary
# secondary_link
# primary
# primary_link
# trunk
# trunk_link



### Split Data by Road and Feature Types in DataFrame

In [None]:
# functions

def split_dataframe_by_road_type(df):
    # Create a dictionary to hold DataFrames for each road type
    road_type_dfs = {}
    
    # Set unique road types 
    unique_road_types = ['tertiary','tertiary_link','secondary','secondary_link','primary','primary_link','trunk','trunk_link']
    
    # Iterate over unique road types and create a separate DataFrame for each
    for road_type in unique_road_types:
        road_type_dfs[road_type] = df[df['tag'] == road_type].reset_index(drop=True)
    
    return road_type_dfs


def split_dataframe_by_features_type(df):
    # Create a dictionary to hold DataFrames for each road type
    features_type_dfs = {}
    
    # Set unique road types 
    unique_features_types = ['college','govt_institute','hotel','luxury_hotel','prime_hotel','malls','schools','it_companies','tourist_spot','hospital']
    
    # Iterate over unique road types and create a separate DataFrame for each
    for features_type in unique_features_types:
        features_type_dfs[features_type] = df[df['tag'] == features_type].reset_index(drop=True)
    
    return features_type_dfs


### Find nearest points in data

In [24]:
# Haversine formula to calculate distance between two points on the Earth's surface
def haversine(lon1, lat1, lon2, lat2):
    # Convert degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return c * r * 1000 # distance in meter

def find_shortest_points(src_lat, src_lon, df,nPoints):
    # Calculate the distance for each point in the DataFrame
    df['distance'] = df.apply(lambda row: haversine(src_lon, src_lat, row['lon'], row['lat']), axis=1)
    
    # Sort the DataFrame by distance and return the three closest points
    closest_points = df.nsmallest(nPoints, 'distance')
    return closest_points[['lat', 'lon', 'distance']]

def get_shortest_point_and_mean(dict,tag,src_lat, src_lon,nPoints):
    df_temp = find_shortest_points(src_lat, src_lon, dict[tag],nPoints)
    # print(df_temp.distance.min(),df_temp.distance.mean(),tag)
    return df_temp.distance.min(),df_temp.distance.mean()


**Function**: `get_data_from_database`

---

**Input**:  
- `df` (DataFrame): A DataFrame containing columns `Lat` and `Lng` representing geographical coordinates for each row.  

**Output**:  
- Returns the updated DataFrame with additional columns representing distances to different road types and feature types.

**Desc**:  
This function connects to a PostgreSQL database to retrieve data on roads and features based on geographical coordinates provided in the input DataFrame. It performs the following steps:
1. **Initialize columns**: The function adds new columns to the DataFrame for storing distances to different types of roads (e.g., `tertiary`, `secondary`, `primary`, `trunk`) and features (e.g., `college`, `govt_institute`, `hotel`).
2. **Get road data**: It queries the database for road data (excluding residential and service roads) near each coordinate and calculates the shortest distances to various road types within a 3 km radius. It also handles distances for residential roads within a 500-meter radius.
3. **Get feature data**: It queries the database for feature data (e.g., colleges, hospitals) within a 6 km radius of each coordinate and calculates the shortest distances to the various features.
4. **Update the DataFrame**: For each row in the input DataFrame, the function calculates the required distances and updates the corresponding columns with the calculated values.
5. **Return**: The function commits changes to the database and closes the connection before returning the updated DataFrame.

This function is useful for spatial analysis, where you need to calculate distances from points of interest to various road types and features.

In [None]:
def get_data_from_database(df):

    #creating new columns
    road_types_columns = ['tertiary','secondary','primary','trunk']
    
    for column in road_types_columns:
        df.loc[:,f'{column}_road_dist'] = np.inf
        df.loc[:,f'{column}_road_mean_dist']= np.inf
        df.loc[:,f'{column}_link_dist']= np.inf
        df.loc[:,f'{column}_link_mean_dist']= np.inf

    feature_types_columns = ['college','govt_institute','hotel','luxury_hotel','prime_hotel','malls','schools','it_companies','tourist_spot','hospital']
    
    for column in feature_types_columns : 
        df.loc[:,f'{column}_dist'] = np.inf
        df.loc[:,f'{column}_mean_dist']= np.inf
    
    conn = connectToPgsql()
    cur = conn.cursor()

    for i,row in df.iterrows():
        
        # get road data (all types of road execpt residential)
        cur.execute(f"""
            SELECT way_id, tags->>'highway' AS tag, ST_X((ST_DumpPoints(geom)).geom) AS lon, ST_Y((ST_DumpPoints (geom)).geom) AS lat
            FROM roads
            WHERE ST_DWithin(
                geom,
                ST_SetSRID(ST_MakePoint({row.Lng},{row.Lat}), 4326),
                0.03  -- Distance in degree = (3 km)
                ) and tags->>'highway' != 'residential' and tags->>'highway'!= 'service';
            """)
    
        # Fetch results
        data = cur.fetchall()
        # Get column names
        colnames = [desc[0] for desc in cur.description]
        # Create a DataFrame
        df_highway = pd.DataFrame(data, columns=colnames)
        # spilt dataframe by category
        road_type_dfs = split_dataframe_by_road_type(df_highway)

    
        for colname in road_types_columns:
            row[f'{colname}_road_dist'] , row[f'{colname}_road_mean_dist'] = get_shortest_point_and_mean(road_type_dfs,colname,row.Lat,row.Lng,500)
            row[f'{colname}_link_dist'] , row[f'{colname}_link_mean_dist'] = get_shortest_point_and_mean(road_type_dfs,f'{colname}_link',row.Lat,row.Lng,10)
            #print(row[f'{column}_road_dist'] , row[f'{column}_road_mean_dist'])

        
        # get road data(residential) , there are too many residential ,therefore in searching 500 m only  
        cur.execute(f"""
            SELECT way_id, tags->>'highway' AS tag, ST_X((ST_DumpPoints(geom)).geom) AS lon, ST_Y((ST_DumpPoints(geom)).geom) AS lat
            FROM roads
            WHERE ST_DWithin(
                geom,
                ST_SetSRID(ST_MakePoint({row.Lng},{row.Lat}), 4326),
                0.005  -- Distance in degree = ( 500 m)
                ) and tags->>'highway' = 'residential';
            """)
        # Fetch results
        data = cur.fetchall()
        # Get column names
        colnames = [desc[0] for desc in cur.description]
        # Create a DataFrame
        df_highway_residential = pd.DataFrame(data, columns=colnames)
        df_highway_residential = find_shortest_points(row.Lat, row.Lng, df_highway_residential,1)
        # spilt dataframe by category
        row['residential_road_dist'] = df_highway_residential['distance'].iloc[0]

        # get features data   
        cur.execute(f"""
            SELECT review_count, rating, tag, public.ST_X(geom::geometry) AS lon, public.ST_Y(geom::geometry) AS lat 
            FROM features
            WHERE ST_DWithin(
                geom::geometry,
                ST_SetSRID(ST_MakePoint({row.Lng},{row.Lat}), 4326),
                0.06  -- Distance in degree = ( 6 km)
                );
            """)
        # Fetch results
        data = cur.fetchall()
        # Get column names
        colnames = [desc[0] for desc in cur.description]
        # Create a DataFrame
        df_features = pd.DataFrame(data, columns=colnames)
        # spilt dataframe by category
        features_type_dfs =  split_dataframe_by_features_type(df_features)
        for colname in feature_types_columns:
            row[f'{colname}_dist'] , row[f'{colname}_mean_dist'] = get_shortest_point_and_mean(features_type_dfs,colname,row.Lat,row.Lng,20)

        #update the dataframe
        df.loc[i] = row
        
    conn.commit()
    conn.close()

    return df 
    

In [28]:
import warnings
warnings.filterwarnings('ignore')


In [38]:
%%time

# Property dataset
property_dataset = pd.read_csv('./CustomDataset.csv')
dataset = get_data_from_database(property_dataset)


len(dataset)

297.39515805721237 1739.175054672102 tertiary
2282.341509106695 2792.110049776672 tertiary_link
297.39515805721237 941.9868678039929 secondary
830.4610324964924 847.3697533798155 secondary_link
1106.3867708090102 2435.4809806794046 primary
1105.5217646740828 1603.6535556896572 primary_link
2027.910000610484 2870.008668044689 trunk
2081.278681834842 2118.885545996921 trunk_link
746.8819049733947 2202.7507348301046 college
501.76262438751587 2163.4083532432783 govt_institute
688.4559397543064 2697.1258208392983 hotel
905.2457709170428 2966.9111817682215 luxury_hotel
544.824920352361 1894.645503468352 prime_hotel
485.7378079213591 2070.420246128022 malls
1004.730276060742 2010.278518322038 schools
307.24468902122123 817.5894166541672 it_companies
900.3108164441661 2081.101360085616 tourist_spot
556.843272468316 1172.438692596444 hospital
65.42837483900755 1682.2445055168414 tertiary
2082.438246115905 2532.193133213763 tertiary_link
209.74776997229017 890.9204123865863 secondary
590.768094

77

In [48]:
dataset.head()
column_names = dataset.columns

print(column_names)

Index(['Lat', 'Lng', 'Type', 'Price', 'Sqft', 'Bhk', 'tertiary_road_dist',
       'tertiary_road_mean_dist', 'tertiary_link_dist',
       'tertiary_link_mean_dist', 'secondary_road_dist',
       'secondary_road_mean_dist', 'secondary_link_dist',
       'secondary_link_mean_dist', 'primary_road_dist',
       'primary_road_mean_dist', 'primary_link_dist', 'primary_link_mean_dist',
       'trunk_road_dist', 'trunk_road_mean_dist', 'trunk_link_dist',
       'trunk_link_mean_dist', 'college_dist', 'college_mean_dist',
       'govt_institute_dist', 'govt_institute_mean_dist', 'hotel_dist',
       'hotel_mean_dist', 'luxury_hotel_dist', 'luxury_hotel_mean_dist',
       'prime_hotel_dist', 'prime_hotel_mean_dist', 'malls_dist',
       'malls_mean_dist', 'schools_dist', 'schools_mean_dist',
       'it_companies_dist', 'it_companies_mean_dist', 'tourist_spot_dist',
       'tourist_spot_mean_dist', 'hospital_dist', 'hospital_mean_dist'],
      dtype='object')


In [42]:
# Save the DataFrame to a CSV file
dataset.to_csv('dataset.csv', index=False)

In [242]:
property_dataset.loc[0]

Lat       21.127504
Lng        79.05486
Type      Land/Plot
Price      18000000
Sqft          13333
Bhk               0
Name: 0, dtype: object

In [196]:
%%time

connectToPgsql()

# get highway type
conn = connectToPgsql()

# show all type of dat in database
cur = conn.cursor()
cur.execute("SELECT way_id, tags->>'highway' AS tag, ST_X((ST_DumpPoints(geom)).geom) AS lon, ST_Y((ST_DumpPoints(geom)).geom) AS lat FROM roads WHERE ST_DWithin(geom,ST_SetSRID(ST_MakePoint(79.0917279, 21.1231725), 4326), 0.03 ) and tags->>'highway' != 'residential' and tags->>'highway'!= 'service';")

# Fetch results
data = cur.fetchall()

# Get column names
colnames = [desc[0] for desc in cur.description]

# Create a DataFrame
df_highway = pd.DataFrame(data, columns=colnames)

conn.commit()
conn.close()

print(df_highway)

         way_id        tag        lon        lat
0     505042335  secondary  79.068463  21.104287
1     505042335  secondary  79.068244  21.104394
2      27082729  secondary  79.076091  21.128580
3      27082729  secondary  79.075979  21.128553
4      27082729  secondary  79.075849  21.128515
...         ...        ...        ...        ...
7081   27116911  secondary  79.060589  21.135667
7082   27116911  secondary  79.060553  21.135872
7083   27116911  secondary  79.060529  21.135960
7084   27116911  secondary  79.060517  21.136011
7085   27116911  secondary  79.060492  21.136115

[7086 rows x 4 columns]
CPU times: user 24.2 ms, sys: 16.3 ms, total: 40.5 ms
Wall time: 157 ms


In [124]:
%%time

r=split_dataframe_by_road_type(df_highway)
r['primary'].head()
d  = find_shortest_points( 21.1231725, 79.0917279, r['primary'],500)
d.head()

CPU times: user 24 ms, sys: 2.66 ms, total: 26.7 ms
Wall time: 25.2 ms


Unnamed: 0,lat,lon,distance
597,21.134982,79.103793,1813.975972
598,21.135084,79.10368,1814.096091
596,21.134886,79.103938,1816.701724
595,21.134858,79.103994,1818.487748
599,21.135309,79.103525,1821.615874


In [39]:
r['primary'].head()

Unnamed: 0,way_id,tag,lon,lat,distance
0,304334916,primary,79.069987,21.110706,2647.089743
1,304334916,primary,79.069982,21.110707,2647.476588
2,304334916,primary,79.069424,21.11084,2689.430926
3,304334916,primary,79.068916,21.111006,2725.674726
4,304334916,primary,79.067909,21.11134,2799.142818


In [37]:
road_type_dfs = split_dataframe_by_road_type(df_highway)
road_type_dfs.keys()

dict_keys(['residential', 'tertiary', 'tertiary_link', 'secondary', 'secondary_link', 'primary', 'primary_link', 'trunk', 'trunk_link'])

In [163]:
# get lat lon
# get all roads near by from database 
# sort in catergory 
# find top 20
# if null, set infinite 
# search features near by 
# find top three 
# find top three 

# create data frame 

In [165]:
# note 
# 0.0001 degrees: 11.1 meters
# 0.00001 degrees: 1.11 meters

In [49]:
# # Iterate through the DataFrame using itertuples
# # for row in r.itertuples(index=True):
#     #print(f"Index: {row.Index}, Way ID: {row.way_id}, Road Type: {row.road_type}, Lon: {row.lon}, Lat: {row.lat}")
# def process_row(row):
#     return f"Way ID: {row['way_id']}, Road Type: {row['road_type']}"

# # Apply function to each row
# results = r.apply(process_row, axis=1)
# result.head()

In [None]:
tertiary_road = road_type_dfs['tertiary']
tertiary_link = road_type_dfs['tertiary_link']
secondary_road = road_type_dfs['secondary']
secondary_link = road_type_dfs['secondary_link']
primary_road = road_type_dfs['primary']
primary_link = road_type_dfs['primary_link']
trunk_road = road_type_dfs['trunk']
trunk_link = road_type_dfs['trunk_link']
