In [1]:
import pandas as pd

In [4]:
geocoded_area = pd.read_csv('combined_geocoded_areas.csv')
sales_data = pd.read_csv('../data/snp_dld_2024_transactions.csv')

In [11]:
location_columns_sales = [
    "project_name_en",
    "nearest_landmark_en",
    "nearest_metro_en",
    "nearest_mall_en",
    "master_project_en",
]

In [12]:


# Grouping by 'area_en' and counting distinct values for each location column
location_counts = sales_data.groupby("area_en")[location_columns_sales].nunique()

# Rename columns to reflect the count information
location_counts = location_counts.rename(columns={col: f"{col}_count" for col in location_columns_sales if col != "area_en"})



In [14]:
location_counts.info()


<class 'pandas.core.frame.DataFrame'>
Index: 273 entries, AL Athbah to Zaabeel Second
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   project_name_en_count      273 non-null    int64
 1   nearest_landmark_en_count  273 non-null    int64
 2   nearest_metro_en_count     273 non-null    int64
 3   nearest_mall_en_count      273 non-null    int64
 4   master_project_en_count    273 non-null    int64
dtypes: int64(5)
memory usage: 12.8+ KB


In [17]:
# Merge the DataFrames on 'area_en'
merged_df = pd.merge(
    location_counts,  # Use location_counts as the base to preserve its order
    geocoded_area,
    on="area_en",
    how="left"  # Ensure all rows in location_counts are retained
)

# Save the combined DataFrame to a CSV
merged_df.to_csv("ordered_combined_geocoded_with_counts.csv", index=False)

# Print a summary
print(f"Combined DataFrame saved with shape: {merged_df.shape}")
print(merged_df.head())


Combined DataFrame saved with shape: (273, 8)
            area_en  project_name_en_count  nearest_landmark_en_count  \
0         AL Athbah                      0                          0   
1         AL BARARI                      2                          1   
2         AL FURJAN                     63                          2   
3  AL KHAIL HEIGHTS                      1                          1   
4           AL WAHA                      1                          1   

   nearest_metro_en_count  nearest_mall_en_count  master_project_en_count  \
0                       0                      0                        0   
1                       1                      0                        0   
2                       2                      1                        1   
3                       1                      1                        0   
4                       0                      0                        0   

    latitude  longitude  
0  25.186046  55.475980  


# Utilizing the coordinates.

In [21]:
sales_data['nearest_landmark_en'].unique()

array(['Sports City Swimming Academy', 'IMG World Adventures', nan,
       'Dubai International Airport', 'Downtown Dubai', 'Motor City',
       'Burj Al Arab', 'Dubai Cycling Course', 'Burj Khalifa',
       'Hamdan Sports Complex', 'Global Village', 'Expo 2020 Site',
       'Al Makhtoum International Airport', 'Dubai Parks and Resorts',
       'Jabel Ali'], dtype=object)

In [24]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import os
import time

# Initialize geolocator
geolocator = Nominatim(user_agent="geoapi")

# Landmarks to process
landmarks = [
    'Sports City Swimming Academy', 'IMG World Adventures',
    'Dubai International Airport', 'Downtown Dubai', 'Motor City',
    'Burj Al Arab', 'Dubai Cycling Course', 'Burj Khalifa',
    'Hamdan Sports Complex', 'Global Village', 'Expo 2020 Site',
    'Al Makhtoum International Airport', 'Dubai Parks and Resorts',
    'Jabel Ali'
]

# Define Dubai's approximate geographic bounds
DUBAI_BOUNDS = {
    "north": 25.40,  # Approx. northern latitude
    "south": 24.85,  # Approx. southern latitude
    "east": 55.60,   # Approx. eastern longitude
    "west": 55.00    # Approx. western longitude
}

# Load cached coordinates if available
cache_file = "landmark_coordinates.csv"
if os.path.exists(cache_file):
    landmark_coords = pd.read_csv(cache_file)
    print(f"Loaded cached landmark coordinates from '{cache_file}'.")
else:
    # Initialize empty DataFrame
    landmark_coords = pd.DataFrame(columns=['landmark', 'latitude', 'longitude'])

# Get a list of landmarks already processed
processed_landmarks = set(landmark_coords['landmark'])

# Find missing landmarks
missing_landmarks = [lm for lm in landmarks if lm not in processed_landmarks]
print(f"Missing landmarks to process: {missing_landmarks}")

# Function to check if coordinates are within Dubai's boundaries
def is_within_dubai(lat, lon):
    if lat is None or lon is None:
        return False
    return DUBAI_BOUNDS["south"] <= lat <= DUBAI_BOUNDS["north"] and DUBAI_BOUNDS["west"] <= lon <= DUBAI_BOUNDS["east"]

# Function to fetch coordinates for a given landmark
def fetch_coordinates(landmark):
    try:
        location = geolocator.geocode(f"{landmark}, Dubai")
        if location and is_within_dubai(location.latitude, location.longitude):
            print(f"Found coordinates for '{landmark}': ({location.latitude}, {location.longitude})")
            return location.latitude, location.longitude
        else:
            print(f"Could not find valid coordinates for '{landmark}'.")
            return None, None
    except GeocoderTimedOut:
        print(f"Timeout error while fetching coordinates for '{landmark}'. Retrying...")
        return fetch_coordinates(landmark)
    except Exception as e:
        print(f"Error fetching coordinates for '{landmark}': {e}")
        return None, None

# Process missing landmarks
for landmark in missing_landmarks:
    lat, lon = fetch_coordinates(landmark)
    # Append the result to the DataFrame
    landmark_coords = pd.concat(
        [landmark_coords, pd.DataFrame({'landmark': [landmark], 'latitude': [lat], 'longitude': [lon]})],
        ignore_index=True
    )
    time.sleep(1)  # Respect API rate limits

# Save updated coordinates to cache
landmark_coords.to_csv(cache_file, index=False)
print(f"Updated landmark coordinates saved to '{cache_file}'.")


Missing landmarks to process: ['Sports City Swimming Academy', 'IMG World Adventures', 'Dubai International Airport', 'Downtown Dubai', 'Motor City', 'Burj Al Arab', 'Dubai Cycling Course', 'Burj Khalifa', 'Hamdan Sports Complex', 'Global Village', 'Expo 2020 Site', 'Al Makhtoum International Airport', 'Dubai Parks and Resorts', 'Jabel Ali']
Could not find valid coordinates for 'Sports City Swimming Academy'.
Could not find valid coordinates for 'IMG World Adventures'.
Found coordinates for 'Dubai International Airport': (25.2521287, 55.365715576690505)


  landmark_coords = pd.concat(


Found coordinates for 'Downtown Dubai': (25.19475815, 55.27805910442858)
Found coordinates for 'Motor City': (25.047690199999998, 55.23820948643042)
Found coordinates for 'Burj Al Arab': (25.141327099999998, 55.18539672753495)
Could not find valid coordinates for 'Dubai Cycling Course'.
Found coordinates for 'Burj Khalifa': (25.197033599999997, 55.27413294647308)
Found coordinates for 'Hamdan Sports Complex': (25.051092949999997, 55.31808411322436)
Found coordinates for 'Global Village': (25.0687531, 55.30683291261017)
Found coordinates for 'Expo 2020 Site': (24.966807, 55.15825718982431)
Could not find valid coordinates for 'Al Makhtoum International Airport'.
Found coordinates for 'Dubai Parks and Resorts': (24.9198548, 55.00868)
Found coordinates for 'Jabel Ali': (25.01605595, 55.13787589174201)
Updated landmark coordinates saved to 'landmark_coordinates.csv'.


# Bad API, the rest is filled manually

In [25]:
sales_data['nearest_mall_en'].unique()

array(['Mall of the Emirates', 'Marina Mall', nan, 'Ibn-e-Battuta Mall',
       'City Centre Mirdif', 'Dubai Mall'], dtype=object)

In [29]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import os
import time

# Initialize geolocator
geolocator = Nominatim(user_agent="geoapi")

# Load dataset and extract unique nearest malls
nearest_malls = sales_data['nearest_mall_en'].dropna().unique()

# Define Dubai's approximate geographic bounds
DUBAI_BOUNDS = {
    "north": 25.40,  # Approx. northern latitude
    "south": 24.85,  # Approx. southern latitude
    "east": 55.60,   # Approx. eastern longitude
    "west": 55.00    # Approx. western longitude
}

# Load cached coordinates if available
cache_file = "mall_coordinates.csv"
if os.path.exists(cache_file):
    mall_coords = pd.read_csv(cache_file)
    print(f"Loaded cached mall coordinates from '{cache_file}'.")
else:
    # Initialize empty DataFrame
    mall_coords = pd.DataFrame(columns=['mall', 'latitude', 'longitude'])

# Get a list of malls already processed
processed_malls = set(mall_coords['mall'])

# Find missing malls
missing_malls = [mall for mall in nearest_malls if mall not in processed_malls]
print(f"Missing malls to process: {missing_malls}")

# Function to check if coordinates are within Dubai's boundaries
def is_within_dubai(lat, lon):
    if lat is None or lon is None:
        return False
    return DUBAI_BOUNDS["south"] <= lat <= DUBAI_BOUNDS["north"] and DUBAI_BOUNDS["west"] <= lon <= DUBAI_BOUNDS["east"]

# Function to fetch coordinates for a given mall
def fetch_coordinates(mall):
    try:
        location = geolocator.geocode(f"{mall}, Dubai")
        if location and is_within_dubai(location.latitude, location.longitude):
            print(f"Found coordinates for '{mall}': ({location.latitude}, {location.longitude})")
            return location.latitude, location.longitude
        else:
            print(f"Could not find valid coordinates for '{mall}'.")
            return None, None
    except GeocoderTimedOut:
        print(f"Timeout error while fetching coordinates for '{mall}'. Retrying...")
        return fetch_coordinates(mall)
    except Exception as e:
        print(f"Error fetching coordinates for '{mall}': {e}")
        return None, None

# Process missing malls
for mall in missing_malls:
    lat, lon = fetch_coordinates(mall)
    # Append the result to the DataFrame
    mall_coords = pd.concat(
        [mall_coords, pd.DataFrame({'mall': [mall], 'latitude': [lat], 'longitude': [lon]})],
        ignore_index=True
    )
    time.sleep(1)  # Respect API rate limits

# Save updated coordinates to cache
mall_coords.to_csv(cache_file, index=False)
print(f"Updated mall coordinates saved to '{cache_file}'.")


Missing malls to process: ['Mall of the Emirates', 'Marina Mall', 'Ibn-e-Battuta Mall', 'City Centre Mirdif', 'Dubai Mall']
Found coordinates for 'Mall of the Emirates': (25.1212029, 55.2004469)


  mall_coords = pd.concat(


Found coordinates for 'Marina Mall': (25.0776245, 55.14005375)
Could not find valid coordinates for 'Ibn-e-Battuta Mall'.
Found coordinates for 'City Centre Mirdif': (25.2173429, 55.4071287)
Found coordinates for 'Dubai Mall': (25.197043999999998, 55.27895163264769)
Updated mall coordinates saved to 'mall_coordinates.csv'.


In [30]:
sales_data['master_project_en'].unique()

array([nan, 'Palace Beach Residence',
       'Jebel Ali Village Townhouses- Phase 1', 'Hills Park',
       ' Mohammed Bin Rashid Al Maktoum City , District One Phase III  , Residences 22',
       'Remraam - Al Ramth', 'JANNAT', 'Elysee Heights'], dtype=object)

In [31]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import os
import time

# Initialize geolocator
geolocator = Nominatim(user_agent="geoapi")

# Load dataset and extract unique master projects
master_projects = sales_data['master_project_en'].dropna().unique()

# Define Dubai's approximate geographic bounds
DUBAI_BOUNDS = {
    "north": 25.40,  # Approx. northern latitude
    "south": 24.85,  # Approx. southern latitude
    "east": 55.60,   # Approx. eastern longitude
    "west": 55.00    # Approx. western longitude
}

# Load cached coordinates if available
cache_file = "master_project_coordinates.csv"
if os.path.exists(cache_file):
    master_project_coords = pd.read_csv(cache_file)
    print(f"Loaded cached master project coordinates from '{cache_file}'.")
else:
    # Initialize empty DataFrame
    master_project_coords = pd.DataFrame(columns=['master_project', 'latitude', 'longitude'])

# Get a list of master projects already processed
processed_projects = set(master_project_coords['master_project'])

# Find missing master projects
missing_projects = [project for project in master_projects if project not in processed_projects]
print(f"Missing master projects to process: {missing_projects}")

# Function to check if coordinates are within Dubai's boundaries
def is_within_dubai(lat, lon):
    if lat is None or lon is None:
        return False
    return DUBAI_BOUNDS["south"] <= lat <= DUBAI_BOUNDS["north"] and DUBAI_BOUNDS["west"] <= lon <= DUBAI_BOUNDS["east"]

# Function to fetch coordinates for a given master project
def fetch_coordinates(project):
    try:
        location = geolocator.geocode(f"{project}, Dubai")
        if location and is_within_dubai(location.latitude, location.longitude):
            print(f"Found coordinates for '{project}': ({location.latitude}, {location.longitude})")
            return location.latitude, location.longitude
        else:
            print(f"Could not find valid coordinates for '{project}'.")
            return None, None
    except GeocoderTimedOut:
        print(f"Timeout error while fetching coordinates for '{project}'. Retrying...")
        return fetch_coordinates(project)
    except Exception as e:
        print(f"Error fetching coordinates for '{project}': {e}")
        return None, None

# Process missing master projects
for project in missing_projects:
    lat, lon = fetch_coordinates(project)
    # Append the result to the DataFrame
    master_project_coords = pd.concat(
        [master_project_coords, pd.DataFrame({'master_project': [project], 'latitude': [lat], 'longitude': [lon]})],
        ignore_index=True
    )
    time.sleep(1)  # Respect API rate limits

# Save updated coordinates to cache
master_project_coords.to_csv(cache_file, index=False)
print(f"Updated master project coordinates saved to '{cache_file}'.")


Missing master projects to process: ['Palace Beach Residence', 'Jebel Ali Village Townhouses- Phase 1', 'Hills Park', ' Mohammed Bin Rashid Al Maktoum City , District One Phase III  , Residences 22', 'Remraam - Al Ramth', 'JANNAT', 'Elysee Heights']
Could not find valid coordinates for 'Palace Beach Residence'.
Could not find valid coordinates for 'Jebel Ali Village Townhouses- Phase 1'.
Found coordinates for 'Hills Park': (25.0438747, 55.157239087258546)


  master_project_coords = pd.concat(


Could not find valid coordinates for ' Mohammed Bin Rashid Al Maktoum City , District One Phase III  , Residences 22'.
Could not find valid coordinates for 'Remraam - Al Ramth'.
Could not find valid coordinates for 'JANNAT'.
Could not find valid coordinates for 'Elysee Heights'.
Updated master project coordinates saved to 'master_project_coordinates.csv'.
