In [1]:
import pandas as pd


In [2]:
# Load the datasets
original_sales_data = pd.read_csv('../data/snp_dld_2024_transactions.csv', low_memory=False)
original_rentals_data = pd.read_csv('../data/snp_dld_2024_rents.csv', low_memory=False)
# Load the processed dfs
sales_data = pd.read_csv('../data/sales_data_no_missing.csv', low_memory=False)
rentals_data = pd.read_csv('../data/rentals_data_no_missing.csv', low_memory=False)

In [3]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162806 entries, 0 to 162805
Data columns (total 31 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   transaction_type_id             162806 non-null  int64  
 1   registration_type_en            162806 non-null  object 
 2   property_usage_id               162806 non-null  int64  
 3   amount                          162806 non-null  float64
 4   total_buyer                     162806 non-null  int64  
 5   total_seller                    162806 non-null  int64  
 6   transaction_size_sqm            162806 non-null  float64
 7   property_size_sqm               162806 non-null  float64
 8   is_offplan                      162806 non-null  object 
 9   is_freehold                     162806 non-null  object 
 10  property_type_en                162806 non-null  object 
 11  property_subtype_en             162806 non-null  object 
 12  project_name_en 

# Utilizing location info to infer tempotal features. 

In [4]:
sales_data['area_en'].isnull().sum()

0

In [5]:
len(sales_data['area_en'].unique())

273

In [6]:
unique_area_count = sales_data['area_en'].nunique()
print("Number of unique values in 'area_en':", unique_area_count)


Number of unique values in 'area_en': 273


In [7]:
import time
import pandas as pd
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="geoapi")
geocoded_areas = {}

def fetch_lat_long_cached(area):
    if area in geocoded_areas:
        return geocoded_areas[area]
    try:
        location = geolocator.geocode(area)
        if location:
            geocoded_areas[area] = (location.latitude, location.longitude)
        else:
            geocoded_areas[area] = (None, None)
    except:
        geocoded_areas[area] = (None, None)
    return geocoded_areas[area]

# Apply geocoding with caching
sales_data['latitude'], sales_data['longitude'] = zip(
    *sales_data['area_en'].apply(fetch_lat_long_cached)
)

# Save cached results
pd.DataFrame.from_dict(geocoded_areas, orient='index', columns=['latitude', 'longitude']).to_csv("geocoded_areas.csv")

# Delay for API rate limits
time.sleep(1)


In [7]:
import os
import time
import pandas as pd
from geopy.geocoders import Nominatim
from concurrent.futures import ThreadPoolExecutor

# Initialize geolocator and cache
geolocator = Nominatim(user_agent="geoapi")
geocoded_areas = {}

# Load cached geocoded data if available
cache_file = "geocoded_areas.csv"
if os.path.exists(cache_file):
    print("Loading cached geocoded areas...")
    geocoded_areas_df = pd.read_csv(cache_file, index_col=0)
    geocoded_areas = {
        area: (row['latitude'], row['longitude'])
        for area, row in geocoded_areas_df.iterrows()
    }
else:
    print("No cache found. Starting fresh.")

# Function to fetch coordinates with caching
def fetch_lat_long_cached(area_en, area_ar):
    if area_en in geocoded_areas:
        return geocoded_areas[area_en]
    if area_ar in geocoded_areas:
        return geocoded_areas[area_ar]

    try:
        location = geolocator.geocode(area_en) or geolocator.geocode(area_ar)
        if location:
            geocoded_areas[area_en] = (location.latitude, location.longitude)
            return location.latitude, location.longitude
    except Exception as e:
        print(f"Error fetching {area_en}/{area_ar}: {e}")
    
    geocoded_areas[area_en] = (None, None)
    return None, None

# Function to apply geocoding in parallel
def process_row(row):
    return fetch_lat_long_cached(row['area_en'], row['area_ar'])


Loading cached geocoded areas...


In [8]:

sales_data['area_ar'] = original_sales_data['area_ar']
# Perform geocoding with threading
print("Starting geocoding...")
start_time = time.time()
with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(
        process_row,
        [row for _, row in sales_data[['area_en', 'area_ar']].iterrows()]
    ))
end_time = time.time()
print(f"Geocoding completed in {end_time - start_time:.2f} seconds.")

# Update DataFrame with results
sales_data['latitude'], sales_data['longitude'] = zip(*results)

# Save updated geocoded data to cache
geocoded_areas_df = pd.DataFrame.from_dict(geocoded_areas, orient='index', columns=['latitude', 'longitude'])
geocoded_areas_df.to_csv(cache_file)
print(f"Cached geocoded areas saved to {cache_file}")
sales_data['latitude'], sales_data['longitude'] = zip(
    *sales_data.apply(lambda row: fetch_lat_long_cached(row['area_en'], row['area_ar']), axis=1)
)

# Save geocoded results
geocoded_df = pd.DataFrame.from_dict(geocoded_areas, orient='index')
geocoded_df.to_csv("fallback_geocoded_areas.csv")

Starting geocoding...
Geocoding completed in 23.19 seconds.
Cached geocoded areas saved to geocoded_areas.csv


In [20]:
# Convert geocoded_areas to a DataFrame with proper indexing
geocoded_df = pd.DataFrame.from_dict(geocoded_areas, orient='index', columns=['latitude', 'longitude'])

# Reset index to use area names properly
geocoded_df.index.name = 'area_en'
geocoded_df.reset_index(inplace=True)

# Check the updated DataFrame
print(geocoded_df.head())


                             area_en   latitude  longitude
0            JUMEIRAH VILLAGE CIRCLE  25.052752  55.205766
1                      SILICON OASIS  12.868538  77.660113
2                DUBAI MARITIME CITY  25.254858  55.275662
3  Hadaeq Sheikh Mohammed Bin Rashid  25.119017  55.261877
4          JUMEIRAH VILLAGE TRIANGLE  25.049843  55.190701


In [27]:
geocoded_df

Unnamed: 0,area_en,latitude,longitude
0,JUMEIRAH VILLAGE CIRCLE,25.052752,55.205766
1,SILICON OASIS,12.868538,77.660113
2,DUBAI MARITIME CITY,25.254858,55.275662
3,Hadaeq Sheikh Mohammed Bin Rashid,25.119017,55.261877
4,JUMEIRAH VILLAGE TRIANGLE,25.049843,55.190701
...,...,...,...
268,Mena Jabal Ali,,
269,Saih Shuaib 1,24.902423,55.005968
270,Al Yelayiss 1,,
271,Nazwah,25.018781,55.645711


In [28]:
# Identify rows where latitude or longitude is None
missing_coords = geocoded_df[geocoded_df[['latitude', 'longitude']].isnull().any(axis=1)]

# Display areas with missing coordinates
missing_coords[['area_en', 'latitude', 'longitude']]


Unnamed: 0,area_en,latitude,longitude
10,Madinat Dubai Almelaheyah,,
14,TECOM SITE A,,
20,ARABIAN RANCHES III,,
23,Al Merkadh,,
24,SOBHA HEARTLAND,,
...,...,...,...
263,Al Warsan Third,,
264,Muragab,,
266,Al Qusais Industrial Fifth,,
268,Mena Jabal Ali,,


In [29]:
# Identify rows where both latitude and longitude are not null
non_missing_coords = geocoded_df[geocoded_df[['latitude', 'longitude']].notnull().all(axis=1)]

# Display areas with non-missing coordinates
print(non_missing_coords[['area_en', 'latitude', 'longitude']])



                               area_en   latitude  longitude
0              JUMEIRAH VILLAGE CIRCLE  25.052752  55.205766
1                        SILICON OASIS  12.868538  77.660113
2                  DUBAI MARITIME CITY  25.254858  55.275662
3    Hadaeq Sheikh Mohammed Bin Rashid  25.119017  55.261877
4            JUMEIRAH VILLAGE TRIANGLE  25.049843  55.190701
..                                 ...        ...        ...
265               DUBAI LIFESTYLE CITY  25.042603  55.282163
267                          Al Buteen  25.259523  55.319510
269                      Saih Shuaib 1  24.902423  55.005968
271                             Nazwah  25.018781  55.645711
272                           Mugatrah  24.792346  55.195184

[170 rows x 3 columns]


In [13]:
geocoded_df = pd.DataFrame.from_dict(
    geocoded_areas, orient="index", columns=["latitude", "longitude"]
)


In [14]:
missing_values = geocoded_df.isnull().sum()
print("Missing values in geocoded_df:")
print(missing_values)


Missing values in geocoded_df:
latitude     103
longitude    103
dtype: int64


In [15]:
geocoded_df

Unnamed: 0,latitude,longitude
JUMEIRAH VILLAGE CIRCLE,25.052752,55.205766
SILICON OASIS,12.868538,77.660113
DUBAI MARITIME CITY,25.254858,55.275662
Hadaeq Sheikh Mohammed Bin Rashid,25.119017,55.261877
JUMEIRAH VILLAGE TRIANGLE,25.049843,55.190701
...,...,...
Mena Jabal Ali,,
Saih Shuaib 1,24.902423,55.005968
Al Yelayiss 1,,
Nazwah,25.018781,55.645711


In [16]:
# Rename the columns to 'latitude' and 'longitude'
geocoded_df.columns = ['latitude', 'longitude']

# Verify the changes
print(geocoded_df.head())
print(geocoded_df.isnull().sum())


                                    latitude  longitude
JUMEIRAH VILLAGE CIRCLE            25.052752  55.205766
SILICON OASIS                      12.868538  77.660113
DUBAI MARITIME CITY                25.254858  55.275662
Hadaeq Sheikh Mohammed Bin Rashid  25.119017  55.261877
JUMEIRAH VILLAGE TRIANGLE          25.049843  55.190701
latitude     103
longitude    103
dtype: int64


In [18]:
geocoded_df

<class 'pandas.core.frame.DataFrame'>
Index: 273 entries, JUMEIRAH VILLAGE CIRCLE to Mugatrah
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   latitude   273 non-null    bool 
 1   longitude  273 non-null    bool 
dtypes: bool(2)
memory usage: 2.7+ KB


In [19]:
# Number of unique area_en in the sales_data
unique_area_en_in_sales = sales_data['area_en'].nunique()

# Number of geocoded areas in the geocoded_areas dictionary
unique_geocoded_areas = len(geocoded_areas)

print(f"Number of unique areas in sales_data: {unique_area_en_in_sales}")
print(f"Number of successfully geocoded areas: {unique_geocoded_areas}")


Number of unique areas in sales_data: 273
Number of successfully geocoded areas: 273


In [30]:
import os
import pandas as pd
from geopy.geocoders import Nominatim
import time

# Initialize geolocator and load cache if available
geolocator = Nominatim(user_agent="geoapi")
geocoded_projects = {}

# Load cached projects if the file exists
project_cache_file = "geocoded_projects.csv"
if os.path.exists(project_cache_file):
    geocoded_projects = pd.read_csv(project_cache_file).set_index('project_name_en').T.to_dict(orient="list")

# Function to fetch latitude and longitude with caching
def fetch_lat_long_cached(entity, cache):
    if entity in cache:
        return cache[entity]
    try:
        location = geolocator.geocode(entity)
        if location:
            cache[entity] = (location.latitude, location.longitude)
        else:
            cache[entity] = (None, None)
    except:
        cache[entity] = (None, None)
    return cache[entity]

# Fill missing latitude and longitude
def fill_missing_coords(df, area_column, project_column):
    global geocoded_projects  # To allow caching
    missing_coords = df[df['latitude'].isnull() & df['longitude'].isnull()]
    
    for index, row in missing_coords.iterrows():
        area = row[area_column]
        project = row[project_column]
        
        # First, try using the area
        lat, lon = fetch_lat_long_cached(area, geocoded_projects)
        if lat is None or lon is None:
            # If area fails, fallback to project_name_en
            lat, lon = fetch_lat_long_cached(project, geocoded_projects)
        
        # Update DataFrame if coordinates were found
        if lat is not None and lon is not None:
            df.at[index, 'latitude'] = lat
            df.at[index, 'longitude'] = lon
    
    return df

# Apply the function to your DataFrame
sales_data = fill_missing_coords(sales_data, 'area_en', 'project_name_en')

# Save updated project geocodes to the cache file
project_df = pd.DataFrame.from_dict(geocoded_projects, orient='index', columns=['latitude', 'longitude'])
project_df.index.name = 'project_name_en'
project_df.reset_index().to_csv(project_cache_file, index=False)

print("Missing coordinates handled using project_name_en, and cache updated.")


Missing coordinates handled using project_name_en, and cache updated.


In [31]:
# Check for missing latitude and longitude
missing_lat_lon = sales_data[['latitude', 'longitude']].isnull().sum()

# Display the count of missing values
print("Missing Latitude and Longitude Counts:")
print(missing_lat_lon)

# Optionally, display the rows where either latitude or longitude is missing
missing_rows = sales_data[sales_data['latitude'].isnull() | sales_data['longitude'].isnull()]
print(f"Total rows with missing latitude or longitude: {len(missing_rows)}")

# If you want to see a sample of these rows
print(missing_rows.head())

Missing Latitude and Longitude Counts:
latitude     11390
longitude    11390
dtype: int64
Total rows with missing latitude or longitude: 11390
    transaction_type_id registration_type_en  property_usage_id     amount  \
47                    1             Off-Plan                  1  5339000.0   
48                    1             Off-Plan                  1  2995000.0   
49                    1             Off-Plan                  1  2711000.0   
50                    1             Off-Plan                  1  4346000.0   
75                    1             Off-Plan                  1  2582888.0   

    total_buyer  total_seller  transaction_size_sqm  property_size_sqm  \
47            1             1                165.35             165.35   
48            1             1                 87.91              87.91   
49            1             1                 74.13              74.13   
50            1             1                133.82             133.82   
75            2   

In [33]:
# Check for rows where latitude or longitude is still missing
missing_lat_lon_after_inference = geocoded_df[geocoded_df['latitude'].isnull() | geocoded_df['longitude'].isnull()]

# Get the count of unique area_en with missing coordinates
unique_missing_area_count = missing_lat_lon_after_inference['area_en'].nunique()

# Print the results
print(f"Number of area_en with missing latitude and longitude after inference: {unique_missing_area_count}")

# Optionally, display these area_en values
print("Areas with missing latitude and longitude:")
print(missing_lat_lon_after_inference['area_en'].unique())


Number of area_en with missing latitude and longitude after inference: 103
Areas with missing latitude and longitude:
['Madinat Dubai Almelaheyah' 'TECOM SITE A' 'ARABIAN RANCHES III'
 'Al Merkadh' 'SOBHA HEARTLAND' 'Um Suqaim Third' 'Al Hebiah Fourth'
 'Al Aweer First' 'Jabal Ali First' 'Nad Al Shiba First' 'Al Yelayiss 2'
 'Al Hebiah Fifth' 'DOWN TOWN JABAL ALI' 'Al Hebiah Sixth' 'Warsan First'
 'Jumeirah Third' 'Dubai Investment Park First' 'EMIRATE LIVING'
 'Al Mizhar First' 'Al Warqa First' 'JABEL ALI HILLS' 'Al Mizhar Second'
 'Zaabeel Second' 'Al Barsha Third' 'Al Goze First' 'Al Khawaneej First'
 'Muhaisanah First' 'Al Thanyah Fifth' 'Zaabeel First'
 'Nad Al Shiba Third' 'SAMA AL JADAF' 'Jumeirah Second' 'Eyal Nasser'
 'Al Barshaa South Second' 'LIWAN 2' 'Um Suqaim First'
 'POLO TOWNHOUSES IGO' 'Al Warqa Fourth' 'DUBAI WATER FRONT'
 'Al Saffa Second' 'Al Barshaa South Third' 'Nad Al Shiba Fourth'
 "Me'Aisem First" 'Al Mamzer' 'Al Barsha Second' 'Al Twar Fourth'
 'Um Suqaim Seco

In [34]:
# Group project_name_en by area_en
project_area_groups = sales_data.groupby('area_en')['project_name_en'].apply(list).reset_index()

# Rename columns for clarity
project_area_groups.columns = ['area_en', 'project_names']

# Display the grouped data
print(project_area_groups.head())

# Save to a file if needed
project_area_groups.to_csv("project_name_by_area_en.csv", index=False)


            area_en                                      project_names
0         AL Athbah  [Unknown, Unknown, Unknown, Unknown, Unknown, ...
1         AL BARARI  [Unknown, SEVENTH HEAVEN, Unknown, SEVENTH HEA...
2         AL FURJAN  [EQUITI ARCADE, Prime Residency 3, AVENUE RESI...
3  AL KHAIL HEIGHTS  [AL KHAIL HEIGHTS, AL KHAIL HEIGHTS, AL KHAIL ...
4           AL WAHA  [Unknown, Unknown, Unknown, Unknown, Unknown, ...


In [35]:
# Filter out rows where area_en or project_name_en is 'Unknown'
filtered_sales_data = sales_data[
    (sales_data['area_en'] != "Unknown") & (sales_data['project_name_en'] != "Unknown")
]

# Group project_name_en by area_en
project_area_groups = filtered_sales_data.groupby('area_en')['project_name_en'].apply(list).reset_index()

# Rename columns for clarity
project_area_groups.columns = ['area_en', 'project_names']

# Display the grouped data
print(project_area_groups.head())

# Save to a file if needed
project_area_groups.to_csv("project_name_by_area_en_filtered.csv", index=False)


             area_en                                      project_names
0          AL BARARI  [SEVENTH HEAVEN, SEVENTH HEAVEN, Seventh Heave...
1          AL FURJAN  [EQUITI ARCADE, Prime Residency 3, AVENUE RESI...
2   AL KHAIL HEIGHTS  [AL KHAIL HEIGHTS, AL KHAIL HEIGHTS, AL KHAIL ...
3            AL WAHA  [SIENNA  LAKES, SIENNA  LAKES, SIENNA  LAKES, ...
4  ARABIAN RANCHES I  [ASEEL, ASEEL, ASEEL, ALMA-2, ASEEL, ASEEL, AS...


In [36]:
# Identify area_en with missing latitude and longitude
missing_coords_areas = geocoded_df[geocoded_df['latitude'].isnull() & geocoded_df['longitude'].isnull()]['area_en']

# Filter project_area_groups for area_en with missing coordinates
missing_coords_projects = project_area_groups[project_area_groups['area_en'].isin(missing_coords_areas)]

# Display the filtered project names
print(missing_coords_projects)

                         area_en  \
6            ARABIAN RANCHES III   
9         Al Barshaa South Third   
10               Al Hebiah Fifth   
11              Al Hebiah Fourth   
12               Al Hebiah Sixth   
13               Al Hebiah Third   
15              Al Khairan First   
17                    Al Merkadh   
18   Al Qusais Industrial Fourth   
20              Al Thanyah Fifth   
21              Al Thanyah Third   
23                 Al Yelayiss 2   
35                      DMCC-EZ2   
36           DOWN TOWN JABAL ALI   
44   DUBAI INVESTMENT PARK FIRST   
55             DUBAI WATER FRONT   
56   Dubai Investment Park First   
59                EMIRATE LIVING   
79               Jabal Ali First   
81               Jumeirah Second   
85                       LIWAN 2   
98     Madinat Dubai Almelaheyah   
101               Me'Aisem First   
102              Me'Aisem Second   
105           Nad Al Shiba First   
110          POLO TOWNHOUSES IGO   
119              SOBHA HEART

In [39]:
missing_coords_projects.isnull().sum()

area_en          0
project_names    0
dtype: int64

In [40]:
missing_coords_projects

Unnamed: 0,area_en,project_names
6,ARABIAN RANCHES III,"[Arabian Ranches lll - Ruba, Arabian Ranches l..."
9,Al Barshaa South Third,"[ORION BUILDING, ORION BUILDING, ORION BUILDIN..."
10,Al Hebiah Fifth,"[DAMAC LAGOONS - MALTA (1), DAMAC LAGOONS - SA..."
11,Al Hebiah Fourth,"[Alicante villas, Alicante villas, Aura, Alica..."
12,Al Hebiah Sixth,"[Mudon Al Ranim 8, Mudon Al Ranim 8, Mudon Al ..."
13,Al Hebiah Third,"[DAMAC HILLS-BEL AIR, DAMAC HILLS - CAVALLI ES..."
15,Al Khairan First,"[Oria, Oria, Oria, Oria, Oria, Oria, Oria, Ori..."
17,Al Merkadh,"[Sobha Creek Vistas Heights, Sobha Creek Vista..."
18,Al Qusais Industrial Fourth,"[RASHA 2, RASHA 2]"
20,Al Thanyah Fifth,"[MERCER HOUSE, MERCER HOUSE, MERCER HOUSE, MER..."


In [41]:
from geopy.geocoders import Nominatim
import pandas as pd
import time

geolocator = Nominatim(user_agent="geoapi")
geocoded_projects = {}  # Cache for geocoded projects

def fetch_lat_long(address):
    """Fetch latitude and longitude for an address."""
    if address in geocoded_projects:
        return geocoded_projects[address]
    try:
        location = geolocator.geocode(address)
        if location:
            geocoded_projects[address] = (location.latitude, location.longitude)
            return location.latitude, location.longitude
    except Exception as e:
        pass
    geocoded_projects[address] = (None, None)
    return None, None

def infer_coords_from_projects(missing_coords_projects, geocoded_df):
    """Infer latitude and longitude for area_en using project_names."""
    updated_geocoded_df = geocoded_df.copy()
    for index, row in missing_coords_projects.iterrows():
        area = row['area_en']
        for project in row['project_names']:
            lat, lon = fetch_lat_long(project)
            if lat is not None and lon is not None:
                print(f"Found coordinates for {area} using project {project}: ({lat}, {lon})")
                updated_geocoded_df.loc[updated_geocoded_df['area_en'] == area, ['latitude', 'longitude']] = lat, lon
                break
        time.sleep(1)  # Avoid overloading the API
    return updated_geocoded_df

# Perform inference
updated_geocoded_df = infer_coords_from_projects(missing_coords_projects, geocoded_df)

# Save updated geocoded DataFrame
updated_geocoded_df.to_csv("updated_geocoded_areas.csv", index=False)

# Display remaining missing coordinates
remaining_missing = updated_geocoded_df[updated_geocoded_df['latitude'].isnull() & updated_geocoded_df['longitude'].isnull()]
print(f"Number of area_en with missing coordinates after inference: {len(remaining_missing)}")


Found coordinates for Al Barshaa South Third using project ORION BUILDING: (54.57111155, -1.2340866846418432)
Found coordinates for Al Hebiah Fourth using project Alicante villas: (-37.006392950000006, 174.89277654819733)
Found coordinates for Al Khairan First using project Oria: (37.4841014, -2.2924853)
Found coordinates for Al Qusais Industrial Fourth using project RASHA 2: (47.2093548, 39.73828236270582)
Found coordinates for Al Thanyah Fifth using project MERCER HOUSE: (53.3386883, -6.264935258768096)
Found coordinates for Al Thanyah Third using project The Lakes Ghadeer: (25.082474675775316, 55.17228870982094)
Found coordinates for Al Yelayiss 2 using project THE MAYFAIR : (1.34563295, 103.73337090343747)
Found coordinates for DOWN TOWN JABAL ALI using project Alexis Tower: (51.5181295, -0.078028)
Found coordinates for DUBAI INVESTMENT PARK FIRST using project Olivia Residences: (48.452858500000005, -123.36217172321923)
Found coordinates for DUBAI WATER FRONT using project BADRA P

In [42]:
from geopy.geocoders import Nominatim
import pandas as pd
import time

geolocator = Nominatim(user_agent="geoapi")
geocoded_landmarks = {}  # Cache for geocoded landmarks

def fetch_lat_long_landmark(address):
    """Fetch latitude and longitude for a landmark."""
    if address in geocoded_landmarks:
        return geocoded_landmarks[address]
    try:
        location = geolocator.geocode(address)
        if location:
            geocoded_landmarks[address] = (location.latitude, location.longitude)
            return location.latitude, location.longitude
    except Exception as e:
        pass
    geocoded_landmarks[address] = (None, None)
    return None, None

def infer_coords_from_landmarks(missing_coords_landmarks, geocoded_df):
    """Infer latitude and longitude for area_en using nearest_land_mark_en."""
    updated_geocoded_df = geocoded_df.copy()
    for index, row in missing_coords_landmarks.iterrows():
        area = row['area_en']
        for landmark in row['nearest_landmarks']:
            lat, lon = fetch_lat_long_landmark(landmark)
            if lat is not None and lon is not None:
                print(f"Found coordinates for {area} using landmark {landmark}: ({lat}, {lon})")
                updated_geocoded_df.loc[updated_geocoded_df['area_en'] == area, ['latitude', 'longitude']] = lat, lon
                break
        time.sleep(1)  # Avoid overloading the API
    return updated_geocoded_df

# Group nearest_land_mark_en by area_en for missing areas
missing_coords_landmarks = remaining_missing.merge(
    sales_data[['area_en', 'nearest_landmark_en']].drop_duplicates(),
    on='area_en',
)
missing_coords_landmarks = missing_coords_landmarks.groupby('area_en')['nearest_landmark_en'].apply(list).reset_index(name='nearest_landmarks')

# Perform inference
updated_geocoded_df_landmarks = infer_coords_from_landmarks(missing_coords_landmarks, updated_geocoded_df)

# Save updated geocoded DataFrame
updated_geocoded_df_landmarks.to_csv("final_geocoded_areas.csv", index=False)

# Display remaining missing coordinates
remaining_missing_landmarks = updated_geocoded_df_landmarks[
    updated_geocoded_df_landmarks['latitude'].isnull() & updated_geocoded_df_landmarks['longitude'].isnull()
]
print(f"Number of area_en with missing coordinates after inferring from landmarks: {len(remaining_missing_landmarks)}")


Found coordinates for ARABIAN RANCHES III using landmark Hamdan Sports Complex: (25.051092949999997, 55.31808411322436)
Found coordinates for Al Aweer First using landmark Unknown: (26.49253305, 92.33087891709363)
Found coordinates for Al Aweer Second using landmark Unknown: (26.49253305, 92.33087891709363)
Found coordinates for Al Barsha Second using landmark Burj Al Arab: (25.141327099999998, 55.18539672753495)
Found coordinates for Al Barsha Third using landmark Burj Al Arab: (25.141327099999998, 55.18539672753495)
Found coordinates for Al Barshaa South First using landmark Motor City: (38.733792, -120.7404881)
Found coordinates for Al Barshaa South Second using landmark Motor City: (38.733792, -120.7404881)
Found coordinates for Al Goze First using landmark Downtown Dubai: (25.19475815, 55.27805910442858)
Found coordinates for Al Goze Fourth using landmark Downtown Dubai: (25.19475815, 55.27805910442858)
Found coordinates for Al Goze Industrial First using landmark Burj Al Arab: (2

# Reset the Values of Unkown

In [43]:
# Step 1: Reset invalid coordinates derived from 'Unknown'
def reset_invalid_coordinates(df):
    invalid_coords = (df['latitude'] == 26.49253305) & (df['longitude'] == 92.33087891709363)
    df.loc[invalid_coords, ['latitude', 'longitude']] = None
    return df

updated_geocoded_df_landmarks = reset_invalid_coordinates(updated_geocoded_df_landmarks)

In [45]:
from geopy.geocoders import Nominatim
import pandas as pd
import time

# Geocoder
geolocator = Nominatim(user_agent="geoapi")
geocoded_landmarks = {}  # Cache for geocoded landmarks

def fetch_lat_long(address):
    """Fetch latitude and longitude for a given address."""
    if address in geocoded_landmarks:
        return geocoded_landmarks[address]
    try:
        location = geolocator.geocode(address)
        if location:
            geocoded_landmarks[address] = (location.latitude, location.longitude)
            return location.latitude, location.longitude
    except Exception as e:
        pass
    geocoded_landmarks[address] = (None, None)
    return None, None

def infer_coords_from_landmarks(missing_coords_landmarks, geocoded_df):
    """Infer latitude and longitude for area_en using nearest_landmark_en."""
    updated_geocoded_df = geocoded_df.copy()
    for index, row in missing_coords_landmarks.iterrows():
        area = row['area_en']
        for landmark in row['landmarks']:
            # Skip 'Unknown' landmarks
            if landmark == "Unknown":
                continue
            lat, lon = fetch_lat_long(landmark)
            if lat is not None and lon is not None:
                print(f"Found coordinates for {area} using landmark {landmark}: ({lat}, {lon})")
                updated_geocoded_df.loc[updated_geocoded_df['area_en'] == area, ['latitude', 'longitude']] = lat, lon
                break  # Move to the next area_en after finding coordinates
        time.sleep(1)  # Avoid overloading the API
    return updated_geocoded_df

# Step 1: Group landmarks by area_en
missing_coords_landmarks = remaining_missing_landmarks.merge(
    sales_data[['area_en', 'nearest_landmark_en']].drop_duplicates(),
    on='area_en',
)
missing_coords_landmarks = missing_coords_landmarks.groupby('area_en')['nearest_landmark_en'].apply(list).reset_index(name='landmarks')

# Step 2: Infer coordinates for areas with missing values
updated_geocoded_df_landmarks = infer_coords_from_landmarks(missing_coords_landmarks, updated_geocoded_df_landmarks)

# Step 3: Save the updated geocoded DataFrame
updated_geocoded_df_landmarks.to_csv("updated_geocoded_areas_landmarks.csv", index=False)

# Display remaining missing coordinates
remaining_missing_landmarks = updated_geocoded_df_landmarks[
    updated_geocoded_df_landmarks['latitude'].isnull() & updated_geocoded_df_landmarks['longitude'].isnull()
]
print(f"Number of area_en with missing coordinates after inferring from landmarks: {len(remaining_missing_landmarks)}")


Number of area_en with missing coordinates after inferring from landmarks: 34


In [46]:
remaining_missing_landmarks

Unnamed: 0,area_en,latitude,longitude
23,Al Merkadh,,
41,Al Aweer First,,
57,Al Hebiah Fifth,,
62,Al Hebiah Sixth,,
88,EMIRATE LIVING,,
117,Al Khawaneej First,,
131,Nad Al Shiba Third,,
147,Al Warqa Fourth,,
158,Nad Al Shiba Fourth,,
176,Al Rowaiyah Third,,


In [51]:
updated_geocoded_df_landmarks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273 entries, 0 to 272
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   area_en    273 non-null    object 
 1   latitude   239 non-null    float64
 2   longitude  239 non-null    float64
dtypes: float64(2), object(1)
memory usage: 6.5+ KB
