In [1]:
import pandas as pd

In [2]:
# Load the datasets
original_sales_data = pd.read_csv('../data/snp_dld_2024_transactions.csv', low_memory=False)
original_rentals_data = pd.read_csv('../data/snp_dld_2024_rents.csv', low_memory=False)
# Load the processed dfs
sales_data = pd.read_csv('../data/sales_data_no_missing.csv', low_memory=False)
rentals_data = pd.read_csv('../data/rentals_data_no_missing.csv', low_memory=False)

In [3]:
original_sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162806 entries, 0 to 162805
Data columns (total 46 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   transaction_number      162806 non-null  object 
 1   transaction_datetime    162806 non-null  object 
 2   transaction_type_en     162806 non-null  object 
 3   transaction_type_id     162806 non-null  int64  
 4   transaction_subtype_en  162806 non-null  object 
 5   transaction_subtype_id  162806 non-null  int64  
 6   registration_type_en    162806 non-null  object 
 7   is_freehold_text        162806 non-null  object 
 8   property_usage_en       162806 non-null  object 
 9   property_usage_id       162806 non-null  int64  
 10  amount                  162806 non-null  float64
 11  total_buyer             162806 non-null  int64  
 12  total_seller            162806 non-null  int64  
 13  transaction_size_sqm    162146 non-null  float64
 14  property_size_sqm   

In [4]:
location_columns_sales = [
    "project_name_en",
    "project_name_ar",
    "area_en",
    "area_ar",
    "area_id",
    "nearest_landmark_en",
    "nearest_landmark_ar",
    "nearest_metro_en",
    "nearest_metro_ar",
    "nearest_mall_en",
    "nearest_mall_ar",
    "master_project_en",
    "master_project_ar"
]


In [8]:
location_data_sales = original_sales_data[location_columns_sales]

In [19]:
aarea_group = location_data_sales.groupby('area_en').agg({
    'project_name_en': 'count',        # Non-null project names
    'nearest_landmark_en': 'count',   # Non-null nearest landmarks
    "nearest_metro_en": 'count',      # Non-null nearest metro entries
    "nearest_mall_en": 'count',       # Non-null nearest mall entries
}).reset_index()

# Rename columns for clarity
area_group.rename(columns={
    'project_name_en': 'project_count',
    'nearest_landmark_en': 'landmark_count',
    'nearest_metro_en': 'metro_count',
    'nearest_mall_en': 'mall_count'
}, inplace=True)



In [21]:
area_group.tail()

Unnamed: 0,area_en,project_count,landmark_count,metro_count,mall_count
268,Wadi Al Safa 7,1311,304,0,0
269,Warsan First,68,48,72,72
270,Warsan Fourth,0,0,0,0
271,Zaabeel First,1795,1729,1729,1729
272,Zaabeel Second,398,398,398,398


In [22]:
from geopy.geocoders import Nominatim

# Initialize the geolocator
geolocator = Nominatim(user_agent="geoapi")

# Arabic address or place name
location = geolocator.geocode("دبي")

In [23]:
location

Location(دبي, الإمارات العربية المتحدة, (25.2653471, 55.2924914, 0.0))

In [25]:
df = location_data_sales

In [27]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import time
import logging
import os
from tqdm import tqdm  # For progress bar

# Initialize logging
logging.basicConfig(filename="geocoding_errors.log", level=logging.ERROR)

# Initialize the geolocator with a higher timeout
geolocator = Nominatim(user_agent="geoapi", timeout=10)

# Define Dubai's approximate coordinates
dubai_coordinates = (25.276987, 55.296249)

# Cache to avoid redundant requests
coordinate_cache = {}

# Function to check if coordinates are within Dubai
def is_within_dubai(lat, lon, dubai_center=dubai_coordinates, max_distance_km=50):
    """Check if a location is within a reasonable distance of Dubai."""
    if lat is None or lon is None:
        return False
    distance = geodesic((lat, lon), dubai_center).km
    return distance <= max_distance_km

# Function to get coordinates with caching, retries, and rate limiting
def get_coordinates(location):
    """Fetch coordinates for a given location."""
    if location in coordinate_cache:
        return coordinate_cache[location]  # Return from cache if available

    try:
        time.sleep(1)  # Respect rate limit (1 request per second)
        loc = geolocator.geocode(location)
        if loc:
            lat, lon = loc.latitude, loc.longitude
            coordinate_cache[location] = (lat, lon)  # Save to cache
            return lat, lon
    except Exception as e:
        logging.error(f"Error fetching coordinates for {location}: {e}")
    return None, None

# Function to fetch coordinates for an area with fallback logic
def fetch_coordinates_for_area(row):
    """Fetch coordinates for an area using multiple fallback fields."""
    fields = [
        'area_en', 'area_ar', 'project_name_en', 'project_name_ar',
        'nearest_landmark_en', 'nearest_landmark_ar', 
        'nearest_metro_en', 'nearest_metro_ar', 
        'nearest_mall_en', 'nearest_mall_ar', 
        'master_project_en', 'master_project_ar'
    ]
    
    for field in fields:
        location = row.get(field)
        if pd.notnull(location):  # Only process non-null locations
            lat, lon = get_coordinates(location)
            if is_within_dubai(lat, lon):  # Validate if coordinates are in Dubai
                return lat, lon

    # Return None if no valid coordinates found
    return None, None

# Check if a saved CSV exists for coordinates
coordinates_csv = "area_coordinates.csv"
if os.path.exists(coordinates_csv):
    # Load cached coordinates from CSV
    print(f"Loading cached coordinates from {coordinates_csv}...")
    unique_areas = pd.read_csv(coordinates_csv)
    coordinate_cache = {
        row['area_en']: (row['latitude'], row['longitude'])
        for _, row in unique_areas.iterrows()
    }
else:
    # Preprocess to get unique areas
    unique_areas = df.groupby('area_en').first().reset_index()

    # Apply geocoding to unique areas with a progress bar
    tqdm.pandas(desc="Geocoding Areas")  # Add description to progress bar
    unique_areas[['latitude', 'longitude']] = unique_areas.progress_apply(
        fetch_coordinates_for_area, axis=1, result_type='expand'
    )

    # Save unique area coordinates to CSV for future runs
    unique_areas.to_csv(coordinates_csv, index=False)
    print(f"Saved coordinates to {coordinates_csv}.")

# Merge the coordinates back to the original dataset
df = pd.merge(df, unique_areas[['area_en', 'latitude', 'longitude']], on='area_en', how='left')

# Save the full dataset with coordinates to a CSV
final_csv = "dataset_with_coordinates.csv"
df.to_csv(final_csv, index=False)
print(f"Saved the dataset with coordinates to {final_csv}.")



Geocoding Areas: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 273/273 [1:01:41<00:00, 13.56s/it]


Saved coordinates to area_coordinates.csv.
Saved the dataset with coordinates to dataset_with_coordinates.csv.


In [28]:
coordinate_cache

{}

In [29]:
# Check if each area_en has coordinates
unique_areas['has_coordinates'] = unique_areas[['latitude', 'longitude']].notnull().all(axis=1)

# Print areas without coordinates
missing_coordinates = unique_areas[~unique_areas['has_coordinates']]
if not missing_coordinates.empty:
    print(f"Areas without coordinates:\n{missing_coordinates[['area_en']]}")
else:
    print("All areas have coordinates!")

Areas without coordinates:
              area_en
0           AL Athbah
1           AL BARARI
2           AL FURJAN
3    AL KHAIL HEIGHTS
4             AL WAHA
..                ...
268    Wadi Al Safa 7
269      Warsan First
270     Warsan Fourth
271     Zaabeel First
272    Zaabeel Second

[273 rows x 1 columns]


In [30]:
missing_coordinates[['area_en']].isnull()

Unnamed: 0,area_en
0,False
1,False
2,False
3,False
4,False
...,...
268,False
269,False
270,False
271,False


In [31]:
unique_area = location_data_sales['area_en'].unique()

In [32]:
unique_areas

Unnamed: 0,area_en,project_name_en,project_name_ar,area_ar,area_id,nearest_landmark_en,nearest_landmark_ar,nearest_metro_en,nearest_metro_ar,nearest_mall_en,nearest_mall_ar,master_project_en,master_project_ar,latitude,longitude,has_coordinates
0,AL Athbah,,,العذبة,0,,,,,,,,,,,False
1,AL BARARI,SEVENTH HEAVEN,سفنث هفن,البراري,0,IMG World Adventures,آي إم جي وورلد أدفينتشرز,Noor Bank Metro Station,محطة مترو نور بنك,,,,,,,False
2,AL FURJAN,EQUITI ARCADE,ايكويتي اركيد,الفرجان,0,Expo 2020 Site,موقع إكسبو 2020,Ibn Battuta Metro Station,محطة مترو ابن بطوطة,Ibn-e-Battuta Mall,ابن بطوطة مول,Jebel Ali Village Townhouses- Phase 1,قرية جبل علي تاون هاوسز- المرحلة 1,,,False
3,AL KHAIL HEIGHTS,AL KHAIL HEIGHTS,الخيل هايتس,الخيل هايتس,0,Downtown Dubai,وسط مدينة دبي,Noor Bank Metro Station,محطة مترو نور بنك,Dubai Mall,مول دبي,,,,,False
4,AL WAHA,SIENNA LAKES,سينا ليكس,الواحة,0,Dubai Cycling Course,دورة دبي للدراجات,,,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,Wadi Al Safa 7,Rukan 3,روكان 3,وادي الصفا 7,0,Hamdan Sports Complex,مجمع حمدان الرياضي,,,,,,,,,False
269,Warsan First,Olivz Residence,اوليفز رزيدنس,ورسان الاولى,0,Dubai International Airport,مطار دبي الدولي,Rashidiya Metro Station,محطة مترو الراشدية,City Centre Mirdif,سيتي سنتر مردف,,,,,False
270,Warsan Fourth,,,ورسان الرابعة,0,,,,,,,,,,,False
271,Zaabeel First,One Za'abeel,ون زعبيل,زعبيل الاولى,0,Burj Khalifa,برج خليفة,Al Jafiliya Metro Station,محطة مترو الجافلية,Dubai Mall,مول دبي,,,,,False


In [33]:
import pandas as pd
import os
from geopy.geocoders import Nominatim
from concurrent.futures import ThreadPoolExecutor
import time

# Initialize geopy geolocator
geolocator = Nominatim(user_agent="geoapi")
geocoded_locations = {}  # Cache for geocoded locations

# Define Dubai's geographic coordinate range (expanded for tolerance)
DUBAI_LAT_RANGE = (24.0, 26.5)
DUBAI_LON_RANGE = (54.0, 56.0)

def is_within_dubai(lat, lon):
    """Check if coordinates are within Dubai."""
    return DUBAI_LAT_RANGE[0] <= lat <= DUBAI_LAT_RANGE[1] and DUBAI_LON_RANGE[0] <= lon <= DUBAI_LON_RANGE[1]

def fetch_lat_long(location_name):
    """Fetch latitude and longitude for a location."""
    if location_name in geocoded_locations:
        return geocoded_locations[location_name]
    try:
        location = geolocator.geocode(location_name)
        if location:
            geocoded_locations[location_name] = (location.latitude, location.longitude)
            return location.latitude, location.longitude
    except Exception as e:
        pass
    geocoded_locations[location_name] = (None, None)
    return None, None

def process_area(area_name):
    """Fetch coordinates for an area."""
    if area_name == "Unknown" or pd.isna(area_name):
        return area_name, None, None

    lat, lon = fetch_lat_long(area_name)
    if lat is not None and lon is not None and is_within_dubai(lat, lon):
        print(f"Found valid coordinates for {area_name}: ({lat}, {lon})")
        return area_name, lat, lon

    print(f"No valid coordinates found for {area_name}")
    return area_name, None, None

def geocode_areas(dataframe):
    """Geocode all areas using multithreading."""
    areas = dataframe['area_en'].unique()
    print(f"Number of unique areas to geocode: {len(areas)}")

    with ThreadPoolExecutor(max_workers=10) as executor:
        results = list(executor.map(process_area, areas))

    geocoded_df = pd.DataFrame(results, columns=['area_en', 'latitude', 'longitude'])
    return geocoded_df

# Main Logic
geocoded_file = "geocoded_area_new.csv"

if os.path.exists(geocoded_file):
    print(f"Loading existing geocoded file: {geocoded_file}")
    geocoded_df = pd.read_csv(geocoded_file)
else:
    # Assuming sales_data is your main dataset; update this if different
    sales_data = pd.read_csv("../data/snp_dld_2024_transactions.csv")  # Replace with the actual path to your dataset
    geocoded_df = geocode_areas(sales_data)

    # Save the geocoded areas
    geocoded_df.to_csv(geocoded_file, index=False)
    print(f"Geocoded data saved to {geocoded_file}")

# Remaining missing coordinates
remaining_missing = geocoded_df[geocoded_df['latitude'].isnull() & geocoded_df['longitude'].isnull()]
print(f"\nNumber of areas with missing coordinates: {len(remaining_missing)}")


Number of unique areas to geocode: 273
No valid coordinates found for SILICON OASIS
Found valid coordinates for DUBAI MARITIME CITY: (25.262151, 55.26537042587559)
Found valid coordinates for DUBAI CREEK HARBOUR: (25.197978, 55.36037960558315)
Found valid coordinates for JUMEIRAH VILLAGE CIRCLE: (25.0527521, 55.205766331712184)
Found valid coordinates for Al Wasl: (25.1959326, 55.2557371)
No valid coordinates found for MEYDAN ONE
No valid coordinates found for Wadi Al Safa 4
No valid coordinates found for SUFOUH GARDENS
No valid coordinates found for Madinat Al Mataar
No valid coordinates found for MAJAN
No valid coordinates found for TOWN SQUARE
No valid coordinates found for Al Kifaf
No valid coordinates found for BUSINESS BAY
No valid coordinates found for Palm Deira
No valid coordinates found for ARABIAN RANCHES III
No valid coordinates found for Al Merkadh
No valid coordinates found for Al Yufrah 1
No valid coordinates found for SOBHA HEARTLAND
No valid coordinates found for Um Su

In [36]:
import pandas as pd
from geopy.geocoders import Nominatim
import time

# Initialize geopy geolocator
geolocator = Nominatim(user_agent="geoapi", timeout=10)

# Define Dubai's geographic coordinate range
DUBAI_LAT_RANGE = (24.0, 26.5)
DUBAI_LON_RANGE = (54.0, 56.0)

def is_within_dubai(lat, lon):
    """Check if coordinates are within Dubai."""
    return DUBAI_LAT_RANGE[0] <= lat <= DUBAI_LAT_RANGE[1] and DUBAI_LON_RANGE[0] <= lon <= DUBAI_LON_RANGE[1]

def fetch_lat_long(location_name):
    """Fetch latitude and longitude for a location."""
    try:
        location = geolocator.geocode(location_name)
        if location:
            return location.latitude, location.longitude
    except Exception as e:
        print(f"Error fetching {location_name}: {e}")
    return None, None

def build_area_dict(dataframe):
    """Build a dictionary with area_en as keys and other location-related attributes as values."""
    grouped = dataframe.groupby('area_en').agg({
        'area_ar': lambda x: list(x.unique()),
        'project_name_en': lambda x: list(x.unique()),
        'project_name_ar': lambda x: list(x.unique()),
        'nearest_landmark_en': lambda x: list(x.unique()),
        'nearest_landmark_ar': lambda x: list(x.unique()),
        'nearest_metro_en': lambda x: list(x.unique()),
        'nearest_metro_ar': lambda x: list(x.unique()),
        'nearest_mall_en': lambda x: list(x.unique()),
        'nearest_mall_ar': lambda x: list(x.unique()),
        'master_project_en': lambda x: list(x.unique()),
        'master_project_ar': lambda x: list(x.unique()),
    }).reset_index()
    
    area_dict = {
        row['area_en']: (
            row['area_ar'], 
            row['project_name_en'], row['project_name_ar'], 
            row['nearest_landmark_en'], row['nearest_landmark_ar'], 
            row['nearest_metro_en'], row['nearest_metro_ar'], 
            row['nearest_mall_en'], row['nearest_mall_ar'], 
            row['master_project_en'], row['master_project_ar']
        )
        for _, row in grouped.iterrows()
    }
    return area_dict

def infer_coordinates(area_dict):
    """Infer coordinates for each area_en entry."""
    geocoded_results = {}
    
    for area_en, attributes in area_dict.items():
        if area_en == "Unknown":
            continue  # Skip 'Unknown' entries
        
        print(f"Processing {area_en}...")
        # Step 1: Try area_en
        lat, lon = fetch_lat_long(area_en)
        if lat and lon and is_within_dubai(lat, lon):
            geocoded_results[area_en] = (lat, lon)
            print(f"Found valid coordinates for {area_en} using area_en: ({lat}, {lon})")
            continue
        
        # Step 2: Try area_ar
        for area_ar in attributes[0]:
            lat, lon = fetch_lat_long(area_ar)
            if lat and lon and is_within_dubai(lat, lon):
                geocoded_results[area_en] = (lat, lon)
                print(f"Found valid coordinates for {area_en} using area_ar: ({lat}, {lon})")
                break
        if area_en in geocoded_results:
            continue

        # Step 3: Try project_name_en
        for project in attributes[1]:
            lat, lon = fetch_lat_long(project)
            if lat and lon and is_within_dubai(lat, lon):
                geocoded_results[area_en] = (lat, lon)
                print(f"Found valid coordinates for {area_en} using project_name_en: ({lat}, {lon})")
                break
        if area_en in geocoded_results:
            continue

        # Step 4: Try project_name_ar
        for project in attributes[2]:
            lat, lon = fetch_lat_long(project)
            if lat and lon and is_within_dubai(lat, lon):
                geocoded_results[area_en] = (lat, lon)
                print(f"Found valid coordinates for {area_en} using project_name_ar: ({lat}, {lon})")
                break
        if area_en in geocoded_results:
            continue

        # Step 5: Try nearest_landmark_en
        for landmark in attributes[3]:
            lat, lon = fetch_lat_long(landmark)
            if lat and lon and is_within_dubai(lat, lon):
                geocoded_results[area_en] = (lat, lon)
                print(f"Found valid coordinates for {area_en} using nearest_landmark_en: ({lat}, {lon})")
                break
        if area_en in geocoded_results:
            continue

        # Step 6: Try nearest_landmark_ar
        for landmark in attributes[4]:
            lat, lon = fetch_lat_long(landmark)
            if lat and lon and is_within_dubai(lat, lon):
                geocoded_results[area_en] = (lat, lon)
                print(f"Found valid coordinates for {area_en} using nearest_landmark_ar: ({lat}, {lon})")
                break
        if area_en in geocoded_results:
            continue

        # Step 7: Try other location fields (nearest_metro, mall, etc.)
        for field in attributes[5:]:
            for location in field:
                lat, lon = fetch_lat_long(location)
                if lat and lon and is_within_dubai(lat, lon):
                    geocoded_results[area_en] = (lat, lon)
                    print(f"Found valid coordinates for {area_en} using {location}: ({lat}, {lon})")
                    break
            if area_en in geocoded_results:
                break
        
        if area_en not in geocoded_results:
            print(f"Could not find valid coordinates for {area_en}")
        
        # Respect API rate limits
        time.sleep(1)

    return geocoded_results

# Main logic
# Load your dataset
sales_data = pd.read_csv("../data/snp_dld_2024_transactions.csv")  # Replace with your dataset's path
area_dict = build_area_dict(sales_data)
geocoded_results = infer_coordinates(area_dict)

# Save the results
geocoded_df = pd.DataFrame.from_dict(geocoded_results, orient='index', columns=['latitude', 'longitude'])
geocoded_df.reset_index(inplace=True)
geocoded_df.rename(columns={'index': 'area_en'}, inplace=True)
geocoded_df.to_csv("geocoded_area_new.csv", index=False)
print("Geocoding completed and saved to geocoded_area_new.csv")


Processing AL Athbah...
Could not find valid coordinates for AL Athbah
Processing AL BARARI...
Found valid coordinates for AL BARARI using area_en: (25.118865, 55.3209685)
Processing AL FURJAN...
Found valid coordinates for AL FURJAN using area_en: (25.0304736, 55.1522307)
Processing AL KHAIL HEIGHTS...
Found valid coordinates for AL KHAIL HEIGHTS using area_en: (25.155860150000002, 55.2536875439334)
Processing AL WAHA...
Could not find valid coordinates for AL WAHA
Processing ARABIAN RANCHES I...
Found valid coordinates for ARABIAN RANCHES I using area_en: (25.04875135, 55.267288518151844)
Processing ARABIAN RANCHES II...
Found valid coordinates for ARABIAN RANCHES II using area_en: (25.03439685, 55.27332788354515)
Processing ARABIAN RANCHES III...
Found valid coordinates for ARABIAN RANCHES III using nearest_landmark_en: (25.051092949999997, 55.31808411322436)
Processing ARABIAN RANCHES POLO CLUB...
Found valid coordinates for ARABIAN RANCHES POLO CLUB using area_en: (25.037414249999

In [40]:
geocoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247 entries, 0 to 246
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   area_en    247 non-null    object 
 1   latitude   247 non-null    float64
 2   longitude  247 non-null    float64
dtypes: float64(2), object(1)
memory usage: 5.9+ KB


In [41]:
# Assuming 'sales_data' is the original dataset with all area_en values
# Get all unique area_en values from the original dataset
all_area_en = sales_data['area_en'].unique()

# Get area_en values present in geocoded_df
geocoded_areas = geocoded_df['area_en'].unique()

# Find missing area_en
missing_areas = set(all_area_en) - set(geocoded_areas)

# Print missing area_en names
print("Areas with missing coordinates:", missing_areas)

# Optionally convert to a list if needed
missing_area_list = list(missing_areas)


Areas with missing coordinates: {'Al Hebiah Second', 'Al Warsan First', 'LIWAN 2', 'Al Warsan Third', 'Dubai Investment Park First', 'Al Hebiah Fourth', 'Hessyan Second', 'THE FIELD', 'Warsan Fourth', 'Al Hebiah Sixth', 'AL Athbah', 'Al Thanyah Fifth', 'AL WAHA', 'Al Aweer Second', 'Lehbab First', 'Hessyan First', 'LIWAN', 'THE VALLEY', 'Lehbab Second', "Me'Aisem Second", 'Al Hebiah Third', 'Al Aweer First', 'Madinat Hind 3', 'Al Rowaiyah Third', 'Al Hebiah Fifth', 'Muragab'}


In [42]:
len(missing_area_list)

26

In [43]:
geocoded_df.head()

Unnamed: 0,area_en,latitude,longitude
0,AL BARARI,25.118865,55.320968
1,AL FURJAN,25.030474,55.152231
2,AL KHAIL HEIGHTS,25.15586,55.253688
3,ARABIAN RANCHES I,25.048751,55.267289
4,ARABIAN RANCHES II,25.034397,55.273328


In [44]:
# List of missing areas
missing_area_list = [
    'Al Hebiah Second', 'Al Warsan First', 'LIWAN 2', 'Al Warsan Third', 
    'Dubai Investment Park First', 'Al Hebiah Fourth', 'Hessyan Second', 
    'THE FIELD', 'Warsan Fourth', 'Al Hebiah Sixth', 'AL Athbah', 
    'Al Thanyah Fifth', 'AL WAHA', 'Al Aweer Second', 'Lehbab First', 
    'Hessyan First', 'LIWAN', 'THE VALLEY', 'Lehbab Second', 
    "Me'Aisem Second", 'Al Hebiah Third', 'Al Aweer First', 
    'Madinat Hind 3', 'Al Rowaiyah Third', 'Al Hebiah Fifth', 'Muragab'
]

# Create a DataFrame structure for manual entry
missing_coords_df = pd.DataFrame({
    'area_en': missing_area_list,
    'latitude': [None] * len(missing_area_list),  # Placeholder for latitude
    'longitude': [None] * len(missing_area_list)  # Placeholder for longitude
})

# Save the DataFrame to a CSV file for manual entry if needed
missing_coords_df.to_csv('missing_coords_manual.csv', index=False)

# Display the structure
print(missing_coords_df.head())

                       area_en latitude longitude
0             Al Hebiah Second     None      None
1              Al Warsan First     None      None
2                      LIWAN 2     None      None
3              Al Warsan Third     None      None
4  Dubai Investment Park First     None      None


In [45]:
manual_coords_df = pd.read_csv('missing_coords_manual.csv')
manual_coords_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   area_en    26 non-null     object 
 1   latitude   26 non-null     float64
 2   longitude  26 non-null     float64
dtypes: float64(2), object(1)
memory usage: 752.0+ bytes


In [46]:
# Merge the datasets
combined_geocoded_df = pd.concat([geocoded_df, manual_coords_df], ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_geocoded_df.to_csv("combined_geocoded_areas.csv", index=False)

In [47]:
combined_geocoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273 entries, 0 to 272
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   area_en    273 non-null    object 
 1   latitude   273 non-null    float64
 2   longitude  273 non-null    float64
dtypes: float64(2), object(1)
memory usage: 6.5+ KB
