In [None]:
!pip install requests pandas

In [64]:
import os
import re
import time

import pandas as pd
import requests

In [15]:
def get_coordinates(project_name):
    """Look up coordinates and address for a project via the OneMap search API.

    The access token is read from the ``ONEMAP_TOKEN`` environment variable so
    that no credential is stored in the notebook. When the variable is unset or
    blank, the request is sent without an ``Authorization`` header.

    Args:
        project_name: Search text; converted to ``str`` and stripped before use.

    Returns:
        ``(latitude, longitude, address)`` taken from the first search result,
        as ``(float, float, str)``. ``(None, None, None)`` is returned when the
        response status is not 200, when nothing is found, or when any
        exception occurs (the exception is printed, not raised).
    """
    base_url = "https://www.onemap.gov.sg/api/common/elastic/search"
    no_match = (None, None, None)

    # Credentials must come from the environment, never from the source.
    token = os.environ.get("ONEMAP_TOKEN", "").strip()
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    try:
        # Clean project name
        project_name = str(project_name).strip()

        # Parameters for the API call
        params = {
            'searchVal': project_name,
            'returnGeom': 'Y',
            'getAddrDetails': 'Y'
        }

        # A timeout keeps one stalled connection from hanging the whole batch.
        response = requests.get(base_url, params=params, headers=headers, timeout=10)

        if response.status_code != 200:
            return no_match

        data = response.json()
        results = data.get('results') or []
        # Require an actual result row, not just a positive 'found' counter.
        if data.get('found', 0) > 0 and results:
            result = results[0]
            return float(result['LATITUDE']), float(result['LONGITUDE']), str(result['ADDRESS'])

        return no_match

    except Exception as e:
        print(f"Error processing {project_name}: {str(e)}")
        return no_match

In [None]:
# Get coordinates for each historical data district dataset
output_dir = '../datasets/updated_coordinates'
# Create the destination up front so a missing folder is not discovered only
# after all the (slow, rate-limited) lookups for a district have been made.
os.makedirs(output_dir, exist_ok=True)

for district_num in range(25, 29):
    # Read the existing dataset
    df = pd.read_csv(f'../datasets/historical_datasets/district{district_num}.csv', encoding='latin1')

    # Create new columns for coordinates
    df['Latitude'] = None
    df['Longitude'] = None
    df['Full Address'] = None

    # Process each unique project name; missing names are skipped because they
    # would otherwise be sent to the API as the literal text "nan".
    for project_name in df['Project Name'].dropna().unique():
        print(f"Processing: {project_name}")
        lat, lon, address = get_coordinates(project_name)

        # Update all rows with this project name
        if lat is not None and lon is not None:
            project_rows = df['Project Name'] == project_name
            df.loc[project_rows, 'Latitude'] = lat
            df.loc[project_rows, 'Longitude'] = lon
            df.loc[project_rows, 'Full Address'] = address

        # Add delay to avoid hitting API rate limits
        time.sleep(1)

    # Save the updated dataset
    df.to_csv(f'{output_dir}/district{district_num}.csv', index=False)
    print("Updated dataset saved with coordinates")

In [65]:
def clean_property_name(name):
    """Strip decoration and advertising text from a scraped listing title.

    Steps, in order: drop non-ASCII characters, remove a fixed list of
    advertising phrases (case-insensitive), trim leading/trailing punctuation
    and spaces, and collapse runs of whitespace to a single space.

    Args:
        name: The raw listing title. Non-string values are converted with
            ``str``; ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned name, or ``None`` when the input is missing or nothing is
        left after cleaning.
    """
    # Missing values must not be turned into the literal names "None"/"nan".
    if name is None or (isinstance(name, float) and name != name):
        return None

    # Convert to string in case of non-string input
    name = str(name)

    # Remove emojis and other non-ASCII characters. This also removes star
    # symbols, so no separate pass is needed for them.
    name = re.sub(r'[^\x00-\x7F]+', '', name)

    # Remove common advertising phrases
    ad_phrases = [
        r'!!!.*!!!',
        r'CHEAPER.*BAY',
        r'LOW ENTRY.*VIEWS',
        r'LUXURY LIVING.*',
        r'CHEAP.*',
        r'UNBLOCK.*VIEW.*',
        r'UNDERVALUED.*',
        r'Brand New Condos.*',
        r'Developer Sale.*',
        r'NEW Condo.*',
        r'^!!!.*',
        r'.*!!!$'
    ]

    for phrase in ad_phrases:
        name = re.sub(phrase, '', name, flags=re.IGNORECASE)

    # Remove leading/trailing special characters and whitespace
    name = re.sub(r'^[-!@#$%^&*(),.?":{}|<> ]+|[-!@#$%^&*(),.?":{}|<> ]+$', '', name)

    # Remove multiple spaces
    name = re.sub(r'\s+', ' ', name).strip()

    # If name becomes empty after cleaning, return None
    return name or None

In [None]:
# Get coordinates for each current listing district dataset
listings_output_dir = '../scrapers/data/Sep2025/updated_coordinates'
# Create the destination up front so a missing folder does not fail the save
# after all lookups for a district have already been made.
os.makedirs(listings_output_dir, exist_ok=True)

for district_num in range(1, 29):
    # Read the existing dataset
    df = pd.read_csv(f'../scrapers/data/Sep2025/condo-sales-{district_num}-Sep2025.csv', encoding='latin1')

    # Create new columns for coordinates
    df['Latitude'] = None
    df['Longitude'] = None
    df['Full Address'] = None

    # Cleaned name per row, aligned with df. Rows are matched on this series
    # rather than on the raw 'PropertyName' column: comparing the raw column to
    # a cleaned name never matches a listing whose title was altered by
    # cleaning, so those rows would be left without coordinates.
    cleaned_names = df['PropertyName'].map(clean_property_name, na_action='ignore')

    # Process each unique cleaned name once (missing/empty names are skipped)
    for project_name in cleaned_names.dropna().unique():
        print(f"Processing: {project_name}")
        lat, lon, address = get_coordinates(project_name)

        # Update all rows whose cleaned name is this project
        if lat is not None and lon is not None:
            project_rows = cleaned_names == project_name
            df.loc[project_rows, 'Latitude'] = lat
            df.loc[project_rows, 'Longitude'] = lon
            df.loc[project_rows, 'Full Address'] = address

        # Add delay to avoid hitting API rate limits
        time.sleep(1)

    # Save the updated dataset
    df.to_csv(f'{listings_output_dir}/district{district_num}_current_listings.csv', index=False)

In [None]:
def get_nearest_mrt_stations(project_name, latitude, longitude, radius_in_meters=1000):
    """Query the OneMap getNearestMrtStops endpoint for stations near a point.

    The access token is read from the ``ONEMAP_TOKEN`` environment variable so
    that no credential is stored in the notebook. When the variable is unset or
    blank, the request is sent without an ``Authorization`` header.

    Args:
        project_name: Used only to label printed error messages.
        latitude: Latitude sent as the ``latitude`` query parameter.
        longitude: Longitude sent as the ``longitude`` query parameter.
        radius_in_meters: Search radius sent to the API (default 1000).

    Returns:
        The ``name`` of every returned station joined with ``"; "``, or
        ``None`` when the response is empty, the status is not 200, or an
        exception occurs. Failures are printed instead of being returned as
        text, so error messages never end up stored as station data.
    """
    base_url = "https://www.onemap.gov.sg/api/public/nearbysvc/getNearestMrtStops"

    # Credentials must come from the environment, never from the source.
    token = os.environ.get("ONEMAP_TOKEN", "").strip()
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    try:
        # Parameters for the API call
        params = {
            'latitude': latitude,
            'longitude': longitude,
            'radius_in_meters': str(radius_in_meters)
        }

        # A timeout keeps one stalled connection from hanging the whole batch.
        response = requests.get(base_url, params=params, headers=headers, timeout=10)

        if response.status_code != 200:
            print(f"API Error: {response.status_code} while processing {project_name}")
            return None

        data = response.json()
        if not data:
            return None

        # Extract station names
        return "; ".join(station['name'] for station in data)

    except Exception as e:
        print(f"Error processing {project_name}: {str(e)}")
        return None

In [None]:
# Add the nearest MRT stations to each geocoded historical district dataset.
# NOTE: each file is rewritten in place, and rows without coordinates are
# removed from it.
for district_num in range(15, 29):
    dataset_path = f'../datasets/updated_coordinates/district{district_num}.csv'
    try:
        # Read the updated dataset
        df = pd.read_csv(dataset_path)

        # Filter out rows without coordinates. The explicit copy makes the
        # column assignments below operate on an independent frame.
        rows_before = len(df)
        df = df.dropna(subset=['Latitude', 'Longitude']).copy()
        if len(df) != rows_before:
            print(f"District {district_num}: dropped {rows_before - len(df)} rows without coordinates")

        # Create new column for MRT stations if it doesn't exist; otherwise make
        # sure it can hold text (an all-empty column is read back as float).
        if 'Nearest MRT Stations' not in df.columns:
            df['Nearest MRT Stations'] = None
        else:
            df['Nearest MRT Stations'] = df['Nearest MRT Stations'].astype(object)

        # First row of each project (assuming coordinates are the same for the
        # same project), computed once instead of re-filtering per project.
        first_rows = df.drop_duplicates(subset='Project Name')

        for project_name, latitude, longitude in zip(
            first_rows['Project Name'], first_rows['Latitude'], first_rows['Longitude']
        ):
            # A missing project name cannot be matched back to any row.
            if pd.isna(project_name):
                continue

            try:
                print(f"Processing: {project_name}")

                # Get nearest MRT stations
                mrt_stations = get_nearest_mrt_stations(
                    project_name,
                    str(latitude),   # Convert to string
                    str(longitude)   # Convert to string
                )

                # Update all rows for this project
                project_rows = df['Project Name'] == project_name
                df.loc[project_rows, 'Nearest MRT Stations'] = mrt_stations

                # Add delay to avoid hitting API rate limits
                time.sleep(0.5)

            except Exception as e:
                print(f"Error processing {project_name} in district {district_num}: {str(e)}")
                continue

        # Save the updated dataset
        df.to_csv(dataset_path, index=False)
        print(f"Completed district {district_num}")

    except Exception as e:
        print(f"Error processing district {district_num}: {str(e)}")
        continue