### **Task 1: Web Scraping**

**Stores**: Starbucks, McDonald's, CU  
**Output**: A CSV file with columns: Brand, Store Name, Latitude, Longitude, Address

<u>References</u>
1. https://www.selenium.dev/documentation/webdriver/elements/locators/
2. https://github.com/googlemaps/google-maps-services-python
3. https://developers.google.com/maps/documentation/geocoding/start
4. https://www.geeksforgeeks.org/response-json-python-requests/ 

In [1]:
import os
import time
import requests
import googlemaps
import pandas as pd
from dotenv import load_dotenv
from selenium import webdriver
import undetected_chromedriver as webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

##### **1.1 Define Functions**

##### **1.1.1 Utilities**

In [2]:
# Function to get coordinates via Geocoding API
def get_coordinates(address, client):
    try:
        geocode_result = client.geocode(address)
        if geocode_result:
            coordinate = geocode_result[0]['geometry']['location']
            return pd.Series([coordinate['lat'], coordinate['lng']])
        else:
            return pd.Series([None, None])
    except Exception as e:
        print(f"Error for {address}: {e}")
        return pd.Series([None, None])

##### **1.1.2 McDonald's**
**Steps:**
1. Get store name and address from official website (https://www.mcdonalds.com.my/locate-us)
2. Get store coordinate using Geocoding API

In [3]:
def scrap_mcd(url, gmap):
    df = pd.DataFrame(columns=['Brand', 'StoreName', 'Latitude', 'Longitude', 'Address'])

    # Step 1
    driver = webdriver.Chrome()
    driver.get(url)

    store_containers = driver.find_elements(By.CSS_SELECTOR, 'div.addressTop') 
    store_data = []

    for container in store_containers:
        try:
            store_name_element = container.find_element(By.CSS_SELECTOR, 'a.addressTitle strong')
            store_name = store_name_element.text.strip()

            # ⚠️ IMPORTANT: There are multiple <p class="addressText"> elements in each container.
            # Ensure that the first one contains the store address before running this line.
            # Else, you may need to modify the code to extract the correct address element.
            address_element = container.find_element(By.CSS_SELECTOR, 'p.addressText') 
            address = address_element.text.strip()

            store_data.append({"Brand": "McDonald's", "StoreName": store_name, "Address": address})
        except:
            continue  

    df_temp = pd.DataFrame(store_data)
    df = pd.concat([df, df_temp], ignore_index=True)

    driver.quit()

    # Step 2
    df[['Latitude', 'Longitude']] = df['Address'].apply(get_coordinates, client=gmap)

    return df

##### **1.1.3 Starbucks**
**Steps:**
1. Get store name and address from official website (https://www.starbucks.com.my/find-store)
2. Get store coordinate using Geocoding API

**Note:**  

To bypass Cloudflare detection, the scraping function mimics human behavior by following these steps:
1. Open the Google search page (https://www.google.com)
2. Simulate typing the target URL (https://www.starbucks.com.my/find-store) into the search bar
3. Click on the first search result
4. Click to accept the cookie's agreement
5. Extract the required data
Additionally, use a pre-configured Chrome profile for a more natural browsing experience

In [4]:
def scrap_stb(url, gmap, profile_dir, profile='Default'):
    df = pd.DataFrame(columns=['Brand', 'StoreName', 'Latitude', 'Longitude', 'Address']) 

    # Step 1
    options = webdriver.ChromeOptions()
    options.add_argument(f"--user-data-dir={profile_dir}")
    options.add_argument(f"--profile-directory={profile}")

    driver = webdriver.Chrome(options=options, headless=False)
    driver.get(r'https://www.google.com')

    textarea = driver.find_element(By.CSS_SELECTOR, 'textarea.gLFyf')
    textarea.send_keys(url)
    textarea.send_keys(Keys.RETURN)

    stb_link = driver.find_element(By.CSS_SELECTOR, 'a.zReHs')
    stb_link.click()

    time.sleep(10) # Wait for human verification

    agree_btn = driver.find_element(By.CSS_SELECTOR, 'button.btn.btn-secondary')
    agree_btn.click()

    time.sleep(10) # Wait for full load  

    store_data = []

    # Html of the first selected store is different, need to handle individually
    store_1 = driver.find_element(By.CSS_SELECTOR, 'div.list-group-item.pointer.list-group-item-action.py-3.lh-tight.store.active2')
    store1_name_element = store_1.find_element(By.CSS_SELECTOR, 'strong.mb-1.store_name')
    store1_name = store1_name_element.text.strip()
    address1_element = store_1.find_element(By.CSS_SELECTOR, 'div.col-10.mb-1.store_address.small')
    address1 = address1_element.text.strip()
    store_data.append({"Brand": "Starbucks", "StoreName": store1_name, "Address": address1})

    # The rest of the stores
    store_containers = driver.find_elements(By.CSS_SELECTOR, 'div.list-group-item.pointer.list-group-item-action.py-3.lh-tight.store')
    for store in store_containers:
        try:
            store_name_element = store.find_element(By.CSS_SELECTOR, 'strong.mb-1.store_name')
            store_name = store_name_element.text.strip()
            address_element = store.find_element(By.CSS_SELECTOR, 'div.col-10.mb-1.store_address.small')
            address = address_element.text.strip()
            store_data.append({"Brand": "Starbucks", "StoreName": store_name, "Address": address})
        except:
            continue

    df_temp = pd.DataFrame(store_data)
    df = pd.concat([df, df_temp], ignore_index=True)

    driver.quit()

    # Step 2
    df[['Latitude', 'Longitude']] = df['Address'].apply(get_coordinates, client=gmap)

    return df

##### **1.1.4 CU**
**Steps:**
1. Make a AJAX request (https://nicetocu.com.my/wp-admin/admin-ajax.php?action=asl_load_stores&lang=&load_all=1&layout=1)
2. Get all the nessassary data from the JSON returned

In [11]:
def scarp_cu(api_url):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(api_url, headers=headers)
    data = response.json()  

    store_list = []
    for store in data:
        store_list.append({
            "Brand": "CU",
            "StoreName": store.get("title", "N/A").strip(),
            "Latitude": store.get("lat", ""),
            "Longitude": store.get("lng", ""),
            "Address": f'{store.get("street", "")} {store.get("city", "")}, {store.get("state", "")}, {store.get("postal_code", "")}, {store.get("country", "Malaysia")}'.strip()
        })

    df = pd.DataFrame(store_list)

    return df

##### **1.2 Main**

In [6]:
# Get secret and initialise Google Maps API client
load_dotenv()
gmap_key = os.getenv("GC_API_KEY")
gmap = googlemaps.Client(key=gmap_key)

# Initialise Chrome profile and directory
profile_dir = 'C:\\Users\\hp\\AppData\\Local\\Google\\Chrome\\User Data\\'
profile = 'Profile 3'

# Initialise required url
mcd_url = r'https://www.mcdonalds.com.my/locate-us'
stb_url = r'https://www.starbucks.com.my/find-store'
cu_url = r'https://nicetocu.com.my/wp-admin/admin-ajax.php?action=asl_load_stores&lang=&load_all=1&layout=1'

In [7]:
# Scrap MCD
df_mcd = scrap_mcd(mcd_url, gmap)

if df_mcd.isnull().values.any():
    raise ValueError("Error: The DataFrame contains missing values. Please check the data.")

print('Number of stores: ', len(df_mcd))
display(df_mcd)

Number of stores:  374


Unnamed: 0,Brand,StoreName,Latitude,Longitude,Address
0,McDonald's,McDonald's Bukit Bintang,3.146867,101.710883,"120-120A Jalan Bukit Bintang, 55100, Kuala Lum..."
1,McDonald's,McDonald's Bangsar,3.132994,101.672298,"48, Jalan Telawi Lima, 59100, Bangsar Baru, Ba..."
2,McDonald's,McDonald's SS 2,3.118369,101.620442,"No. 56 Jalan SS 2/61, 47300, Petaling Jaya, Se..."
3,McDonald's,McDonald's Kepong,3.213555,101.632839,"No. 1 & 2 Jalan 54, Desa Jaya, 52100, Kepong, ..."
4,McDonald's,McDonald's Subang Parade,3.081489,101.585103,"LG 10 Subang Parade, No. 5 Jalan SS 16/1, 4750..."
...,...,...,...,...,...
369,McDonald's,McDonald's Batu Pahat 2 DT,1.837635,102.946740,"PTD 64538, Jalan Tanjung Labuh, 83000 Batu Pah..."
370,McDonald's,McDonald's Nusa Sentral DT,1.458875,103.623246,"Lot 155422, Jalan Nusa Sentral, Taman Nusa Sen..."
371,McDonald's,McDonald's BHP Raub DT,3.799162,101.864642,"Lot 10, Jalan Lipis, 27600, Pahang"
372,McDonald's,McDonald's Desa Petaling SF,3.082331,101.705000,"1-3, Pusat Perdagangan Salak II, Jalan 2/125, ..."


In [8]:
# Scrap Starbucks
df_stb = scrap_stb(stb_url, gmap, profile_dir, profile)

if df_stb.isnull().values.any():
    raise ValueError("Error: The DataFrame contains missing values. Please check the data.")

print('Number of stores: ', len(df_stb))
display(df_stb)

Number of stores:  329


Unnamed: 0,Brand,StoreName,Latitude,Longitude,Address
0,Starbucks,STARBUCKS PETRON SEKINCHAN DT,3.504621,101.103021,"17100C, JALAN BESAR, PARIT 4, SUNGAI BURONG, 4..."
1,Starbucks,STARBUCKS PETRON SEKINCHAN DT,3.504621,101.103021,"17100C, JALAN BESAR, PARIT 4, SUNGAI BURONG, 4..."
2,Starbucks,STARBUCKS KUCHAI LAMA,3.088920,101.703229,"NO 1 & 1-1, JALAN 10/116B, KUCHAI ENTREPRENEUR..."
3,Starbucks,STARBUCKS KOTA KEMUNING DT,3.005375,101.538630,"PART OF LOT PT128300, JALAN ANGGERIK VANILA,KO..."
4,Starbucks,STARBUCKS CLUB NEXU5,3.110602,101.666273,"LOT F&B 01, GROUND FLOOR, CLUB NEXU5, BANGSAR ..."
...,...,...,...,...,...
324,Starbucks,STARBUCKS AEON AYER KEROH,2.234230,102.282382,"LOT G33, AEON MELAKA SHOPPING CENTRE, JALAN TU..."
325,Starbucks,STARBUCKS RESERVE TRX EXCHANGE MALL,3.142407,101.716813,"L2.75.0 & L2.76.0, LEVEL 2, PLAZA THE EXCHANGE..."
326,Starbucks,STARBUCKS PAVILION DAMANSARA HEIGHTS,3.145789,101.663013,"PAVILION DAMANSARA HEIGHTS, LOT 2.5500 & E2.55..."
327,Starbucks,STARBUCKS TD CENTRAL DT,1.544344,103.765454,"NO. 6B, JALAN SAGU 18, TAMAN DAYA, 81100 JOHOR..."


In [12]:
# Scrap CU
df_cu = scarp_cu(cu_url)

if df_cu.isnull().values.any():
    raise ValueError("Error: The DataFrame contains missing values. Please check the data.")

print('Number of stores: ', len(df_cu))
display(df_cu)

Number of stores:  148


Unnamed: 0,Brand,StoreName,Latitude,Longitude,Address
0,CU,CU Alor Setar,6.096797172225901,100.35679224889525,"CU Alor Setar, 48, Jalan Kuala Kedah, Taman Gu..."
1,CU,CU Bukit Baru,2.2165749307179476,102.27048392286524,"PUSAT NIAGA NO,5-2, JALAN PNBBU 1, JALAN UTAMA..."
2,CU,CU Kuala Ketil,5.6051911453632055,100.64059232894348,"108 & 109, PERSIARAN KKCC 2, COMMERCIAL CENTRE..."
3,CU,CU Riverfront Sungai Petani,5.640101740221939,100.48100229403929,"CU Riverfront Sungai Petani, No.101, Tingkat, ..."
4,CU,CU Setia Alam,3.127528241500311,101.46854321857083,"10-G-58, JLN SETIA GEMILANG BM U13/BM SETIA AL..."
...,...,...,...,...,...
143,CU,CU Tropicana Aman Selangor,2.944712008898226,101.52739591385881,"NO. 9-G & 9-1, JALAN AMAN SINARIA 2 BANDAR TRO..."
144,CU,CU Tun Mohd Fuad TTDI,3.141664944730027,101.6284486437522,"CU TTDI, 11, Jalan Tun Mohd Fuad, Taman Tun Dr..."
145,CU,CU UMT Kuala Nerus Terengganu,5.4071227598139675,103.0893248106907,"CU Mart UMT Kuala Terengganu, Kuala Terengganu..."
146,CU,CU UTC Kota Bharu,6.1184157770440555,102.23988422774376,"LOT G1.01 & LOT G1.13, TINGKAT BAWAH BANGUNAN ..."


In [13]:
# Export to CSV
df = pd.concat([df_mcd, df_stb, df_cu], ignore_index=True)
df.to_csv("../csvs/output.csv", index=False)
print("Done. Check out csvs/output.csv for the output.")

Done. Check out csvs/output.csv for the output.
