In [67]:
import requests
import json
import re
import time
import os
from datetime import datetime, timedelta
import pandas as pd
import csv
import shutil

# 1. Data

## 1.1 Data downloading

In [21]:
# This dictionary maps city ID from the API to city names for filenames.
### If you need to fetch data for other cities, please change them here.

translation_table = {
    '440100': 'GuangZhou',
    '440300': 'ShenZhen',
    '440400': 'ZhuHai',
    '440600': 'FoShan',
    '440700': 'JiangMen',
    '441200': 'ZhaoQing',
    '441300': 'HuiZhou',
    '441900': 'DongGuan',
    '442000': 'ZhongShan',
}

In [22]:
class Types:
    """
    Store some data that we want to request.
    """
    
    data_type = {'cityrank'}

    dt = {'country',
          'province',
          'city'}
    
    ### If you need to fetch data for other cities, please change them here.
    region = {
        # Use Types.region.keys() to see which provinces are available;
        # Use Types.region["ID"] to check if it's a province or a city.
        '440100': 'city',
        '440300': 'city',
        '440400': 'city',
        '440600': 'city',
        '440700': 'city',
        '441200': 'city',
        '441300': 'city',
        '441900': 'city',
        '442000': 'city',
    }

    move_type = {'move_in',
                 'move_out'}

In [23]:
def generate_date_range(start_date_str:str, end_date_str:str):
    """
    Enter the dates on both ends and the output will include all dates including those two days.

    :param start_date_str: The start date in the 8-digit format of "yyyymmdd", e.g. "20220130".
    :param end_date_str: The end date in 8-digit format of "yyyymmdd", e.g. "20220131".
    :return: A list of date strings from start_date to end_date (inclusive),
             in the format "yyyymmdd".
    """
    # Convert the input date strings to datetime objects
    start_date = datetime.strptime(start_date_str, '%Y%m%d')
    end_date = datetime.strptime(end_date_str, '%Y%m%d')

    # Initialize the date list with the start date
    date_list = [start_date.strftime('%Y%m%d')]

    # Generate the date range
    current_date = start_date
    while current_date < end_date:
        current_date += timedelta(days=1)
        date_list.append(current_date.strftime('%Y%m%d'))

    return date_list

In [24]:
def get_timestamp():
    """
    Get current timestamp.

    :return: Timestamp with 13 digits
    """
    return str(int(time.time() * 1000))  # * 1000 to get millisecond timestamps

In [25]:
def get_lastdate():
    """
    Get the latest date that the current API has.

    :return: An 8-digit string, formatted as "yyyymmdd", such as “20201231”.
    """
    url = f'http://huiyan.baidu.com/migration/lastdate.jsonp'
    response = requests.get(url)
    json_data_match = re.search(r'{.*}', response.text)
    if json_data_match:
        json_data_str = json_data_match.group()

        # Decode JSON string, automatically handling Unicode characters
        json_data = json.loads(json_data_str)
        return_value = json_data['data']['lastdate']
        print(f"Successful acquisition of the latest API date：{return_value}")
        return return_value
    else:
        print('Failed to get latest API date!')

In [26]:
def download_and_convert_csv(data_type: str, dt: str, id: str, move_type: str, date: str, callback: str):
    """
    :param data_type: 'cityrank'
    :param dt: 'country'/'province'/'city'
    :param id: city ID，such as '440100'
    :param move_type: 'move_in' or 'move_out'
    :param date: data，formatted like '20230101'
    :param callback: 13-digit timestamp
    :return: Export CSV files directly under ./data/
    """

    # Get city names from translation_table
    city_name = translation_table.get(id, id)

    # Construct CSV filename
    csv_filename = f"./data/{city_name}_{move_type}_{date}.csv"

    # Construct the API request URL
    url = (
        f"http://huiyan.baidu.com/migration/cityrank.jsonp?"
        f"dt={dt}&id={id}&type={move_type}&date={date}"
        f"&callback=callback&_={callback}"
    )
    response = requests.get(url)

    # Extract JSON data using a regular expression
    json_data_match = re.search(r'{.*}', response.text)
    if not json_data_match:
        print(f"JSON data not matched: {url}")
        return

    json_data_str = json_data_match.group()
    json_data = json.loads(json_data_str)

    if json_data.get('errmsg') == 'SUCCESS':
        rows = json_data["data"].get("list", [])
        if not rows:
            print(f"{id}-{date} Empty")
            return

        # Export to a CSV file
        with open(csv_filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=rows[0].keys())
            writer.writeheader()
            writer.writerows(rows)

        print(f"CSV file exported to: {csv_filename}")
    else:
        print(f"The interface returns the error: {json_data.get('errmsg')}")

In [27]:
def get_data(region: str, data_type: str, move_type: str, date: str):
    """
    Get the migration data for the specified region, data_type, move_type, date and export to CSV.
    """
    timestamp = get_timestamp()
    
    # Print prompts, using either city names or raw numbers
    city_name = translation_table.get(region, region)
    print(f"Accessing {city_name}'s {data_type} ({move_type}) data，data={date} ...")
    
    download_and_convert_csv(
        data_type=data_type,
        dt=Types.region[region],  
        id=region,
        move_type=move_type,
        date=date,
        callback=str(timestamp) 
    )

In [28]:
def get_by_date(from_date, to_date, lastdate):
    # Convert a date string to a datetime object
    from_date_dt = datetime.strptime(from_date, '%Y%m%d')
    to_date_dt = datetime.strptime(to_date, '%Y%m%d')
    lastdate_dt = datetime.strptime(lastdate, '%Y%m%d')

    print("Checking the validity of date input")

    # Check if the start date is earlier than '20190112'
    earlydate = datetime(2019, 1, 12)
    if from_date_dt < earlydate:
        print("Your start date is earlier than the earliest date the API currently has. Already set the start date to the earliest date the API has and try to keep running.")
        from_date_dt = earlydate

    # Check if the end date is later than the latest date
    if to_date_dt > lastdate_dt:
        print("Your end date is later than the latest date the API currently has. Already set the end date to the latest date the API has and try to keep running.")
        to_date_dt = lastdate_dt

    # Check if start data 
    if from_date_dt > to_date_dt:
        print("Your start date needs to be earlier than your end date.")
        return

    print("Success!")

    # Convert to a string to generate a date range
    from_date = from_date_dt.strftime('%Y%m%d')
    to_date = to_date_dt.strftime('%Y%m%d')

    for date in generate_date_range(from_date, to_date):  # data
        for region in Types.region.keys():
            for move_type in Types.move_type:
                for data_type in Types.data_type:
                    get_data(region, data_type, move_type, date)
    print("Completed!")


if __name__ == "__main__":
    # Check if the file exists
    if not os.path.exists('data'):
        # If not, create a fle
        os.makedirs('data')
    lastdate = get_lastdate()  
    ### If you need to change the start and end dates, change them here
    get_by_date('20241101', '20241130', lastdate)

Successful acquisition of the latest API date：20250220
Checking the validity of date input
Success!
Accessing GuangZhou's cityrank (move_out) data，data=20241101 ...
CSV file exported to: ./data/GuangZhou_move_out_20241101.csv
Accessing GuangZhou's cityrank (move_in) data，data=20241101 ...
CSV file exported to: ./data/GuangZhou_move_in_20241101.csv
Accessing ShenZhen's cityrank (move_out) data，data=20241101 ...
CSV file exported to: ./data/ShenZhen_move_out_20241101.csv
Accessing ShenZhen's cityrank (move_in) data，data=20241101 ...
CSV file exported to: ./data/ShenZhen_move_in_20241101.csv
Accessing ZhuHai's cityrank (move_out) data，data=20241101 ...
CSV file exported to: ./data/ZhuHai_move_out_20241101.csv
Accessing ZhuHai's cityrank (move_in) data，data=20241101 ...
CSV file exported to: ./data/ZhuHai_move_in_20241101.csv
Accessing FoShan's cityrank (move_out) data，data=20241101 ...
CSV file exported to: ./data/FoShan_move_out_20241101.csv
Accessing FoShan's cityrank (move_in) data，dat

## 1.2 Data processing

### The original data downloaded from Baidu includes mobility data between each Greater Bay Area city and 100 other Chinese cities. Here, we retain only the mobility data between each GBA city and the other eight GBA cities.

In [37]:
# For filtering
gba_city_names = {
    "广州市",  # 440100
    "深圳市",  # 440300
    "珠海市",  # 440400
    "佛山市",  # 440600
    "江门市",  # 440700
    "肇庆市",  # 441200
    "惠州市",  # 441300
    "东莞市",  # 441900
    "中山市",  # 442000
}

# For later use when converting fields from Chinese to English
city_map = {
    "广州市": "GuangZhou",
    "深圳市": "ShenZhen",
    "珠海市": "ZhuHai",
    "佛山市": "FoShan",
    "江门市": "JiangMen",
    "肇庆市": "ZhaoQing",
    "惠州市": "HuiZhou",
    "东莞市": "DongGuan",
    "中山市": "ZhongShan",
}

# Data source folder and filtered folder that keeps GBA cities only
source_folder = "data/baidu_mobility_index_original"

filtered_folder = "data/baidu_mobility_index_gba"
os.makedirs(filtered_folder, exist_ok=True)

# Filtering
csv_files = sorted(f for f in os.listdir(source_folder) if f.endswith(".csv"))

for file_name in csv_files:
    source_path = os.path.join(source_folder, file_name)
    
    # Read CSV
    df = pd.read_csv(source_path, encoding="utf-8")
    
    # Keep GBA cities only
    df_filtered = df[df["city_name"].isin(gba_city_names)]

    # Delete "province_name" column
    df_filtered.drop(columns=["province_name"], errors="ignore", inplace=True)

    # Convert fields from Chinese to English
    df_filtered["city_name"] = df_filtered["city_name"].replace(city_map)

    # Export
    out_path = os.path.join(filtered_folder, file_name)
    df_filtered.to_csv(out_path, index=False, encoding="utf-8")
    
    print(f"Processed {file_name}, kept {len(df_filtered)} rows.")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=["province_name"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["city_name"] = df_filtered["city_name"].replace(city_map)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=["province_name"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

Processed DongGuan_move_in_20241101.csv, kept 8 rows.
Processed DongGuan_move_in_20241102.csv, kept 8 rows.
Processed DongGuan_move_in_20241103.csv, kept 8 rows.
Processed DongGuan_move_in_20241104.csv, kept 8 rows.
Processed DongGuan_move_in_20241105.csv, kept 8 rows.
Processed DongGuan_move_in_20241106.csv, kept 8 rows.
Processed DongGuan_move_in_20241107.csv, kept 8 rows.
Processed DongGuan_move_in_20241108.csv, kept 8 rows.
Processed DongGuan_move_in_20241109.csv, kept 8 rows.
Processed DongGuan_move_in_20241110.csv, kept 8 rows.
Processed DongGuan_move_in_20241111.csv, kept 8 rows.
Processed DongGuan_move_in_20241112.csv, kept 8 rows.
Processed DongGuan_move_in_20241113.csv, kept 8 rows.
Processed DongGuan_move_in_20241114.csv, kept 8 rows.
Processed DongGuan_move_in_20241115.csv, kept 8 rows.
Processed DongGuan_move_in_20241116.csv, kept 8 rows.
Processed DongGuan_move_in_20241117.csv, kept 8 rows.
Processed DongGuan_move_in_20241118.csv, kept 8 rows.
Processed DongGuan_move_in_2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=["province_name"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["city_name"] = df_filtered["city_name"].replace(city_map)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=["province_name"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

Processed JiangMen_move_out_20241104.csv, kept 8 rows.
Processed JiangMen_move_out_20241105.csv, kept 8 rows.
Processed JiangMen_move_out_20241106.csv, kept 8 rows.
Processed JiangMen_move_out_20241107.csv, kept 8 rows.
Processed JiangMen_move_out_20241108.csv, kept 8 rows.
Processed JiangMen_move_out_20241109.csv, kept 8 rows.
Processed JiangMen_move_out_20241110.csv, kept 8 rows.
Processed JiangMen_move_out_20241111.csv, kept 8 rows.
Processed JiangMen_move_out_20241112.csv, kept 8 rows.
Processed JiangMen_move_out_20241113.csv, kept 8 rows.
Processed JiangMen_move_out_20241114.csv, kept 8 rows.
Processed JiangMen_move_out_20241115.csv, kept 8 rows.
Processed JiangMen_move_out_20241116.csv, kept 8 rows.
Processed JiangMen_move_out_20241117.csv, kept 8 rows.
Processed JiangMen_move_out_20241118.csv, kept 8 rows.
Processed JiangMen_move_out_20241119.csv, kept 8 rows.
Processed JiangMen_move_out_20241120.csv, kept 8 rows.
Processed JiangMen_move_out_20241121.csv, kept 8 rows.
Processed 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=["province_name"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["city_name"] = df_filtered["city_name"].replace(city_map)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=["province_name"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

Processed ZhaoQing_move_out_20241110.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241111.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241112.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241113.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241114.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241115.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241116.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241117.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241118.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241119.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241120.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241121.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241122.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241123.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241124.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241125.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241126.csv, kept 8 rows.
Processed ZhaoQing_move_out_20241127.csv, kept 8 rows.
Processed 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=["province_name"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["city_name"] = df_filtered["city_name"].replace(city_map)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=["province_name"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

# 2. Flow Statistics

## 2.1 move-in and move-out flow

### Each GBA city currently has two folders: move_in and move_out. Further classification is needed by splitting each folder into: 
### move_in_weekday, move_in_weekend, move_out_weekday, and move_out_weekend.

In [74]:
# Data file pathway
base_path = "data/baidu_mobility_index_gba" 

# Access all subfolders in data file（exclude .ipynb_checkpoints）
subfolders = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f)) and not f.startswith(".")]

# Task 1: check if subfiles are correctly stored, each file should contain a city's move in/out data for 30 days
def check_folder_files(subfolders):
    inconsistent_folders = []

    for folder in subfolders:
        folder_path = os.path.join(base_path, folder)
        files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

        # Prefixes should be identical（remove last 6 digits，DD.csv）
        if files:
            prefixes = set(f[:-6] for f in files)

            if len(prefixes) > 1:  # if they are not identical, report error
                inconsistent_folders.append(folder)

    return inconsistent_folders

inconsistent_folders = check_folder_files(subfolders)

if inconsistent_folders:
    print("The following files may be allocated to wrong folders:", inconsistent_folders)
else:
    print("Subfiles are correctly stored. Move to Task 2.")


    # Task 2: Reclassify be name
    for folder in subfolders:
        folder_path = os.path.join(base_path, folder)
        weekday_path = os.path.join(folder_path, "weekday")
        weekend_path = os.path.join(folder_path, "weekend")

        os.makedirs(weekday_path, exist_ok=True)
        os.makedirs(weekend_path, exist_ok=True)

        files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

        for file in files:
            date_str = file[-12:-4] 

            try:
                date_obj = datetime.strptime(date_str, "%Y%m%d")
                destination_folder = weekday_path if date_obj.weekday() < 5 else weekend_path

                shutil.move(os.path.join(folder_path, file), os.path.join(destination_folder, file))

            except ValueError:
                print(f"文件 {file} 日期解析失败，跳过。")
                
print("Task 2 completed!")

Subfiles are correctly stored. Move to Task 2.
Task 2 completed!


### 2.1.1 move_in_weekday

In [78]:
# Pathway
base_path = "data/baidu_mobility_index_gba/move_in_weekday"

# 9 GBA cities
cities = ["DongGuan", "FoShan", "GuangZhou", "HuiZhou", "JiangMen",
          "ShenZhen", "ZhaoQing", "ZhongShan", "ZhuHai"]

# Initialise dictionary
data = {city: [] for city in cities}

# Access all *_move_in_weekday* files
folders = [f for f in os.listdir(base_path) if f.endswith("_move_in_weekday")]

# Iterate through folders
for folder in folders:
    folder_path = os.path.join(base_path, folder)
    city_name = folder.replace("_move_in_weekday", "")  # Extract city name

    if city_name not in cities:
        print(f" Unknown city {city_name}, skip")
        continue

    if os.path.exists(folder_path):
        files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

        for file in files:
            file_path = os.path.join(folder_path, file)
            df = pd.read_csv(file_path)

            # Ensure correct format
            if "city_name" in df.columns and "value" in df.columns:
                df = df[df["city_name"].isin(cities)]  # Filter only 9 cities
                df.set_index("city_name", inplace=True)

                # Store data
                data[city_name].append(df["value"])

# Compute 9×9 matrix
matrix = pd.DataFrame(index=cities, columns=cities, dtype=float)

for city in cities:
    if data[city]:  # Ensure non-empty data
        avg_values = pd.concat(data[city], axis=1).mean(axis=1)  # Calculate mean
        matrix.loc[city] = avg_values.reindex(cities, fill_value=0).values
    else:
        matrix.loc[city] = 0  # If empty, take 0

# Display Matrix
print(" 9×9 matrix of move_in_weekday:")
print(matrix)

# Export
output_path = os.path.join(base_path, "average_move_in_weekday_matrix.csv")
matrix.to_csv(output_path)
print(f" Matrix is saved to {output_path}")

 9×9 matrix of move_in_weekday:
            DongGuan     FoShan  GuangZhou    HuiZhou  JiangMen   ShenZhen  \
DongGuan    0.000000   4.014762  16.821429  13.546667  1.008095  38.642857   
FoShan      4.397143   0.000000  45.516667   1.271429  5.597619   3.488095   
GuangZhou  11.729048  30.248095   0.000000   3.722857  2.622857   7.270476   
HuiZhou    25.387143   2.439524  10.035714   0.000000  0.490476  36.486190   
JiangMen    4.658095  23.504286  15.305238   1.117619  0.000000   4.662857   
ShenZhen   31.356190   2.889048   9.523810  18.360000  0.947619   0.000000   
ZhaoQing    3.810476  34.606190  19.767619   1.071905  2.706667   2.919048   
ZhongShan   4.276667  18.462381  10.888095   1.096190  9.845238   5.508095   
ZhuHai      2.793333   3.812381   8.333810   0.915714  5.227143   5.247619   

           ZhaoQing  ZhongShan     ZhuHai  
DongGuan   0.749524   1.899524   0.840476  
FoShan     7.327619   9.215238   1.164762  
GuangZhou  2.891905   3.731905   1.894762  
HuiZhou    

### 2.1.2 move_in_weekend

In [80]:
# Pathway
base_path = "data/baidu_mobility_index_gba/move_in_weekend"

# 9 GBA cities
cities = ["DongGuan", "FoShan", "GuangZhou", "HuiZhou", "JiangMen",
          "ShenZhen", "ZhaoQing", "ZhongShan", "ZhuHai"]

# Initialise dictionary
data = {city: [] for city in cities}

# Access all *_move_in_weekend* files
folders = [f for f in os.listdir(base_path) if f.endswith("_move_in_weekend")]

# Iterate through folders
for folder in folders:
    folder_path = os.path.join(base_path, folder)
    city_name = folder.replace("_move_in_weekend", "")  # Extract city name

    if city_name not in cities:
        print(f"Unknown city {city_name}, skip")
        continue

    if os.path.exists(folder_path):
        files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

        for file in files:
            file_path = os.path.join(folder_path, file)
            df = pd.read_csv(file_path)

            # Ensure correct format
            if "city_name" in df.columns and "value" in df.columns:
                df = df[df["city_name"].isin(cities)]  # Filter only 9 cities
                df.set_index("city_name", inplace=True)

                # Store data
                data[city_name].append(df["value"])

# Compute 9×9 matrix
matrix = pd.DataFrame(index=cities, columns=cities, dtype=float)

for city in cities:
    if data[city]:  # Ensure non-empty data
        avg_values = pd.concat(data[city], axis=1).mean(axis=1)  # Calculate mean
        matrix.loc[city] = avg_values.reindex(cities, fill_value=0).values
    else:
        matrix.loc[city] = 0  # If empty, take 0

# Display Matrix
print("9×9 matrix of move_in_weekend:")
print(matrix)

# Export
output_path = os.path.join(base_path, "average_move_in_weekend_matrix.csv")
matrix.to_csv(output_path)
print(f"Matrix is saved to {output_path}")

Unknown city Foshan, skip
9×9 matrix of move_in_weekend:
            DongGuan     FoShan  GuangZhou    HuiZhou  JiangMen   ShenZhen  \
DongGuan    0.000000   4.068889  17.134444  13.601111  1.033333  41.125556   
FoShan      0.000000   0.000000   0.000000   0.000000  0.000000   0.000000   
GuangZhou  12.815556  27.950000   0.000000   4.155556  2.931111   9.928889   
HuiZhou    25.741111   2.174444  10.034444   0.000000  0.423333  41.037778   
JiangMen    4.731111  23.336667  18.071111   1.048889  0.000000   5.621111   
ShenZhen   31.765556   3.171111  10.976667  17.637778  1.096667   0.000000   
ZhaoQing    4.065556  34.417778  23.173333   1.034444  2.764444   3.643333   
ZhongShan   4.623333  16.643333  12.346667   1.176667  9.610000   8.408889   
ZhuHai      3.763333   5.050000  11.865556   1.266667  5.665556   8.638889   

           ZhaoQing  ZhongShan     ZhuHai  
DongGuan   0.773333   1.956667   0.945556  
FoShan     0.000000   0.000000   0.000000  
GuangZhou  3.245556   4.058889

### 2.1.3 move_out_weekday

In [83]:
# Pathway
base_path = "data/baidu_mobility_index_gba/move_out_weekday"

# 9 GBA cities
cities = ["DongGuan", "FoShan", "GuangZhou", "HuiZhou", "JiangMen",
          "ShenZhen", "ZhaoQing", "ZhongShan", "ZhuHai"]

# Initialise dictionary
data = {city: [] for city in cities}

# Access all *_move_out_weekday* files
folders = [f for f in os.listdir(base_path) if f.endswith("_move_out_weekday")]

# Iterate through folders
for folder in folders:
    folder_path = os.path.join(base_path, folder)
    city_name = folder.replace("_move_out_weekday", "")  # Extract city name

    if city_name not in cities:
        print(f"Unknown city {city_name}, skip")
        continue

    if os.path.exists(folder_path):
        files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

        for file in files:
            file_path = os.path.join(folder_path, file)
            df = pd.read_csv(file_path)

            # Ensure correct format
            if "city_name" in df.columns and "value" in df.columns:
                df = df[df["city_name"].isin(cities)]  # Filter only 9 cities
                df.set_index("city_name", inplace=True)

                # Store data
                data[city_name].append(df["value"])

# Compute 9×9 matrix
matrix = pd.DataFrame(index=cities, columns=cities, dtype=float)

for city in cities:
    if data[city]:  # Ensure non-empty data
        avg_values = pd.concat(data[city], axis=1).mean(axis=1)  # Calculate mean
        matrix.loc[city] = avg_values.reindex(cities, fill_value=0).values
    else:
        matrix.loc[city] = 0  # If empty, take 0

# Display Matrix
print("9×9 matrix of move_out_weekday:")
print(matrix)

# Export
output_path = os.path.join(base_path, "average_move_out_weekday_matrix.csv")
matrix.to_csv(output_path)
print(f"Matrix is saved to {output_path}")

9×9 matrix of move_out_weekday:
            DongGuan     FoShan  GuangZhou    HuiZhou  JiangMen   ShenZhen  \
DongGuan    0.000000   4.170000  17.558571  14.077143  1.199524  36.460000   
FoShan      3.826667   0.000000  45.427143   1.352857  6.087619   3.382857   
GuangZhou  10.980000  29.648095   0.000000   3.814762  2.709524   7.630952   
HuiZhou    23.031905   2.169048  10.027143   0.000000  0.514286  38.177619   
JiangMen    3.897143  21.528571  15.964286   1.104762  0.000000   4.495238   
ShenZhen   31.682381   2.847619   9.394286  17.461429  1.032381   0.000000   
ZhaoQing    3.450000  33.672857  20.996667   1.083333  2.840000   2.955714   
ZhongShan   3.531429  17.083810  10.920952   0.968571  9.954762   5.183333   
ZhuHai      2.356667   3.243333   8.351429   0.871429  4.742857   5.216190   

           ZhaoQing  ZhongShan     ZhuHai  
DongGuan   0.822857   2.052381   1.061429  
FoShan     7.523810   8.891905   1.451429  
GuangZhou  2.942857   3.592857   2.185238  
HuiZhou    

### 2.1.4 move_out_weekend

In [77]:
# pathway
base_path = "data/baidu_mobility_index_gba/move_out_weekend"

# 9 GBA cities
cities = ["DongGuan", "FoShan", "GuangZhou", "HuiZhou", "JiangMen", 
          "ShenZhen", "ZhaoQing", "ZhongShan", "ZhuHai"]

# Initialise dictionary
data = {city: [] for city in cities}

# Access all `*_move_out_weekend` files
folders = [f for f in os.listdir(base_path) if f.endswith("_move_out_weekend")]

# Iterate
for folder in folders:
    folder_path = os.path.join(base_path, folder)
    city_name = folder.replace("_move_out_weekend", "")  # Extract city names

    if city_name not in cities:
        print(f"⚠️ Unknown city {city_name}，skip")
        continue

    if os.path.exists(folder_path):
        files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

        for file in files:
            file_path = os.path.join(folder_path, file)
            df = pd.read_csv(file_path)

            # Ensure correct format
            if "city_name" in df.columns and "value" in df.columns:
                df = df[df["city_name"].isin(cities)]  # 只保留大湾区 9 个城市
                df.set_index("city_name", inplace=True)

                # same data
                data[city_name].append(df["value"])

# 9*9 matrix
matrix = pd.DataFrame(index=cities, columns=cities, dtype=float)

for city in cities:
    if data[city]:  # Make sure non-empty data
        avg_values = pd.concat(data[city], axis=1).mean(axis=1)  # Calculate mean
        matrix.loc[city] = avg_values.reindex(cities, fill_value=0).values
    else:
        matrix.loc[city] = 0  # If empty, take 0

# Display Matrix
print("9×9 matrix of move_out_weekend:")
print(matrix)

# Export
output_path = os.path.join(base_path, "average_move_out_weekend_matrix.csv")
matrix.to_csv(output_path)
print(f"📁 Matrix is saved to {output_path}")

9×9 matrix of move_out_weekend:
            DongGuan     FoShan  GuangZhou    HuiZhou   JiangMen   ShenZhen  \
DongGuan    0.000000   4.395556  17.692222  16.570000   1.290000  33.827778   
FoShan      3.831111   0.000000  42.053333   1.532222   6.900000   3.668889   
GuangZhou  10.038889  28.833333   0.000000   4.368889   3.295556   7.832222   
HuiZhou    22.471111   2.498889  10.980000   0.000000   0.546667  35.875556   
JiangMen    3.760000  21.723333  17.021111   1.158889   0.000000   4.924444   
ShenZhen   27.936667   3.433333  10.571111  20.655556   1.172222   0.000000   
ZhaoQing    3.301111  32.155556  22.023333   1.101111   3.116667   3.208889   
ZhongShan   3.800000  17.500000  12.580000   1.224444  11.046667   6.302222   
ZhuHai      2.586667   4.526667  10.362222   1.068889   6.238889   6.226667   

           ZhaoQing  ZhongShan     ZhuHai  
DongGuan   0.957778   2.082222   1.303333  
FoShan     8.805556   8.144444   1.881111  
GuangZhou  3.662222   3.748889   2.736667  
H