In [13]:
import pandas as pd
import csv
import requests
import json
import re
import time
import os
from datetime import datetime, timedelta
import shutil
import xlsxwriter
import datetime
!pip install geojson
import geojson

Collecting geojson
  Downloading geojson-3.2.0-py3-none-any.whl.metadata (16 kB)
Downloading geojson-3.2.0-py3-none-any.whl (15 kB)
Installing collected packages: geojson
Successfully installed geojson-3.2.0


# 1. Data Downloading

## 1.1 Migration proportion data

In [1]:
# This dictionary maps city ID from the API to city names for filenames.
### If you need to fetch data for other cities, please change them here.

translation_table = {
    '440100': 'GuangZhou',
    '440300': 'ShenZhen',
    '440400': 'ZhuHai',
    '440600': 'FoShan',
    '440700': 'JiangMen',
    '441200': 'ZhaoQing',
    '441300': 'HuiZhou',
    '441900': 'DongGuan',
    '442000': 'ZhongShan',
}

In [2]:
class Types:
    """
    Store some data that we want to request.
    """
    
    data_type = {'cityrank'}

    dt = {'country',
          # 'province',
          'city'}
    
    ### If you need to fetch data for other cities, please change them here.
    region = {
        # Use Types.region.keys() to see which provinces are available;
        # Use Types.region["ID"] to check if it's a province or a city.
        '440100': 'city',
        '440300': 'city',
        '440400': 'city',
        '440600': 'city',
        '440700': 'city',
        '441200': 'city',
        '441300': 'city',
        '441900': 'city',
        '442000': 'city',
    }

    move_type = {# 'move_in',
                 'move_out'}

In [3]:
def generate_date_range(start_date_str:str, end_date_str:str):
    """
    Enter the dates on both ends and the output will include all dates including those two days.

    :param start_date_str: The start date in the 8-digit format of "yyyymmdd", e.g. "20220130".
    :param end_date_str: The end date in 8-digit format of "yyyymmdd", e.g. "20220131".
    :return: A list of date strings from start_date to end_date (inclusive),
             in the format "yyyymmdd".
    """
    # Convert the input date strings to datetime objects
    start_date = datetime.strptime(start_date_str, '%Y%m%d')
    end_date = datetime.strptime(end_date_str, '%Y%m%d')

    # Initialize the date list with the start date
    date_list = [start_date.strftime('%Y%m%d')]

    # Generate the date range
    current_date = start_date
    while current_date < end_date:
        current_date += timedelta(days=1)
        date_list.append(current_date.strftime('%Y%m%d'))

    return date_list

In [4]:
def get_timestamp():
    """
    Get current timestamp.

    :return: Timestamp with 13 digits
    """
    return str(int(time.time() * 1000))  # * 1000 to get millisecond timestamps

In [5]:
def get_lastdate():
    """
    Get the latest date that the current API has.

    :return: An 8-digit string, formatted as "yyyymmdd", such as “20201231”.
    """
    url = f'http://huiyan.baidu.com/migration/lastdate.jsonp'
    response = requests.get(url)
    json_data_match = re.search(r'{.*}', response.text)
    if json_data_match:
        json_data_str = json_data_match.group()

        # Decode JSON string, automatically handling Unicode characters
        json_data = json.loads(json_data_str)
        return_value = json_data['data']['lastdate']
        print(f"Successful acquisition of the latest API date：{return_value}")
        return return_value
    else:
        print('Failed to get latest API date!')

In [6]:
def download_and_convert_csv(data_type: str, dt: str, id: str, move_type: str, date: str, callback: str):
    """
    :param data_type: 'cityrank'
    :param dt: 'country'/'province'/'city'
    :param id: city ID，such as '440100'
    :param move_type: 'move_in' or 'move_out'
    :param date: data，formatted like '20230101'
    :param callback: 13-digit timestamp
    :return: Export CSV files directly under ./data/
    """

    # Get city names from translation_table
    city_name = translation_table.get(id, id)

    # Construct CSV filename
    csv_filename = f"./data/{city_name}_{move_type}_{date}.csv"

    # Construct the API request URL
    url = (
        f"http://huiyan.baidu.com/migration/cityrank.jsonp?"
        f"dt={dt}&id={id}&type={move_type}&date={date}"
        f"&callback=callback&_={callback}"
    )
    response = requests.get(url)

    # Extract JSON data using a regular expression
    json_data_match = re.search(r'{.*}', response.text)
    if not json_data_match:
        print(f"JSON data not matched: {url}")
        return

    json_data_str = json_data_match.group()
    json_data = json.loads(json_data_str)

    if json_data.get('errmsg') == 'SUCCESS':
        rows = json_data["data"].get("list", [])
        if not rows:
            print(f"{id}-{date} Empty")
            return

        # Export to a CSV file
        with open(csv_filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=rows[0].keys())
            writer.writeheader()
            writer.writerows(rows)

        print(f"CSV file exported to: {csv_filename}")
    else:
        print(f"The interface returns the error: {json_data.get('errmsg')}")

In [7]:
def get_data(region: str, data_type: str, move_type: str, date: str):
    """
    Get the migration data for the specified region, data_type, move_type, date and export to CSV.
    """
    timestamp = get_timestamp()
    
    # Print prompts, using either city names or raw numbers
    city_name = translation_table.get(region, region)
    print(f"Accessing {city_name}'s {data_type} ({move_type}) data，data={date} ...")
    
    download_and_convert_csv(
        data_type=data_type,
        dt=Types.region[region],  
        id=region,
        move_type=move_type,
        date=date,
        callback=str(timestamp) 
    )

In [None]:
def get_by_date(from_date, to_date, lastdate):
    # Convert a date string to a datetime object
    from_date_dt = datetime.strptime(from_date, '%Y%m%d')
    to_date_dt = datetime.strptime(to_date, '%Y%m%d')
    lastdate_dt = datetime.strptime(lastdate, '%Y%m%d')

    print("Checking the validity of date input")

    # Check if the start date is earlier than '20190112'
    earlydate = datetime(2019, 1, 12)
    if from_date_dt < earlydate:
        print("Your start date is earlier than the earliest date the API currently has. Already set the start date to the earliest date the API has and try to keep running.")
        from_date_dt = earlydate

    # Check if the end date is later than the latest date
    if to_date_dt > lastdate_dt:
        print("Your end date is later than the latest date the API currently has. Already set the end date to the latest date the API has and try to keep running.")
        to_date_dt = lastdate_dt

    # Check if start data 
    if from_date_dt > to_date_dt:
        print("Your start date needs to be earlier than your end date.")
        return

    print("Success!")

    # Convert to a string to generate a date range
    from_date = from_date_dt.strftime('%Y%m%d')
    to_date = to_date_dt.strftime('%Y%m%d')

    for date in generate_date_range(from_date, to_date):  # data
        for region in Types.region.keys():
            for move_type in Types.move_type:
                for data_type in Types.data_type:
                    get_data(region, data_type, move_type, date)
    print("Completed!")


if __name__ == "__main__":
    # Check if the file exists
    if not os.path.exists('data'):
        # If not, create a fle
        os.makedirs('data')
    lastdate = get_lastdate()  
    ### If you need to change the start and end dates, change them here
    get_by_date('20240101', '20241231', lastdate)

Successful acquisition of the latest API date：20250410
Checking the validity of date input
Success!
Accessing GuangZhou's cityrank (move_out) data，data=20240101 ...
CSV file exported to: ./data/GuangZhou_move_out_20240101.csv
Accessing ShenZhen's cityrank (move_out) data，data=20240101 ...
CSV file exported to: ./data/ShenZhen_move_out_20240101.csv
Accessing ZhuHai's cityrank (move_out) data，data=20240101 ...
CSV file exported to: ./data/ZhuHai_move_out_20240101.csv
Accessing FoShan's cityrank (move_out) data，data=20240101 ...
CSV file exported to: ./data/FoShan_move_out_20240101.csv
Accessing JiangMen's cityrank (move_out) data，data=20240101 ...
CSV file exported to: ./data/JiangMen_move_out_20240101.csv
Accessing ZhaoQing's cityrank (move_out) data，data=20240101 ...
CSV file exported to: ./data/ZhaoQing_move_out_20240101.csv
Accessing HuiZhou's cityrank (move_out) data，data=20240101 ...
CSV file exported to: ./data/HuiZhou_move_out_20240101.csv
Accessing DongGuan's cityrank (move_out)

## 1.2 Migration index data

In [15]:
# This dictionary maps city ID from the API to city names for filenames.
### If you need to fetch data for other cities, please change them here.

CitiesCode = {'GuangZhou':440100, 'ShenZhen':440300,'ZhuHai':440400, 
              'FoShan':440600,'JiangMen':440700, 'ZhaoQing':441200,
              'HuiZhou':441300, 'DongGuan':441900, 'ZhongShan':442000,}

In [16]:
def migration_index(FileTittle, classname, direction, CodeDict):
    # if direction == 'in':
        # nameofdire = 'move_in'
    if direction == 'out':
        nameofdire = 'move_out'

    output_folder = "data/1_2_index_raw"
    os.makedirs(output_folder, exist_ok=True) 
    save_path = os.path.join(output_folder, f"{FileTittle}_{nameofdire}_index_all_years.xlsx")
    workbook = xlsxwriter.Workbook(save_path)
    worksheet = workbook.add_worksheet('Sheet')

    CitiesOrder = {}  # Empty dictionary for storing city numbers
    worksheet.write(0, 0, 'City Code')  # Write to header
    worksheet.write(0, 1, 'City Name')  # Write to header
    times = 1
    for key, value in CodeDict.items():
        worksheet.write(times, 0, str(value))  # Write to city code
        worksheet.write(times, 1, str(key))  # Write to city name
        CitiesOrder[str(key)] = times  # Write to dictionary for storing city numbers
        times += 1

    for Area, Code in CodeDict.items():
        url = f'http://huiyan.baidu.com/migration/historycurve.json?dt={classname}&id={Code}&type=move_{direction}'
        print(f'{Area}: {url}')
        response = requests.get(url, timeout=2)  # request
        time.sleep(3)
        print(response.text)  # Chech if JSON is valid
        data_dict = response.json()  # Parsing JSON
        if data_dict['errmsg'] == 'SUCCESS':
            data_list = data_dict['data']['list']
            counter_date = 2  # Date counter
            dateList = []
            for date, index in data_list.items():
                dateList.append(date)  # Sort by date
            dateList.sort()
            for date in dateList:
                index = data_list[date]
                worksheet.write(0, counter_date, float(date))
                worksheet.write(CitiesOrder[str(Area)], counter_date, float(index))
                counter_date += 1  # Date counter + 1
        else:
            print('Error')
    workbook.close()

quanguo = {'whole country': 0}  # Country Code

if __name__ == "__main__":
    # migration_index('city', 'city', 'in', CitiesCode)
    migration_index('city', 'city', 'out', CitiesCode)
    print('Completed!')

GuangZhou: http://huiyan.baidu.com/migration/historycurve.json?dt=city&id=440100&type=move_out

{"errno":0,"errmsg":"SUCCESS","data":{"list":{"20190112":10.976148,"20190113":9.3407256,"20190114":9.3356388,"20190115":8.66457,"20190116":9.29394,"20190117":9.6909048,"20190118":9.9609264,"20190119":11.75634,"20190120":10.853514,"20190121":11.5344,"20190122":11.9813904,"20190123":12.9839112,"20190124":13.4913924,"20190125":14.4756396,"20190126":18.0603108,"20190127":17.7778476,"20190128":18.0008568,"20190129":17.0088012,"20190130":16.0237764,"20190131":16.361676,"20190201":17.5477428,"20190202":16.5084804,"20190203":14.2816284,"20190204":11.4904656,"20190205":7.7750604,"20190206":9.8193384,"20190207":10.2669444,"20190208":9.0518796,"20190209":7.7014476,"20190210":6.8878512,"20190211":6.3177732,"20190212":6.0326856,"20190213":6.0744816,"20190214":6.3319644,"20190215":6.3365004,"20190216":7.8240816,"20190217":6.9613344,"20190218":6.89553,"20190219":6.2445492,"20190220":6.107724,"20190221":6.0