In [155]:
# Clear the interactive namespace before the notebook starts; -f forces the
# reset without asking for confirmation.
%reset -f

In [173]:
import importlib
import codes.preprocess_covid_china
importlib.reload(codes.preprocess_covid_china)

from codes.preprocess_covid_china import (
    load_and_print_city_name_mapping,
    load_filtered_baidu_mobility_files,
    compute_average_mobility_matrix_with_threshold,
    extract_nodes_and_edges_from_matrix,
    inspect_mobility_data,
    load_and_filter_covid_timeseries,
    build_temporal_graph_data
)

import numpy as np

# Step 1: Load city name mappings (CH <-> EN).
ch_to_en, en_to_ch = load_and_print_city_name_mapping(
    "./data/China/dataverse_files/info/Index_City_CH_EN.csv"
)

# Step 2: Load Baidu inflow mobility data (filtered to valid CH cities, mapped to EN).
# "Valid" here means: the Chinese name has an entry in the CH -> EN mapping.
valid_chinese_cities = set(ch_to_en.keys())
mobility_data = load_filtered_baidu_mobility_files(
    data_dir="./data/China/dataverse_files",
    kind="in",
    city_ch_to_en=ch_to_en,
    valid_chinese_cities=valid_chinese_cities
)

# Step 3: Inspect the loaded matrices (prints a summary).
inspect_mobility_data(mobility_data)

# Step 4: Compute the average matrix (only using aligned cities, thresholded by
# 10% presence). Returns the matrix together with the city labels of its axes.
avg_matrix, all_cities_in_mobility = compute_average_mobility_matrix_with_threshold(
    mobility_data, threshold=0.1
)

# Step 5: Load the COVID-19 time series, restricted to cities that appear in the
# mobility matrix.
covid_df = load_and_filter_covid_timeseries(
    file_path="./data/China/Covid-19 daily cases in China/covid-19 daily confirmed cases.xlsx",
    valid_english_cities=set(all_cities_in_mobility)
)

# Step 6: Restrict the average matrix and the city list to the cities that also
# have a COVID time series (the rows of covid_df).
matched_cities = list(covid_df.index)

# Build the city -> position lookup once instead of calling list.index() (a
# linear scan) for every city. Iterating in reverse lets earlier positions
# overwrite later ones, so a duplicated city name resolves to its FIRST
# position, exactly as list.index() would.
_first_position = {
    city: idx
    for idx, city in reversed(list(enumerate(all_cities_in_mobility)))
}
matched_indices = [
    _first_position[city] for city in matched_cities if city in _first_position
]

# np.ix_ selects the same positions on both axes, keeping the matrix square and
# its rows/columns in the order of matched_indices.
filtered_avg_matrix = avg_matrix[np.ix_(matched_indices, matched_indices)]
filtered_cities = [all_cities_in_mobility[i] for i in matched_indices]

# Step 7: Build the graph using only the matched cities.
G_matched = extract_nodes_and_edges_from_matrix(filtered_avg_matrix, filtered_cities)

# Step 8: Keep only the COVID rows whose city is a node of the graph.
# NOTE(review): the row order of covid_df_final follows covid_df.index, not the
# node order of G_matched - confirm downstream code does not rely on positional
# alignment between the two.
final_cities = list(G_matched.nodes)
print(len(G_matched.nodes))
covid_df_final = covid_df.loc[covid_df.index.intersection(final_cities)]

print(f"[✓] Filtered COVID DataFrame shape: {covid_df_final.shape}")
print("[✓] Preview of aligned COVID time series data:")
print(covid_df_final.head())

# Summary of the resulting graph.
print(f"[✓] Final graph has {G_matched.number_of_nodes()} nodes and {G_matched.number_of_edges()} edges.")


[✓] Loading city name mappings from: ./data/China/dataverse_files/info/Index_City_CH_EN.csv
[✓] Total mappings: 342
[✓] No missing Chinese names
[✓] No missing English names
[✓] No duplicated Chinese names
[✓] Duplicated English names: 5
[✓] Sample mappings (CH -> EN):
    北京市 -> Beijing
    天津市 -> Tianjin
    石家庄市 -> Shijiazhuang
    唐山市 -> Tangshan
    秦皇岛市 -> Qinhuangdao
    邯郸市 -> Handan
    邢台市 -> Xingtai
    保定市 -> Baoding
    张家口市 -> Zhangjiakou
    承德市 -> Chengde
[✓] Scanning for Baidu in files in: ./data/China/dataverse_files
[✓] Loaded 540 mobility matrices for kind='in'
[✓] Sample matrix on 2020-01-01:
              Beijing  Tianjin  Shanghai  Chongqing  Shijiazhuang  Tangshan  \
Beijing           NaN    17.46      2.20       1.64          7.89     11.84   
Tianjin          7.79      NaN      0.55       0.48          2.83     28.04   
Shanghai         1.62     0.83       NaN       1.93          0.52      0.35   
Chongqing        0.55     0.35      0.70        NaN          0.

In [174]:
import torch

# Pick the torch device for the rest of the notebook.
if not torch.backends.mps.is_available():
    # No MPS backend: prefer CUDA when present, otherwise stay on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
    # NOTE(review): MPS is detected but the CPU is selected anyway - presumably
    # a deliberate fallback; confirm before switching this to "mps".
    device = torch.device("cpu")
    print('Using cpu')

Using cpu
