## Manipulating roads file

In [1]:
import pandas as pd
# detailed_trajectory = pd.read_pickle("./data_with_trajectory_20s/courier_detailed_trajectory_20s.pkl.xz",compression="xz")
road_df=pd.read_csv("./LaDe/road-network/roads.csv",sep="\t")

In [4]:
unique_vals = road_df['city'].unique()        # numpy array
unique_list = road_df['city'].dropna().unique().tolist()
counts = road_df['city'].value_counts() 


In [8]:
print(f'Unique cities present in the dataset are : {unique_vals}') 

Unique cities present in the dataset are : ['杭州市' '吉林市' '上海市' '烟台市' '重庆市']


In [6]:
unique_list

['杭州市', '吉林市', '上海市', '烟台市', '重庆市']

In [15]:
print(f'Count of individual cities \n: {counts}')

Count of individual cities 
: city
上海市    163237
烟台市    128098
重庆市    121976
杭州市     97881
吉林市     20088
Name: count, dtype: int64


In [10]:
CITY_LITERAL_MAP = {
    "杭州市": "Hangzhou",
    "吉林市": "Jilin",
    "上海市": "Shanghai",
    "烟台市": "Yantai",
    "重庆市": "Chongqing",
}

def translate_and_update(df: pd.DataFrame, col: str = "city") -> pd.DataFrame:
    """
    Replace values in `col` with literal English translations using CITY_LITERAL_MAP,
    and create a new column 'city_chinese' containing the original Chinese values.
    Unknown values are left as-is in 'city_chinese' and mapped to empty string in 'city'.
    """
    if col not in df.columns:
        raise KeyError(f"Column '{col}' not found in DataFrame")

    # Create city_chinese column as copy with the original values
    df["city_chinese"] = df[col].astype(object)

    # Map to English literal names, use empty string for unmapped entries
    df[col] = df[col].map(CITY_LITERAL_MAP).fillna("")

    return df



In [14]:
original_road_df = road_df.copy(deep=True)

road_df = translate_and_update(road_df, "city")
print(road_df.head(5))

    osm_id  code     fclass  name  ref oneway  maxspeed  layer bridge tunnel  \
0  4296533  5141    service   NaN  NaN      B         0      0      F      F   
1  4296592  5115   tertiary   朝晖路  NaN      B         0      0      F      F   
2  4298707  5114  secondary  体育场路  NaN      F         0      0      F      F   
3  4298708  5113    primary   大关路  NaN      F         0      0      F      F   
4  4305243  5114  secondary  建国北路  NaN      F         0      0      F      F   

  city                                           geometry city_chinese  
0       LINESTRING (13373499.140421594 3538916.2947984...     Hangzhou  
1       LINESTRING (13376291.100042382 3539415.0794264...     Hangzhou  
2       LINESTRING (13375063.92510783 3538697.18123532...     Hangzhou  
3       LINESTRING (13375110.913064891 3543979.0569441...     Hangzhou  
4       LINESTRING (13377505.183804823 3540199.8977876...     Hangzhou  


In [16]:
def compare_city_counts(original_df: pd.DataFrame, new_df: pd.DataFrame, col_original: str = "city", col_new: str = "city") -> pd.DataFrame:
    """
    Compare value counts between original and new DataFrame city columns.
    - original_df: DataFrame before translation (should have original Chinese values)
    - new_df: DataFrame after translation (city column contains English; original Chinese kept elsewhere)
    - col_original: column name in original_df to count (default "city")
    - col_new: column name in new_df to count (default "city")
    Returns a DataFrame with columns: key, count_original, count_new, diff (new - original).
    """
    # compute counts
    orig_counts = original_df[col_original].value_counts(dropna=False).rename("count_original")
    new_counts = new_df[col_new].value_counts(dropna=False).rename("count_new")
    # align index (union of keys)
    combined = pd.concat([orig_counts, new_counts], axis=1).fillna(0).astype(int)
    combined["diff"] = combined["count_new"] - combined["count_original"]
    combined.index.name = "city"
    combined = combined.reset_index()
    return combined


summary = compare_city_counts(original_road_df, road_df, col_original="city", col_new="city_chinese")
print(summary.sort_values(by="count_original", ascending=False).to_string(index=False))

     city  count_original  count_new  diff
 Shanghai          163237     163237     0
   Yantai          128098     128098     0
Chongqing          121976     121976     0
 Hangzhou           97881      97881     0
    Jilin           20088      20088     0


In [None]:
road_df.to_csv("roads_translated.csv", index=False)