In [1]:
import pandas as pd

In [2]:
# Read the three source workbooks. For request.xlsx, sheet_name=None makes
# pandas return every sheet (a mapping of sheet name -> DataFrame) instead of
# only the first one; the sheets are combined in a later cell.
request_df_sheets = pd.read_excel('../../excel-files/request.xlsx', sheet_name=None)
service_df = pd.read_excel('../../excel-files/emon.service.xlsx')
citizen_df = pd.read_excel('../../excel-files/citizen.xlsx')

In [4]:
# Stack every sheet of the request workbook into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# sheet keeps its own row labels, so the same labels repeat across sheets and
# label-based operations on the combined frame become ambiguous.
request_df = pd.concat(list(request_df_sheets.values()), ignore_index=True)

In [5]:
# Align column names across frames: both end up exposing the service key as
# `serviceid`, and the misspelled `createc_date` header becomes `created_date`.
# Rebinding (instead of inplace=True) leaves the cell safe to re-run.
request_df = request_df.rename(
    columns={"createc_date": "created_date", "service_id": "serviceid"}
)
service_df = service_df.rename(columns={"_id": "serviceid"})

In [20]:
# Distinct service ids in the service table vs. distinct service ids that
# appear in the request log.
tuple(frame["serviceid"].nunique() for frame in (service_df, request_df))

(2620, 949)

In [21]:
# Compare the service ids known to the service table (service_ids_1) with
# those referenced by requests (service_ids_2).
service_ids_1 = set(service_df['serviceid'].unique())
service_ids_2 = set(request_df['serviceid'].unique())

all_same = service_ids_1 == service_ids_2
common_service_ids = service_ids_1 & service_ids_2

print(f"All DataFrames have the same serviceid values: {all_same}")
print(f"Common serviceid values: {common_service_ids}")
print(f"Number of common serviceid values: {len(common_service_ids)}")

All DataFrames have the same serviceid values: False
Common serviceid values: {'5f4497a43b9c5d45bbcd524a', '5db7af603f379544a40cd95c', '6162f29230d778359cacf900', '600f712f473a915031ba33d5', '624136f727a742160027032b', '5d8c97a43666c358f6708ae2', '617602674ba06437e8d76d53', '6265ff9cad493f4db3ac0c0c', '64914670ed9aa21c8d568339', '6194b4a372d08b08b4d77974', '5de76d62880ae667c88c9447', '5f3f90f48c5f60027eee55f4', '61a4dde472d08b08b4d7864f', '61ad6faa9c79b860c8f9f39a', '6237e0dac82a7ea955331ac5', '622ff98127a742160026fcec', '5d8cbabb3666c358f6c026d1', '607ad0b5bd8287630d4e64e0', '628b071837ec447bff234bf5', '5ed87c351cb6477c2b28c40a', '5f439829acba9602a0705c8c', '63282b260ae1d271945eacc5', '62b03ab368b7606c0980b07a', '5d8c99223666c358f6708ae7', '6671338db98f74700f2ecd17', '628b071837ec447bff234be7', '618e0f8872d08b08b4d7734f', '6237e0dac82a7ea955331acb', '6162f29230d778359cacf8cf', '6343cc301e3ca36aa7690cc7', '5ed87b501cb6477c2b28c402', '5d89092e0384df7e7f11858a', '5d8c73da3666c358f655394a

In [23]:
# Size of each one-sided difference between the two id sets.
only_in_service_df = service_ids_1.difference(service_ids_2)
only_in_request_df = service_ids_2.difference(service_ids_1)
print(f"In service_df but not in request_df: {len(only_in_service_df)}")
print(f"In request_df but not in service_df: {len(only_in_request_df)}")

In service_df but not in request_df: 1836
In request_df but not in service_df: 165


In [24]:
# Parse the request timestamps so they can be sorted chronologically later.
request_df = request_df.assign(
    created_date=pd.to_datetime(request_df["created_date"])
)

In [25]:
# Universe of ids to index: every user / service that appears either in its
# reference table or in the request log.
all_users = set(citizen_df['userid']) | set(request_df['userid'])
all_services = set(service_df['serviceid']) | set(request_df['serviceid'])

In [26]:
# (number of distinct users, number of distinct services)
tuple(map(len, (all_users, all_services)))

(50000, 2785)

In [27]:
# Assign each raw id a dense integer index, numbered in sorted-id order so the
# numbering is the same on every run.
user_map = dict(zip(sorted(all_users), range(len(all_users))))
service_map = dict(zip(sorted(all_services), range(len(all_services))))

In [28]:
# Persist both lookup tables, one "<raw id> <index>" pair per line.
for list_path, id_map in (('user_list.txt', user_map), ('item_list.txt', service_map)):
    with open(list_path, 'w') as f:
        f.writelines(f"{raw_id} {idx}\n" for raw_id, idx in id_map.items())

In [29]:
# Replace raw ids in the request log with their integer indices.
# NOTE(review): this overwrites the raw id columns, so the cell is not
# idempotent -- running it a second time pushes the already-remapped values
# through the maps again. Re-run from the load cells if it must be repeated.
request_df = request_df.assign(
    userid=request_df['userid'].map(user_map),
    serviceid=request_df['serviceid'].map(service_map),
)

In [30]:
# Order requests chronologically; the positional split that follows relies on
# this to put the oldest rows in train and the newest in test.
request_df = request_df.sort_values('created_date')

In [31]:
# Positional 80/20 split: first 80% of rows -> train, remainder -> test.
split_idx = int(0.8 * len(request_df))
train_df, test_df = request_df.iloc[:split_idx], request_df.iloc[split_idx:]

In [32]:
# Users that only occur in the test slice have no training interactions.
# Move all of their rows into train so every test user is also a train user.
missing_users = set(test_df['userid']) - set(train_df['userid'])
if missing_users:
    unseen_mask = test_df['userid'].isin(missing_users)
    extra_train_data = test_df[unseen_mask]
    train_df = pd.concat([train_df, extra_train_data])
    test_df = test_df[~unseen_mask]

In [33]:
def save_format(filename, data):
    """Write one line per user: the user id followed by that user's service ids.

    Args:
        filename: Path of the text file to create (overwritten if present).
        data: DataFrame with `userid` and `serviceid` columns.

    Each output line is "<userid> <serviceid> <serviceid> ...", space
    separated. Users appear in the order produced by grouping on `userid`,
    and each user's services keep the row order they have in `data`
    (repeats included).
    """
    lines = []
    for user, services in data.groupby('userid')['serviceid']:
        items = ' '.join(str(service) for service in services)
        lines.append(f"{user} {items}\n")
    with open(filename, 'w') as f:
        f.writelines(lines)
# Export both splits in the per-user interaction-list format.
for split_path, split_df in (('train.txt', train_df), ('test.txt', test_df)):
    save_format(split_path, split_df)

In [34]:
# Totals of indexed users and services (sizes of the two id maps).
num_users, num_items = len(user_map), len(service_map)

In [35]:
# Distinct agency ids referenced by the service table.
all_agency = {agency_id for agency_id in service_df['govAgencyId']}
len(all_agency)

442

In [36]:
# Dense 0-based index for each agency id, numbered in sorted-id order.
kg_entity_map = dict(zip(sorted(all_agency), range(len(all_agency))))

In [37]:
# Knowledge-graph entity ids: services keep their item index, and agencies are
# numbered after them by shifting their own index by the number of services.
entity2id = dict(service_map)

offset = len(service_map)
entity2id.update(
    {agency: agency_idx + offset for agency, agency_idx in kg_entity_map.items()}
)


In [38]:
# Single relation type in this knowledge graph: service --provided_by--> agency.
relation2id = {
    "provided_by": 0,
}

# One "<head> <relation> <tail>" triple per row of service_df, in row order.
# The two needed columns are iterated directly instead of using
# DataFrame.iterrows(), which constructs a full Series for every row only to
# read two fields from it.
provided_by_id = relation2id['provided_by']
kg_triples = [
    f"{entity2id[service]} {provided_by_id} {entity2id[agency]}"
    for service, agency in zip(service_df['serviceid'], service_df['govAgencyId'])
]

In [39]:
# Knowledge-graph triples, one per line.
with open("kg_final.txt", "w") as f:
    f.writelines(f"{triple}\n" for triple in kg_triples)

# Entity and relation lookup tables, one "<name> <id>" pair per line.
for table_path, table in (("entity_list.txt", entity2id), ("relation_list.txt", relation2id)):
    with open(table_path, "w") as f:
        f.writelines(f"{name} {ident}\n" for name, ident in table.items())
