In [1]:
import pandas as pd

In [2]:
# Load the raw Excel workbooks.
# sheet_name=None reads every sheet of the request workbook and returns a
# dict of DataFrames keyed by sheet name; the sheets are merged in a later cell.
request_df_sheets = pd.read_excel('../../excel-files/request.xlsx', sheet_name=None)
service_df = pd.read_excel('../../excel-files/emon.service.xlsx')
citizen_df = pd.read_excel('../../excel-files/citizen.xlsx')

In [3]:
# Stack all request sheets into one frame.
# ignore_index=True replaces the per-sheet 0..n indexes with a single unique
# RangeIndex; without it the combined frame carries duplicate row labels,
# which makes any later label-based selection or alignment ambiguous.
request_df = pd.concat(list(request_df_sheets.values()), ignore_index=True)

In [4]:
# Harmonise column names so both frames expose the service key as "serviceid".
# NOTE(review): "createc_date" is presumably a typo in the source workbook's
# header -- confirm against the Excel file.
request_df = request_df.rename(
    columns={"createc_date": "created_date", "service_id": "serviceid"}
)
service_df = service_df.rename(columns={"_id": "serviceid"})

In [5]:
# Parse the request timestamps so the frame can be ordered chronologically.
request_df = request_df.assign(created_date=pd.to_datetime(request_df["created_date"]))

In [6]:
# Collect every raw id seen in either the master tables or the request log,
# so that ids appearing only in requests still receive an index.
all_users = set(citizen_df['userid']) | set(request_df['userid'])
all_services = set(service_df['serviceid']) | set(request_df['serviceid'])

In [7]:
# Sanity check: number of distinct user ids and service ids across both sources.
len(all_users), len(all_services)

(50000, 2785)

In [8]:
# Assign dense, zero-based indices to the raw ids. Sorting first makes the
# numbering deterministic across runs.
user_map = dict(zip(sorted(all_users), range(len(all_users))))
service_map = dict(zip(sorted(all_services), range(len(all_services))))

In [9]:
# Persist the raw-id -> index lookups, one "<raw_id> <index>" pair per line.
# The encoding is pinned to UTF-8 so the files are byte-identical regardless
# of the platform's locale default.
with open('user_list.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{user} {idx}\n" for user, idx in user_map.items())

with open('item_list.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{service} {idx}\n" for service, idx in service_map.items())

In [10]:
# Replace raw ids in the request log with their dense indices.
# Run this cell only once per kernel session: after it executes the columns
# hold indices, so mapping them again would look the indices up as raw ids.
request_df = request_df.assign(
    userid=request_df['userid'].map(user_map),
    serviceid=request_df['serviceid'].map(service_map),
)

In [11]:
# Order interactions from oldest to newest; the positional train/test split
# that follows relies on this ordering to be time-based.
request_df = request_df.sort_values('created_date', ascending=True)

In [12]:
# Positional 80/20 split: the earliest 80% of rows form the training set,
# the remaining rows form the test set.
split_idx = int(0.8 * len(request_df))
train_df, test_df = request_df.iloc[:split_idx], request_df.iloc[split_idx:]

In [13]:
# Users that occur only in the test split have no training interactions.
# Move all of their rows into the training split so every test user is also
# present in train.
missing_users = set(test_df['userid']).difference(set(train_df['userid']))
if missing_users:
    unseen_mask = test_df['userid'].isin(missing_users)
    extra_train_data = test_df[unseen_mask]
    train_df = pd.concat([train_df, extra_train_data])
    test_df = test_df[~unseen_mask]

In [14]:
def save_format(filename, data):
    """Write per-user interaction lists to a text file.

    Rows of ``data`` are grouped by ``userid``; for every user one line is
    written consisting of the user id followed by that user's ``serviceid``
    values, all separated by single spaces. Service ids keep the order in
    which they appear in ``data`` and repeated ids are not removed.

    Args:
        filename: Path of the file to create (overwritten if it exists).
        data: DataFrame with ``userid`` and ``serviceid`` columns.
    """
    lines = []
    for user, services in data.groupby('userid')['serviceid']:
        items = ' '.join(str(service) for service in services)
        lines.append(f"{user} {items}\n")
    with open(filename, 'w') as out:
        out.writelines(lines)
# Export both splits in the "<user> <item> <item> ..." line format.
for split_file, split_df in (('train.txt', train_df), ('test.txt', test_df)):
    save_format(split_file, split_df)

In [15]:
# Sizes of the user and item vocabularies built above.
num_users, num_items = len(user_map), len(service_map)

In [16]:
# Distinct attribute values of the services: the agencies that provide them
# and their service types. The trailing expression displays both counts.
all_agency, all_service_types = (
    set(service_df[column]) for column in ('govAgencyId', 'serviceType')
)
len(all_agency), len(all_service_types)

(442, 8)

In [17]:
# Number the non-service entities in one sequence: sorted agencies first
# (0 .. len(all_agency) - 1), immediately followed by the sorted service types.
kg_entity_map = {
    entity: idx
    for idx, entity in enumerate(sorted(all_agency) + sorted(all_service_types))
}

In [18]:
# Build the global entity-id space: services keep their item index, and
# agencies / service types are shifted past the last service index.
offset = len(service_map)

# All raw ids share one dictionary, so a raw id that is both a service id and
# an agency id / service type would silently overwrite the service's entry and
# corrupt every triple that references it. Fail loudly instead.
clashing_ids = set(service_map) & set(kg_entity_map)
if clashing_ids:
    raise ValueError(
        "raw ids used by both a service and an agency/service type: "
        f"{sorted(map(str, clashing_ids))}"
    )

entity2id = dict(service_map)
entity2id.update({entity: idx + offset for entity, idx in kg_entity_map.items()})


In [19]:
# Relation vocabulary of the knowledge graph.
relation2id = {
    "provided_by": 0,
    "type_of": 1,
}

# Every service row contributes two "<head> <relation> <tail>" triples:
# service --provided_by--> agency and service --type_of--> service type.
# Zipping the three needed columns avoids iterrows(), which builds a Series
# per row and may coerce values to a common dtype.
kg_triples = []
service_columns = zip(
    service_df['serviceid'], service_df['govAgencyId'], service_df['serviceType']
)
for raw_service, raw_agency, raw_type in service_columns:
    head = entity2id[raw_service]
    kg_triples.append(f"{head} {relation2id['provided_by']} {entity2id[raw_agency]}")
    kg_triples.append(f"{head} {relation2id['type_of']} {entity2id[raw_type]}")

In [20]:
# Persist the knowledge graph and its vocabularies.
# The encoding is pinned to UTF-8: entity_list.txt contains raw agency ids and
# service-type values, which may include non-ASCII text that the platform's
# default encoding cannot represent.
with open("kg.txt", "w", encoding="utf-8") as f:
    f.writelines(triple + "\n" for triple in kg_triples)

with open("entity_list.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{ent} {eid}\n" for ent, eid in entity2id.items())

with open("relation_list.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{rel} {rid}\n" for rel, rid in relation2id.items())
