In [7]:
# Debug switch: when True, the data-loading cell keeps only the first 100 customer rows
DEBUG = True

In [10]:
import pandas as pd
from embeddings import TFIDFEmbedding
import numpy as np

# --- Customer records: sheet 'customer' of the raw Excel workbook ---
raw_data_path = '../input_data/DATASET.xlsx'
customer_df = pd.read_excel(raw_data_path, sheet_name='customer')
if DEBUG:
    # Debug runs keep only the first 100 rows
    customer_df = customer_df.head(100)

# --- Street -> district lookup table ---
street_district_path = '../processed_data/street_district.csv'
street_district_df = pd.read_csv(street_district_path)
street_district_dict = street_district_df.set_index('street')['district'].to_dict()
# Every known name (street or district) gathered into a single set
street_district_combined = set(street_district_dict) | set(street_district_dict.values())

# --- Embedding model constructed from the pickle under processed_data ---
embedding_model = TFIDFEmbedding('../processed_data/embedding.pkl')

# Preview the first customer rows
customer_df.head()

Unnamed: 0,customerid,DOB,gender,address,Website,job,industry
0,14,36268,Nữ,hoa khe quan thanh khe,KH0104|0345,student,computer
1,34,35901,Nam,man thai,KH0104|0333,student,health service
2,51,34319,Nam,que son quang nam,KH0104|0255,blue collar,economics
3,81,36472,Nam,truong chinh,KH0104|0293,student,economics
4,98,34608,Nam,cẩm lệ,KH0104|40580,blue collar,health service


In [11]:
# Work on a copy so that customer_df keeps the raw address text
new_customer_df = customer_df.copy()

# Build the lookup structures once instead of once per row: the candidate
# list handed to the embedding model, and a hashed set of district names so
# the membership test no longer scans every dictionary value.
# NOTE(review): the same list object is now shared by every query call; this
# assumes embedding_model.query does not mutate its candidates - confirm.
address_candidates = list(street_district_combined)
district_names = set(street_district_dict.values())


def _address_to_district(addr):
    """Resolve one raw 'address' value to a district name.

    Parameters
    ----------
    addr : object
        Raw value from the 'address' column; may be null.

    Returns
    -------
    object
        None when ``addr`` is null. Otherwise the first element returned by
        ``embedding_model.query(addr, candidates, 1)`` (presumably the best
        match - verify against the embeddings module), post-processed as
        follows: kept as-is when it is a district name, translated through
        ``street_district_dict`` when it is a street name, and returned
        unchanged when it is neither.
    """
    if pd.isnull(addr):
        return None
    best_match = embedding_model.query(addr, address_candidates, 1)[0]
    # District names take precedence over street names, as before
    if best_match in district_names:
        return best_match
    return street_district_dict.get(best_match, best_match)


# Single pass over the column: match the address, then map street -> district
new_customer_df['address'] = new_customer_df['address'].apply(_address_to_district)

# Display the updated new_customer_df head
new_customer_df.head()


Unnamed: 0,customerid,DOB,gender,address,Website,job,industry
0,14,36268,Nữ,Quận Ngũ Hành Sơn,KH0104|0345,student,computer
1,34,35901,Nam,Quận Hải Châu,KH0104|0333,student,health service
2,51,34319,Nam,Quận Hải Châu,KH0104|0255,blue collar,economics
3,81,36472,Nam,Quận Sơn Trà,KH0104|0293,student,economics
4,98,34608,Nam,Quận Cẩm Lệ,KH0104|40580,blue collar,health service


In [12]:
# Show the original frame; its 'address' column still holds the raw input text
customer_df

Unnamed: 0,customerid,DOB,gender,address,Website,job,industry
0,0000000014,36268,Nữ,hoa khe quan thanh khe,KH0104|0345,student,computer
1,0000000034,35901,Nam,man thai,KH0104|0333,student,health service
2,0000000051,34319,Nam,que son quang nam,KH0104|0255,blue collar,economics
3,0000000081,36472,Nam,truong chinh,KH0104|0293,student,economics
4,0000000098,34608,Nam,cẩm lệ,KH0104|40580,blue collar,health service
...,...,...,...,...,...,...,...
95,0000010136,33949,Nam,hải châu,KH0104|18536,specialist,health service
96,0000010165,34382,Nữ,liên chiểu,KH0104|18442,specialist,construction
97,0000010196,34058,Nữ,HAI PHONG DA NANG,KH0104|18241,blue collar,health service
98,0000010231,35098,Nam,sơn trà,KH0104|16066,blue collar,finance


In [13]:
# Show the processed copy, whose 'address' column now holds the matched district names
new_customer_df

Unnamed: 0,customerid,DOB,gender,address,Website,job,industry
0,0000000014,36268,Nữ,Quận Ngũ Hành Sơn,KH0104|0345,student,computer
1,0000000034,35901,Nam,Quận Hải Châu,KH0104|0333,student,health service
2,0000000051,34319,Nam,Quận Hải Châu,KH0104|0255,blue collar,economics
3,0000000081,36472,Nam,Quận Sơn Trà,KH0104|0293,student,economics
4,0000000098,34608,Nam,Quận Cẩm Lệ,KH0104|40580,blue collar,health service
...,...,...,...,...,...,...,...
95,0000010136,33949,Nam,Quận Hải Châu,KH0104|18536,specialist,health service
96,0000010165,34382,Nữ,Quận Liên Chiểu,KH0104|18442,specialist,construction
97,0000010196,34058,Nữ,Quận Cẩm Lệ,KH0104|18241,blue collar,health service
98,0000010231,35098,Nam,Quận Ngũ Hành Sơn,KH0104|16066,blue collar,finance


In [None]:
# Print the vectorizer object held by the embedding model
print(embedding_model.vectorizer)

TfidfVectorizer()
