讀取資料

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Every input CSV is read from the same folder, so the folder is named once
# here; pointing the notebook at another copy of the dataset is a one-line edit.
# NOTE(review): the path looks like a mounted Google Drive in Colab - confirm
# the drive is mounted before this cell runs.
DATA_DIR = "/content/drive/MyDrive/store-sales-time-series-forecasting"

test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
holiday_event_df = pd.read_csv(f"{DATA_DIR}/holidays_events.csv")
stores_df = pd.read_csv(f"{DATA_DIR}/stores.csv")
transaction_df = pd.read_csv(f"{DATA_DIR}/testing_transactions.csv")
oil_df = pd.read_csv(f"{DATA_DIR}/oil.csv")

合併資料

In [None]:
# Parse the `date` column of every frame that is later joined on it, so the
# merge keys share one datetime dtype. Each frame is updated in place.
for dated_frame in (test_df, holiday_event_df, oil_df, transaction_df):
    dated_frame['date'] = pd.to_datetime(dated_frame['date'])

In [None]:
# Enrich the test rows with three lookups, always as a left join so that no
# test row is dropped when a lookup has no match:
#   1. oil data, keyed by date
#   2. store attributes, keyed by store number
#   3. transactions, keyed by date and store number
test_df = (
    test_df
    .merge(oil_df, on='date', how='left')
    .merge(stores_df, on='store_nbr', how='left')
    .merge(transaction_df, on=['date', 'store_nbr'], how='left')
)

In [None]:
# Attach holiday/event information to every test row.
#   * National holidays apply to every store and are joined on `date` alone.
#   * Regional holidays are joined on (`date`, `state`).
#   * All remaining (local) holidays are joined on (`date`, `city`).
# In the last two cases the place is taken from the holiday's `locale_name`.
# NOTE(review): assumes `locale_name` holds a state name for 'Regional' rows
# and a city name for the other non-national rows - confirm against the
# dataset description.


def _first_holiday_per_key(holidays, keys):
    """Return `holidays` with at most one row per combination of `keys`.

    A left merge emits one output row for every matching right-hand row, so
    duplicated keys would add rows to the merged frame. The merged frames are
    later combined by index position, which is only meaningful while each of
    them still has exactly one row per original test row.
    """
    return holidays.drop_duplicates(subset=keys, keep='first')


def _merge_holidays_by_place(frame, holidays, place_column):
    """Left-join `holidays` onto `frame` where date and place both match.

    `place_column` is the column of `frame` that is compared with the
    holiday's `locale_name`.
    """
    return pd.merge(
        frame,
        _first_holiday_per_key(holidays, ['date', 'locale_name']),
        left_on=['date', place_column],
        right_on=['date', 'locale_name'],
        how='left',
    )


is_national = holiday_event_df['locale'] == 'National'
is_regional = holiday_event_df['locale'] == 'Regional'

national_df = _first_holiday_per_key(holiday_event_df[is_national], ['date'])
non_national_df = holiday_event_df[~is_national]

test_national_df = pd.merge(test_df, national_df, on='date', how='left')
test_regional_df = _merge_holidays_by_place(test_df, holiday_event_df[is_regional], 'state')
test_local_df = _merge_holidays_by_place(test_df, holiday_event_df[~is_national & ~is_regional], 'city')

# Local information wins over regional; national (below) wins over both.
test_non_national_df = test_local_df.combine_first(test_regional_df)

# combine_first aligns on the index, so both frames must still line up
# one-to-one with the original test rows.
expected_rows = len(test_df)
assert len(test_national_df) == expected_rows, "national holiday merge changed the row count"
assert len(test_non_national_df) == expected_rows, "non-national holiday merge changed the row count"

test_df = test_national_df.combine_first(test_non_national_df)

欄位名稱轉換（因為原始資料集有兩個type欄位分別在holiday_event_df跟stores_df）

---



In [None]:
# The merges left two clashing `type` columns behind, suffixed `_x` and `_y`.
# Give each a name that says which table it came from.
type_column_names = {
    'type_x': 'store_type',
    'type_y': 'event_type',
}
test_df = test_df.rename(columns=type_column_names)

特徵工程（1. 日期欄位拆分、2. 數值欄位 min-max normalization）

In [None]:
# Split the date into separate calendar features.
# Each pair is (new column name, attribute of the `.dt` accessor).
date_parts = (
    ('year', 'year'),
    ('month', 'month'),
    ('day', 'day'),
    ('day_of_week', 'dayofweek'),
)
for feature_name, dt_attribute in date_parts:
    test_df[feature_name] = getattr(test_df['date'].dt, dt_attribute)

In [None]:
# Min-max normalisation of the numeric columns: (x - min) / (max - min).
# NOTE(review): min and max are taken from test_df itself. If a model is
# trained on data scaled with different statistics, the two scales will not
# match - confirm which statistics should be used here.
columns_to_normalize = ['onpromotion', 'dcoilwtico', 'transactions']

column_min = test_df[columns_to_normalize].min()
column_range = test_df[columns_to_normalize].max() - column_min

# A column whose values are all equal has a range of 0, and 0 / 0 would turn
# the whole column into NaN. Dividing by 1 instead maps such a column to 0.
column_range = column_range.replace(0, 1)

test_df[columns_to_normalize] = (test_df[columns_to_normalize] - column_min) / column_range

In [None]:
# Show the frame as it stands after the merges, the date split and the
# normalisation above.
test_df

Unnamed: 0,id,date,store_nbr,family,onpromotion,dcoilwtico,city,state,store_type,cluster,transactions,event_type,locale,locale_name,description,transferred,year,month,day,day_of_week
0,3000888,2017-08-16,1,AUTOMOTIVE,0.000000,0.319392,Quito,Pichincha,D,13,0.143910,,,,,,2017,8,16,2
1,3000889,2017-08-16,1,BABY CARE,0.000000,0.319392,Quito,Pichincha,D,13,0.143910,,,,,,2017,8,16,2
2,3000890,2017-08-16,1,BEAUTY,0.003096,0.319392,Quito,Pichincha,D,13,0.143910,,,,,,2017,8,16,2
3,3000891,2017-08-16,1,BEVERAGES,0.030960,0.319392,Quito,Pichincha,D,13,0.143910,,,,,,2017,8,16,2
4,3000892,2017-08-16,1,BOOKS,0.000000,0.319392,Quito,Pichincha,D,13,0.143910,,,,,,2017,8,16,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,0.001548,0.494297,Quito,Pichincha,B,6,0.373143,,,,,,2017,8,31,3
28508,3029396,2017-08-31,9,PREPARED FOODS,0.000000,0.494297,Quito,Pichincha,B,6,0.373143,,,,,,2017,8,31,3
28509,3029397,2017-08-31,9,PRODUCE,0.001548,0.494297,Quito,Pichincha,B,6,0.373143,,,,,,2017,8,31,3
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,0.013932,0.494297,Quito,Pichincha,B,6,0.373143,,,,,,2017,8,31,3


將 test_df['family']、test_df['description'] 欄位用 BERT 模型（bert-base-uncased）轉成向量：取每段文字 [CLS] 位置的輸出作為該文字的 embedding

In [None]:
# Turn the `family` column into BERT embeddings.
from transformers import BertModel, BertTokenizer
import torch
import pandas as pd

# Each distinct family name is encoded once; the vectors are copied back onto
# the rows at the end.
unique_families = test_df['family'].unique()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

encoded_families = tokenizer(
    list(unique_families),
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    bert_output = model(**encoded_families)

# Keep the hidden state at sequence position 0 of every input as its vector.
cls_vectors = bert_output.last_hidden_state[:, 0, :].cpu().numpy()

family_embedding_map = dict(zip(unique_families, cls_vectors.tolist()))
test_df['bert_embeddings'] = test_df['family'].map(family_embedding_map)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Make the column name say which text field the embedding was computed from.
embedding_column_names = {'bert_embeddings': 'family_bert_embeddings'}
test_df = test_df.rename(columns=embedding_column_names)

In [None]:
# Turn the `description` column into BERT embeddings.
from transformers import BertModel, BertTokenizer
import torch
import pandas as pd

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

# Rows with no holiday have no description. The previous code relied on
# `astype(str)` turning those missing values into the text 'nan', which made
# the following `dropna()` a no-op, and which newer pandas versions no longer
# do (they keep the value missing). The placeholder is written explicitly so
# every row gets an embedding regardless of the pandas version.
# NOTE(review): the placeholder is embedded like any other text. Confirm that
# downstream consumers expect this rather than e.g. a zero vector.
MISSING_DESCRIPTION = 'nan'
has_description = test_df['description'].notna()
test_df['description'] = test_df['description'].astype(str).where(has_description, MISSING_DESCRIPTION)

# Encode each distinct description once, then copy the vectors back to the rows.
unique_description = test_df['description'].unique()
inputs = tokenizer(list(unique_description), padding=True, truncation=True, return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)
    # Keep the hidden state at sequence position 0 of every input as its vector.
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

description_embedding_map = {description: embedding.tolist() for description, embedding in zip(unique_description, embeddings)}
test_df['description_bert_embeddings'] = test_df['description'].map(description_embedding_map)

新增經緯度欄位

In [None]:
# Latitude / longitude for each city name, keyed exactly as the names appear
# in the `city` column.
cities_coordinates = {
    "Quito": {"lat": -0.1806532, "lon": -78.4678382},
    "Guayaquil": {"lat": -2.1709979, "lon": -79.9223592},
    "Santo Domingo": {"lat": -0.2530494, "lon": -79.1753765},
    "Cuenca": {"lat": -2.9001285, "lon": -79.0058965},
    "Latacunga": {"lat": -0.9393387, "lon": -78.6155545},
    "Manta": {"lat": -0.9676533, "lon": -80.7089101},
    "Machala": {"lat": -3.2581112, "lon": -79.9553924},
    "Ambato": {"lat": -1.2416666, "lon": -78.6195459},
    "Quevedo": {"lat": -1.0225124, "lon": -79.4604035},
    "Esmeraldas": {"lat": 0.9681789, "lon": -79.6517202},
    "Loja": {"lat": -3.9931283, "lon": -79.2042216},
    "Libertad": {"lat": -2.2311612, "lon": -80.9008852},
    "Playas": {"lat": -2.6284683, "lon": -80.3895886},
    "Daule": {"lat": -1.8621807, "lon": -79.9776688},
    "Babahoyo": {"lat": -1.8019264, "lon": -79.5346458},
    "Cayambe": {"lat": 0.0430556, "lon": -78.1459943},
    "Salinas": {"lat": -2.2171001, "lon": -80.9586051},
    "Puyo": {"lat": -1.4923925, "lon": -78.0024134},
    "Guaranda": {"lat": -1.5904732, "lon": -79.0022925},
    "Ibarra": {"lat": 0.3517083, "lon": -78.1223373},
    "Riobamba": {"lat": -1.6635508, "lon": -78.654646},
    "El Carmen": {"lat": -0.2687816, "lon": -79.466199}
}

# Look each coordinate up directly from the city name. Series.map with a dict
# yields NaN for a city that is not listed; the earlier two-step version
# guarded with `x is not None`, which NaN passes, so an unlisted city raised
# TypeError. A direct lookup leaves NaN in both columns instead.
longitude_by_city = {city: point["lon"] for city, point in cities_coordinates.items()}
latitude_by_city = {city: point["lat"] for city, point in cities_coordinates.items()}

test_df['longitude'] = test_df['city'].map(longitude_by_city)
test_df['latitude'] = test_df['city'].map(latitude_by_city)

In [None]:
# Show the frame with the embedding and coordinate columns added.
test_df

Unnamed: 0,id,date,store_nbr,family,onpromotion,dcoilwtico,city,state,store_type,cluster,...,description,transferred,year,month,day,day_of_week,family_bert_embeddings,description_bert_embeddings,longitude,latitude
0,3000888,2017-08-16,1,AUTOMOTIVE,0.000000,0.319392,Quito,Pichincha,D,13,...,,,2017,8,16,2,"[-0.12564779818058014, 0.09700334817171097, -0...","[-0.042501743882894516, 0.03686176612973213, 0...",-78.467838,-0.180653
1,3000889,2017-08-16,1,BABY CARE,0.000000,0.319392,Quito,Pichincha,D,13,...,,,2017,8,16,2,"[-0.026024261489510536, -0.01755763590335846, ...","[-0.042501743882894516, 0.03686176612973213, 0...",-78.467838,-0.180653
2,3000890,2017-08-16,1,BEAUTY,0.003096,0.319392,Quito,Pichincha,D,13,...,,,2017,8,16,2,"[-0.27227261662483215, 0.08034417033195496, -0...","[-0.042501743882894516, 0.03686176612973213, 0...",-78.467838,-0.180653
3,3000891,2017-08-16,1,BEVERAGES,0.030960,0.319392,Quito,Pichincha,D,13,...,,,2017,8,16,2,"[-0.3421946167945862, 0.4779342710971832, -0.2...","[-0.042501743882894516, 0.03686176612973213, 0...",-78.467838,-0.180653
4,3000892,2017-08-16,1,BOOKS,0.000000,0.319392,Quito,Pichincha,D,13,...,,,2017,8,16,2,"[-0.10323282331228256, 0.15401765704154968, -0...","[-0.042501743882894516, 0.03686176612973213, 0...",-78.467838,-0.180653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,0.001548,0.494297,Quito,Pichincha,B,6,...,,,2017,8,31,3,"[-0.5122681260108948, 0.09797726571559906, -0....","[-0.042501743882894516, 0.03686176612973213, 0...",-78.467838,-0.180653
28508,3029396,2017-08-31,9,PREPARED FOODS,0.000000,0.494297,Quito,Pichincha,B,6,...,,,2017,8,31,3,"[0.006092589348554611, 0.2736836075782776, -0....","[-0.042501743882894516, 0.03686176612973213, 0...",-78.467838,-0.180653
28509,3029397,2017-08-31,9,PRODUCE,0.001548,0.494297,Quito,Pichincha,B,6,...,,,2017,8,31,3,"[-0.5437078475952148, 0.41101962327957153, -0....","[-0.042501743882894516, 0.03686176612973213, 0...",-78.467838,-0.180653
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,0.013932,0.494297,Quito,Pichincha,B,6,...,,,2017,8,31,3,"[-0.2551140785217285, 0.32630085945129395, -0....","[-0.042501743882894516, 0.03686176612973213, 0...",-78.467838,-0.180653


In [None]:
# test_df.to_csv(f"DM Final Project testing data.csv.gz", compression='gzip', index=False)