讀取資料

In [1]:
import pandas as pd

# Load the preprocessed training and test tables from their CSV files.
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ("train_preprocessed.csv", "test_preprocessed.csv")
)


  train_data = pd.read_csv("train_preprocessed.csv")


特徵工程 - 訓練集

In [2]:
# Keep the "_x" variants of the duplicated columns under their plain names and
# discard the "_y" duplicates (presumably left over from a merge - TODO confirm).
train_data = train_data.rename(
    columns={
        'locale_x': 'locale',
        'locale_name_x': 'locale_name',
        'description_x': 'description',
        'transferred_x': 'transferred',
    }
).drop(columns=['locale_y', 'locale_name_y', 'description_y', 'transferred_y'])

In [3]:
# Show both column indexes, one after the other, so the two schemas can be compared.
print(train_data.columns, test_data.columns, sep='\n')

Index(['id', 'date', 'store_nbr', 'family', 'onpromotion', 'dcoilwtico',
       'city', 'state', 'store_type', 'cluster', 'transactions', 'event_type',
       'locale', 'locale_name', 'description', 'transferred', 'type', 'year',
       'month', 'day', 'day_of_week', 'family_bert_embeddings',
       'description_bert_embeddings', 'longitude', 'latitude', 'sales',
       'isHoliday', 'isEvent'],
      dtype='object')
Index(['id', 'date', 'store_nbr', 'family', 'onpromotion', 'dcoilwtico',
       'city', 'state', 'store_type', 'cluster', 'transactions', 'event_type',
       'locale', 'locale_name', 'description', 'transferred', 'year', 'month',
       'day', 'day_of_week', 'family_bert_embeddings',
       'description_bert_embeddings', 'longitude', 'latitude', 'isHoliday',
       'isEvent'],
      dtype='object')


In [11]:
import numpy as np
from sklearn.decomposition import PCA

def process_embeddings(df, column_name):
    """Parse a column of stringified embedding vectors into a 2-D array.

    Each cell of ``df[column_name]`` is treated as text such as
    ``"[0.1, 0.2, 0.3]"``: the surrounding square brackets are stripped and
    the remaining comma-separated numbers are parsed as floats.

    Args:
        df: DataFrame holding the embedding column.
        column_name: Name of the column to parse.

    Returns:
        A NumPy array with one row per row of ``df``.
    """
    parsed_rows = [
        np.fromstring(cell.strip("[]"), sep=',')
        for cell in df[column_name]
    ]
    return np.vstack(parsed_rows)

# Parse the embedding columns in batches to limit peak memory use.
batch_size = 1000
# Ceiling division: no empty trailing batch is produced when the row count is
# an exact multiple of `batch_size` (np.vstack raises on an empty batch).
n_batches = (len(train_data) + batch_size - 1) // batch_size

family_embeddings = []
description_embeddings = []

for i in range(n_batches):
    batch_data = train_data.iloc[i * batch_size: (i + 1) * batch_size]
    family_embeddings.append(process_embeddings(batch_data, 'family_bert_embeddings'))
    description_embeddings.append(process_embeddings(batch_data, 'description_bert_embeddings'))

family_embeddings = np.vstack(family_embeddings)
description_embeddings = np.vstack(description_embeddings)

# Reduce each set of BERT embeddings to 10 principal components. Each column
# gets its own PCA object so that both fitted projections stay available
# (refitting a single object would discard the family projection).
family_pca = PCA(n_components=10)
description_pca = PCA(n_components=10)
family_reduced = family_pca.fit_transform(family_embeddings)
description_reduced = description_pca.fit_transform(description_embeddings)
pca = description_pca  # `pca` stays bound to the last fitted projection, as before

# Wrap the reduced vectors in DataFrames that share train_data's index, so the
# column-wise concat below pairs rows by position even if the index is not 0..n-1.
family_reduced_df = pd.DataFrame(
    family_reduced,
    columns=[f'family_{i}' for i in range(10)],
    index=train_data.index,
)
description_reduced_df = pd.DataFrame(
    description_reduced,
    columns=[f'description_{i}' for i in range(10)],
    index=train_data.index,
)

# Append the reduced components and drop the raw stringified embeddings.
train_data = pd.concat([train_data, family_reduced_df, description_reduced_df], axis=1)
train_data = train_data.drop(columns=['family_bert_embeddings', 'description_bert_embeddings'])


In [12]:
from sklearn.model_selection import train_test_split

# Model inputs: the raw columns below plus the 10 reduced components of each
# embedding column. The target is the `sales` column.
features = [
    'onpromotion', 'dcoilwtico', 'transactions', 'transferred',
    'year', 'month', 'day', 'day_of_week',
    'longitude', 'latitude', 'isHoliday', 'isEvent',
    'store_nbr', 'store_type', 'cluster',
]
features.extend(f'family_{i}' for i in range(10))
features.extend(f'description_{i}' for i in range(10))
target = 'sales'

# Missing values are replaced with 0 across the whole table.
train_data = train_data.fillna(0)

# Pick the feature columns and turn the strings 'True' / 'False' into 1 / 0.
feature_df = train_data[features].replace('True', 1).replace('False', 0)

# One-hot encode the categorical store descriptors.
feature_df = pd.get_dummies(feature_df, columns=['store_nbr', 'store_type', 'cluster'])

# Feature matrix and target vector.
X, y = feature_df, train_data[target]

# Hold out 20% of the rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

模型評估

Multiple Linear Regression

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error

# Create the linear regression estimator and fit it on the training split in
# one step (`fit` returns the estimator itself).
linear_model = LinearRegression().fit(X_train, y_train)

def postprocess_predictions(predictions):
    """Clamp predictions from below at zero, element-wise.

    Args:
        predictions: Array-like (or scalar) of raw model outputs.

    Returns:
        The same values with every negative entry replaced by 0.
    """
    lower_bound = 0
    return np.maximum(predictions, lower_bound)

# Predict on the validation split and clamp negative predictions to zero.
y_pred = postprocess_predictions(linear_model.predict(X_val))

# Root mean squared logarithmic error on the validation split.
rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred))
print(f'Simple Linear Regression RMSLE: {rmsle}')


Simple Linear Regression RMSLE: 2.702412208300194


In [10]:
# Inspect the fitted coefficients, used here as a stand-in for feature importance.

# Coefficients of the fitted linear model.
importance = linear_model.coef_

# Pair each feature name with its coefficient, largest coefficient first.
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df)

         Feature    Importance
66  store_type_A  6.253517e+13
65  store_nbr_54  5.787804e+13
76     cluster_6  5.766866e+13
27  store_nbr_16  4.980347e+13
16   store_nbr_5  4.535045e+13
..           ...           ...
61  store_nbr_50 -3.964648e+13
60  store_nbr_49 -4.064981e+13
56  store_nbr_45 -4.064981e+13
55  store_nbr_44 -4.521478e+13
68  store_type_C -4.819928e+13

[88 rows x 2 columns]


Moving Average (MA)

In [17]:
# Moving-average baseline: each prediction is the mean of the previous
# `window_size` validation targets (`shift(1)` excludes the current row).
window_size = 3
moving_average_pred = y_val.rolling(window=window_size).mean().shift(1)

# The leading rows have no complete window and are NaN; back-fill them with the
# first available average. `Series.bfill()` is used because
# `fillna(method='bfill')` is deprecated in pandas and emits a FutureWarning.
moving_average_pred = moving_average_pred.bfill()

# Root mean squared logarithmic error on the validation split.
rmsle = np.sqrt(mean_squared_log_error(y_val, moving_average_pred))
print(f'Moving Average Model RMSLE: {rmsle}')


Moving Average Model RMSLE: 3.4703451191046093


  moving_average_pred = moving_average_pred.fillna(method='bfill')


Exponential Smoothing

In [19]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Fit exponential smoothing with an additive seasonal component of period 12
# on the training targets; the name is bound to the fitted result.
exponential_smoothing_model = ExponentialSmoothing(
    y_train,
    seasonal='add',
    seasonal_periods=12,
).fit()

# Forecast one value per validation row.
y_exp_pred = exponential_smoothing_model.forecast(steps=len(y_val))

# Root mean squared logarithmic error on the validation split.
rmsle = np.sqrt(mean_squared_log_error(y_val, y_exp_pred))
print(f'Exponential Smoothing Model RMSLE: {rmsle}')


  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Exponential Smoothing Model RMSLE: 3.8130367540262813


ARIMA Models

In [22]:
from statsmodels.tsa.arima.model import ARIMA

# Build an ARIMA model of order (5, 1, 0) on the training targets, fit it, and
# forecast one step per validation row.
# NOTE(review): the recorded output of this cell ends in a MemoryError, so the
# RMSLE below was never printed in the saved run.
arima_model = ARIMA(y_train, order=(5,1,0))
arima_model_fit = arima_model.fit()
y_arima_pred = arima_model_fit.forecast(steps=len(y_val))

# Evaluate the forecast with root mean squared logarithmic error.
rmsle = np.sqrt(mean_squared_log_error(y_val, y_arima_pred))
print(f'ARIMA Model RMSLE: {rmsle}')


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  s = divide(1, s, where=large, out=s)


MemoryError: Unable to allocate 39.7 MiB for an array with shape (5, 1040682) and data type float64

特徵工程 - 測試集

In [5]:
def process_embeddings(df, column_name):
    """Turn a column of stringified embedding vectors into a 2-D array.

    Every cell of ``df[column_name]`` is treated as text of the form
    ``"[v0, v1, ...]"``; the brackets are stripped and the comma-separated
    values are parsed as floats.

    Args:
        df: DataFrame holding the embedding column.
        column_name: Name of the column to parse.

    Returns:
        A NumPy array with one row per row of ``df``.
    """
    def parse_vector(text):
        # Strip the enclosing brackets, then parse the comma-separated floats.
        return np.fromstring(text.strip("[]"), sep=',')

    return np.vstack([parse_vector(text) for text in df[column_name]])

# Parse the embedding columns in batches to limit peak memory use.
batch_size = 1000
# Ceiling division: no empty trailing batch is produced when the row count is
# an exact multiple of `batch_size` (np.vstack raises on an empty batch).
n_batches = (len(test_data) + batch_size - 1) // batch_size

family_embeddings = []
description_embeddings = []

for i in range(n_batches):
    batch_data = test_data.iloc[i * batch_size: (i + 1) * batch_size]
    family_embeddings.append(process_embeddings(batch_data, 'family_bert_embeddings'))
    description_embeddings.append(process_embeddings(batch_data, 'description_bert_embeddings'))

family_embeddings = np.vstack(family_embeddings)
description_embeddings = np.vstack(description_embeddings)

# Reduce each set of BERT embeddings to 10 principal components.
# NOTE(review): the PCA is fit on the test embeddings here, independently of
# any fit on the training embeddings, so `family_i` / `description_i` are not
# guaranteed to span the same directions as the training features. Confirm
# whether the training-set projection should be reused via `transform`.
pca = PCA(n_components=10)
family_reduced = pca.fit_transform(family_embeddings)
description_reduced = pca.fit_transform(description_embeddings)

# Wrap the reduced vectors in DataFrames that share test_data's index, so the
# column-wise concat below pairs rows by position even if the index is not 0..n-1.
family_reduced_df = pd.DataFrame(
    family_reduced,
    columns=[f'family_{i}' for i in range(10)],
    index=test_data.index,
)
description_reduced_df = pd.DataFrame(
    description_reduced,
    columns=[f'description_{i}' for i in range(10)],
    index=test_data.index,
)

# Append the reduced components and drop the raw stringified embeddings.
test_data = pd.concat([test_data, family_reduced_df, description_reduced_df], axis=1)
test_data = test_data.drop(columns=['family_bert_embeddings', 'description_bert_embeddings'])


In [14]:
# Select the same feature columns that were used to build the training matrix.
features = ['onpromotion', 'dcoilwtico', 'transactions', 'transferred', 'year', 'month', 'day', 'day_of_week', 'longitude', 'latitude', 'isHoliday', 'isEvent','store_nbr', 'store_type', 'cluster']
features += [f'family_{i}' for i in range(10)] + [f'description_{i}' for i in range(10)]
target = 'sales'

# Replace missing values with 0.
test_data = test_data.fillna(0)

# Keep only the feature columns.
test_feature_df = test_data[features]

# Turn the strings 'True' / 'False' into 1 / 0.
test_feature_df = test_feature_df.replace('True', 1)
test_feature_df = test_feature_df.replace('False', 0)

# One-hot encode the categorical store descriptors.
test_feature_df = pd.get_dummies(test_feature_df, columns=['store_nbr', 'store_type', 'cluster'])

# One-hot encoding the test set on its own can yield a different set or order
# of dummy columns than the training matrix `X`. Align to X's columns: dummies
# absent from the test set are filled with 0, and columns the model never saw
# are dropped, so `predict` receives exactly the layout it was fitted on.
test_feature_df = test_feature_df.reindex(columns=X.columns, fill_value=0)

X_pred_test = test_feature_df


模型預測並輸出

In [15]:
# Simple Linear Regression

# Predict test-set sales with the fitted linear model and clamp negatives to zero.
y_pred_test = postprocess_predictions(linear_model.predict(X_pred_test))

# Result table: one predicted `sales` value per test `id`, written without the index.
y_test_data = pd.DataFrame({'id': test_data['id'], 'sales': y_pred_test})
y_test_data.to_csv('result_slr.csv', index=False)

In [18]:
# Moving Average

window_size = 3  # adjust the window size as needed

# Rolling mean of the training sales, kept in its own Series. Writing it into
# `feature_df` would also add a target-derived `sales_MA` column to `X`, because
# `X` is bound to the same DataFrame object in the feature-selection cell.
sales_ma = train_data['sales'].rolling(window=window_size).mean()

# Use the last len(test_data) rolling means as the test-set predictions.
# `bfill()` replaces the deprecated `fillna(method='backfill')`.
y_pred_ma = sales_ma.iloc[-len(test_data):].bfill().values
y_pred_ma = np.maximum(y_pred_ma, 0)
y_test_data = pd.DataFrame({'id': test_data['id'], 'sales': y_pred_ma})

# Write the result table without the index.
y_test_data.to_csv('result_ma.csv', index=False)

  y_pred_ma = feature_df['sales_MA'].iloc[-len(test_data):].fillna(method='backfill').values


In [20]:
# Exponential Smoothing

# Additive-trend, non-seasonal smoothing over the full training sales series.
exponential_smoothing_model = ExponentialSmoothing(
    train_data['sales'],
    trend='add',
    seasonal=None,
    seasonal_periods=None,
)
exponential_smoothing_fit = exponential_smoothing_model.fit()

# Forecast one step per test row and clamp negative forecasts to zero.
y_pred_es = np.maximum(
    exponential_smoothing_fit.forecast(steps=len(test_data)).values,
    0,
)
y_test_data = pd.DataFrame({'id': test_data['id'], 'sales': y_pred_es})

# Write the result table without the index.
y_test_data.to_csv('result_es.csv', index=False)



In [73]:
# ARIMA

# ARIMA of order (5, 1, 0) over the full training sales series.
arima_model = ARIMA(train_data['sales'], order=(5, 1, 0))
arima_fit = arima_model.fit()

# Forecast one step per test row and clamp negative forecasts to zero.
y_pred_arima = np.maximum(
    arima_fit.forecast(steps=len(test_data)).values,
    0,
)
y_test_data = pd.DataFrame({'id': test_data['id'], 'sales': y_pred_arima})

# Write the result table without the index.
y_test_data.to_csv('result_arima.csv', index=False)