In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/content/train.csv')
df = df.sort_values('Date')

  df = pd.read_csv('/content/train.csv')


In [3]:
# feature extraction

df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

df['IsWeekend'] = df['DayOfWeek'].apply(lambda x: 1 if x in [6, 7] else 0)

df['Sales7daysago'] = df.groupby('Store')['Sales'].shift(7)
df['Sales14daysago'] = df.groupby('Store')['Sales'].shift(14)
df['Sales28daysago'] = df.groupby('Store')['Sales'].shift(28)

df['RollingMean7'] = df.groupby('Store')['Sales'].transform(lambda x: x.shift(1).rolling(window=7).mean())

In [4]:
df.isna().sum()

Unnamed: 0,0
Store,0
DayOfWeek,0
Date,0
Sales,0
Customers,0
Open,0
Promo,0
StateHoliday,0
SchoolHoliday,0
Year,0


In [5]:
# data cleaning

df = df.dropna(axis=0)

df['StateHoliday'] = df['StateHoliday'].astype(str).map({'0':0, 'a':1, 'b':2, 'c':3})

In [6]:
store_info = pd.read_csv('/content/store.csv')

df = pd.merge(df, store_info, on='Store', how='left')

df['CompetitionDistance'] = df['CompetitionDistance'].fillna(df['CompetitionDistance'].median())
df['CompetitionOpenSinceYear'] = df['CompetitionOpenSinceYear'].fillna(2000)

df['StoreType'] = df['StoreType'].map({'a':1, 'b':2, 'c':3, 'd':4})
df['Assortment'] = df['Assortment'].map({'a':1, 'b':2, 'c':3})
df['PromoInterval'] = df['PromoInterval'].map({'Jan,Apr,Jul,Oct':1, 'Feb,May,Aug,Nov':2, 'Mar,Jun,Sep,Dec':3})
df['PromoInterval'] = df['PromoInterval'].fillna(0)

In [7]:
# splitting

split_date = df['Date'].max() - pd.Timedelta(days=6*7)

train = df[df['Date'] < split_date]
train = train[train['Open'] == 1]

test = df[df['Date'] >= split_date]

store_avg = train.groupby('Store')['Sales'].mean().to_dict()

store_dow_avg = train.groupby(['Store', 'DayOfWeek'])['Sales'].mean().reset_index()
store_dow_avg.columns = ['Store', 'DayOfWeek', 'StoreDowAvg']

train['StoreAvg'] = train['Store'].map(store_avg)
test['StoreAvg'] = test['Store'].map(store_avg)

train_data = pd.merge(train, store_dow_avg, on=['Store', 'DayOfWeek'], how='left')
test_data = pd.merge(test, store_dow_avg, on=['Store', 'DayOfWeek'], how='left')

train_data = train_data[train_data['Open'] == 1]

drop_cols = ['Sales', 'Date', 'Customers']

X_train = train_data.drop(drop_cols, axis=1)
y_train = train_data['Sales']

X_test = test_data.drop(drop_cols, axis=1)
y_test = test_data['Sales']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['StoreAvg'] = test['Store'].map(store_avg)


In [8]:
!pip install xgboost



In [9]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

def rmspe(y_true, y_pred):
    mask = y_true != 0
    y_true = y_true[mask]
    y_pred = y_pred[mask]

    diff = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(diff**2))


y_train_log = np.log1p(y_train)

model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42
)

model.fit(X_train, y_train_log)

pred_log = model.predict(X_test)
pred = np.expm1(pred_log)

if 'Open' in X_test.columns:
    pred[X_test['Open'] == 0] = 0

print(f"RMSPE: {rmspe(y_test, pred)}")

RMSPE: 0.12459077623434212
