In [48]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [49]:
# Load the daily sales records and the per-store attributes.
# StateHoliday is read as str so all of its codes share a single dtype.
train = pd.read_csv('../data/raw/train.csv', dtype={'StateHoliday': str})
store = pd.read_csv('../data/raw/store.csv')

# Left join keeps every row of `train`. `validate` makes the join fail loudly
# if a Store id appears more than once in `store`, which would otherwise
# silently duplicate sales rows.
df = pd.merge(train, store, on='Store', how='left', validate='many_to_one')
df.to_csv('../data/raw/merge.csv', index=False)

In [50]:
# Print the column dtypes and non-null counts of the merged frame.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1017209 non-null  int64  
 1   DayOfWeek                  1017209 non-null  int64  
 2   Date                       1017209 non-null  str    
 3   Sales                      1017209 non-null  int64  
 4   Customers                  1017209 non-null  int64  
 5   Open                       1017209 non-null  int64  
 6   Promo                      1017209 non-null  int64  
 7   StateHoliday               1017209 non-null  str    
 8   SchoolHoliday              1017209 non-null  int64  
 9   StoreType                  1017209 non-null  str    
 10  Assortment                 1017209 non-null  str    
 11  CompetitionDistance        1014567 non-null  float64
 12  CompetitionOpenSinceMonth  693861 non-null   float64
 13  CompetitionOpenSinceYea

In [51]:
# Percentage of missing values per column.
df.isna().mean().mul(100)

Store                         0.000000
DayOfWeek                     0.000000
Date                          0.000000
Sales                         0.000000
Customers                     0.000000
Open                          0.000000
Promo                         0.000000
StateHoliday                  0.000000
SchoolHoliday                 0.000000
StoreType                     0.000000
Assortment                    0.000000
CompetitionDistance           0.259730
CompetitionOpenSinceMonth    31.787764
CompetitionOpenSinceYear     31.787764
Promo2                        0.000000
Promo2SinceWeek              49.943620
Promo2SinceYear              49.943620
PromoInterval                49.943620
dtype: float64

In [52]:
# Parse the date strings once and derive calendar features from them.
parsed_dates = pd.to_datetime(df['Date'])

# Year/Month/Day are appended as new columns; DayOfWeek overwrites the
# existing column in place with pandas' convention (0 = Monday, 6 = Sunday).
# The Date column itself is dropped afterwards.
df = df.assign(
    Year=parsed_dates.dt.year,
    Month=parsed_dates.dt.month,
    Day=parsed_dates.dt.day,
    DayOfWeek=parsed_dates.dt.day_of_week,
).drop(columns=['Date'])



In [53]:
# Competition distance -> median (more robust to outliers than the mean).
competition_distance_median = df['CompetitionDistance'].median()
df['CompetitionDistance'] = df['CompetitionDistance'].fillna(competition_distance_median)

# Remaining gaps get explicit sentinel values:
#   - CompetitionOpenSince*: 0 marks "no competition recorded"
#   - Promo2Since* / PromoInterval: 0 / 'None' mark "Promo2 not active"
df = df.fillna({
    'CompetitionOpenSinceMonth': 0,
    'CompetitionOpenSinceYear': 0,
    'Promo2SinceWeek': 0,
    'Promo2SinceYear': 0,
    'PromoInterval': 'None',
})

# Keep only the days on which the store was open, then drop the now-constant
# Open column. The guard makes the cell safe to re-run: once Open has been
# removed, filtering on it would raise KeyError.
if 'Open' in df.columns:
    df = df[df['Open'] == 1].drop(columns=['Open'])

In [54]:
# -------------------------
# 3) One-hot encode all categorical features
# -------------------------

# 1. Clean StateHoliday (important: 0 and '0' sometimes get mixed up).
#    Everything is cast to str and the '0' code is relabelled as 'None'.
holiday_code = df['StateHoliday'].astype(str).replace({'0': 'None'})
df['StateHoliday'] = holiday_code

# Simple binary flag: 1 when the row has any state-holiday code, else 0.
df['IsStateHoliday'] = holiday_code.ne('None').astype(int)

# 2. Columns to one-hot encode.
categorical_columns = ['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']

# 3. One-hot encode with pandas (simple and easy to control).
encoding_options = dict(
    columns=categorical_columns,
    prefix=categorical_columns,  # keep the generated column names readable
    prefix_sep='_',
    drop_first=False,  # keep every category
    dtype=int,  # 0/1 instead of True/False
)
df = pd.get_dummies(df, **encoding_options)
df.to_csv('../data/preprocessed/preprocessed.csv', index=False)

In [56]:
# Target is Sales; every other remaining column is used as a feature.
# NOTE(review): X still contains Customers. Presumably that value is not known
# at prediction time, which would make it target leakage -- confirm.
y = df['Sales']
X = df.drop(columns=['Sales'])

# Random 80/20 hold-out split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both partitions.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)



In [57]:
# Fit a linear-regression baseline on the scaled training features and
# predict on the held-out set.
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

# Hold-out error in the same units as Sales.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("MAE: ", mae)
print("RMSE: ", rmse)

MAE:  939.3476991183587
RMSE:  1294.0086862272244


In [58]:
# Symmetric MAPE in percent:
#   mean( 2 * |pred - actual| / (|actual| + |pred|) ) * 100
abs_error = np.abs(y_pred - y_test)
magnitude_sum = np.abs(y_test) + np.abs(y_pred)
smape = np.mean(2 * abs_error / magnitude_sum) * 100

print("SMAPE:", smape)

SMAPE: 13.929121628297239


In [59]:
import joblib

# The model was fitted on StandardScaler-transformed features, so the fitted
# scaler has to be persisted next to it; without it the saved model cannot be
# applied to raw feature values in a later session.
joblib.dump(scaler, "../model/scaler.pkl")

# Kept as the last expression so the cell still displays the model path.
joblib.dump(model, "../model/model.pkl")

['../model/model.pkl']