In [None]:
import pandas as pd
import numpy as np

## "Clean" work

In [None]:
import kagglehub

# Download the Kaggle dataset through kagglehub; `path` is the local directory holding its files.
path = kagglehub.dataset_download("steve1215rogg/e-commerce-dataset")

# Echo the directory so the location of the CSV is visible in the cell output.
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/steve1215rogg/e-commerce-dataset?dataset_version_number=1...


100%|██████████| 89.8k/89.8k [00:00<00:00, 418kB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/steve1215rogg/e-commerce-dataset/versions/1





In [None]:
# Read the e-commerce CSV from the downloaded dataset directory and preview it.
data = pd.read_csv(f"{path}/ecommerce_dataset_updated.csv")
data.head()

Unnamed: 0,User_ID,Product_ID,Category,Price (Rs.),Discount (%),Final_Price(Rs.),Payment_Method,Purchase_Date
0,337c166f,f414122f-e,Sports,36.53,15,31.05,Net Banking,12-11-2024
1,d38a19bf,fde50f9c-5,Clothing,232.79,20,186.23,Net Banking,09-02-2024
2,d7f5f0b0,0d96fc90-3,Sports,317.02,25,237.76,Credit Card,01-09-2024
3,395d4994,964fc44b-d,Toys,173.19,25,129.89,UPI,01-04-2024
4,a83c145c,d70e2fc6-e,Beauty,244.8,20,195.84,Net Banking,27-09-2024


In [None]:
# Lower-case every column label, then shorten the final-price label.
data = (
    data
    .rename(columns=str.lower)
    .rename(columns={'final_price(rs.)': 'final_price'})
)

data.head()

Unnamed: 0,user_id,product_id,category,price (rs.),discount (%),final_price,payment_method,purchase_date
0,337c166f,f414122f-e,Sports,36.53,15,31.05,Net Banking,12-11-2024
1,d38a19bf,fde50f9c-5,Clothing,232.79,20,186.23,Net Banking,09-02-2024
2,d7f5f0b0,0d96fc90-3,Sports,317.02,25,237.76,Credit Card,01-09-2024
3,395d4994,964fc44b-d,Toys,173.19,25,129.89,UPI,01-04-2024
4,a83c145c,d70e2fc6-e,Beauty,244.8,20,195.84,Net Banking,27-09-2024


In [None]:
# Keep only the product id, category, final price and purchase date columns.
data = data.loc[:, ['product_id', 'category', 'final_price', 'purchase_date']]
data.head()

Unnamed: 0,product_id,category,final_price,purchase_date
0,f414122f-e,Sports,31.05,12-11-2024
1,fde50f9c-5,Clothing,186.23,09-02-2024
2,0d96fc90-3,Sports,237.76,01-09-2024
3,964fc44b-d,Toys,129.89,01-04-2024
4,d70e2fc6-e,Beauty,195.84,27-09-2024


In [None]:
from datetime import datetime, timedelta

def is_last_two_weeks_of_august(date):
    """
    Check if the given date falls between August 18 and August 31 (inclusive).

    Only the ``month`` and ``day`` attributes of ``date`` are read, so
    ``datetime.date`` and ``datetime.datetime`` values are both accepted
    (comparing a ``datetime`` against ``date`` bounds would raise TypeError).

    Parameters
    ----------
    date : object exposing ``month`` and ``day``
        The day to test; the year is irrelevant.

    Returns
    -------
    bool
        True when the day lies in August 18..31, False otherwise.
    """
    # August ends on the 31st, so within August "day >= 18" is the whole window.
    return date.month == 8 and date.day >= 18

def is_n_days_after(given_date, event_date, n):
    """
    Tell whether ``given_date`` lies in the ``n`` days leading up to ``event_date``.

    Despite the function's name, the accepted window ends at the event: it runs
    from ``event_date - n days`` through ``event_date``, both ends inclusive.
    Dates later than ``event_date`` are rejected.
    """
    window_start = event_date - timedelta(days=n)
    return window_start <= given_date <= event_date

In [None]:
# Holiday-name fragments and seasonal periods of interest for the sales features.
# NOTE(review): neither module-level list is read by any code in the visible part of the notebook.
important_days = ['al-Fitr', 'al-Adha']
important_ranges = ['school start', 'Ramadan']

import holidays

def extract_date(date_string, dayfirst=False):
  """
  Parse a date-like value into a ``datetime.date``.

  Args:
    date_string: Text (or any other scalar ``pd.to_datetime`` accepts) to parse.
    dayfirst: Forwarded to ``pd.to_datetime``. Pass ``True`` for day-first text
      such as ``"12-11-2024"`` meaning 12 November; the default keeps pandas'
      month-first reading of ambiguous values.

  Returns:
    The calendar date, or ``None`` when the value cannot be parsed or is
    missing (``None`` / ``NaN`` / ``NaT``).
  """
  try:
    parsed = pd.to_datetime(date_string, dayfirst=dayfirst)
  except ValueError:
    return None
  # Missing input parses to NaT instead of raising; report it as None as well so
  # callers only ever have to handle a single "no date" value.
  if pd.isna(parsed):
    return None
  return parsed.date()

# Holiday calendar for country code 'MA', limited to 2025 and 2026.
# Used by the inspection cells below; generate_feature_array builds its own calendar per call.
ma_holidays = holidays.country_holidays('MA', years=[2025,2026])

In [None]:
def generate_feature_array(date, fitr_window_days=35, adha_window_days=15):
    """
    Build the three seasonal indicator flags for a single date.

    Parameters
    ----------
    date : datetime.date
        Day to describe; its ``year`` selects which holiday calendar is built.
    fitr_window_days : int, optional
        Length of the run-up window, in days, that ends on a holiday whose name
        contains "al-Fitr". Defaults to 35.
    adha_window_days : int, optional
        Length of the run-up window, in days, that ends on a holiday whose name
        contains "al-Adha". Defaults to 15.

    Returns
    -------
    list of int
        ``[fitr, adha, school]``, each 0 or 1:
        ``fitr``   - date is within ``fitr_window_days`` before (or on) an al-Fitr day,
        ``adha``   - date is within ``adha_window_days`` before (or on) an al-Adha day,
        ``school`` - date is in the last two weeks of August.
    """
    # The following year is loaded too, presumably so that a late-December date can
    # still match the run-up to a holiday that falls early in the next year.
    year_holidays = holidays.country_holidays('MA', years=[date.year, date.year + 1])

    def in_run_up(name_fragment, window_days):
        # True when `date` is inside the window before any holiday matching the name.
        return any(
            is_n_days_after(date, holiday_date, window_days)
            for holiday_date in year_holidays.get_named(name_fragment)
        )

    return [
        int(in_run_up('al-Fitr', fitr_window_days)),
        int(in_run_up('al-Adha', adha_window_days)),
        int(is_last_two_weeks_of_august(date)),
    ]

In [None]:
# List every holiday held in `ma_holidays`, ordered by date.
for date, name in sorted(ma_holidays.items()):
  print(date, name)

2025-01-01 New Year's Day
2025-01-11 Proclamation of Independence Day
2025-01-13 Amazigh New Year
2025-03-30 Eid al-Fitr (estimated)
2025-03-31 Eid al-Fitr (estimated)
2025-05-01 Labor Day
2025-06-06 Eid al-Adha (estimated)
2025-06-07 Eid al-Adha (estimated)
2025-06-26 Islamic New Year (estimated)
2025-07-30 Throne Day
2025-08-14 Oued Ed-Dahab Day
2025-08-20 Revolution Day
2025-08-21 Youth Day
2025-09-04 Prophet's Birthday (estimated)
2025-09-05 Prophet's Birthday (estimated)
2025-11-06 Green March
2025-11-18 Independence Day
2026-01-01 New Year's Day
2026-01-11 Proclamation of Independence Day
2026-01-13 Amazigh New Year
2026-03-20 Eid al-Fitr (estimated)
2026-03-21 Eid al-Fitr (estimated)
2026-05-01 Labor Day
2026-05-27 Eid al-Adha (estimated)
2026-05-28 Eid al-Adha (estimated)
2026-06-16 Islamic New Year (estimated)
2026-07-30 Throne Day
2026-08-14 Oued Ed-Dahab Day
2026-08-20 Revolution Day
2026-08-21 Youth Day
2026-08-25 Prophet's Birthday (estimated)
2026-08-26 Prophet's Birthday

In [None]:
# Sanity check: an "al-Fitr" day is itself inside its own run-up window (the window
# includes the event date), so the first flag is expected to be 1 for each of them.
for date_moment in sorted(ma_holidays.get_named('al-Fitr')):
  print(generate_feature_array(date_moment))

[1, 0, 0]
[1, 0, 0]
[1, 0, 0]
[1, 0, 0]


In [None]:
data_grouped_by_date = data.groupby('purchase_date')  # NOTE(review): not used by any visible cell

# purchase_date is day-first text (DD-MM-YYYY, e.g. "27-09-2024"). Parse it with an explicit
# format: pandas' default inference reads ambiguous values such as "12-11-2024" month-first,
# which would move the purchase to the wrong day and corrupt the holiday flags.
purchase_dates = pd.to_datetime(data['purchase_date'], format='%d-%m-%Y').dt.date

# Evaluate each distinct day once instead of once per row; generate_feature_array
# builds a holiday calendar on every call.
flags_by_date = {day: generate_feature_array(day) for day in purchase_dates.unique()}
fitr, kbir, school = zip(*(flags_by_date[day] for day in purchase_dates))
data = data.assign(fitr=list(fitr), kbir=list(kbir), school=list(school))

# Order rows chronologically; sorting the raw DD-MM-YYYY text would order by day of month first.
data = data.sort_values(
    by='purchase_date',
    key=lambda column: pd.to_datetime(column, format='%d-%m-%Y'),
    kind='stable',
)

# One-hot encode the product category and append the indicator columns.
encoded_categories = pd.get_dummies(data['category'], prefix='category')
data = pd.concat([data, encoded_categories], axis=1)
data.head()

  return pd.to_datetime(date_string).date()


Unnamed: 0,product_id,category,final_price,purchase_date,fitr,kbir,school,category_Beauty,category_Books,category_Clothing,category_Electronics,category_Home & Kitchen,category_Sports,category_Toys
2920,e3c44ef7-d,Home & Kitchen,89.13,01-01-2024,0,0,0,False,False,False,False,True,False,False
1062,44399f1d-0,Beauty,280.19,01-01-2024,0,0,0,True,False,False,False,False,False,False
2459,1791f55a-d,Sports,162.22,01-01-2024,0,0,0,False,False,False,False,False,True,False
2953,a8aaaab9-c,Electronics,27.62,01-01-2024,0,0,0,False,False,False,True,False,False,False
3173,ae024f47-f,Clothing,120.66,01-01-2024,0,0,0,False,False,True,False,False,False,False


In [None]:
# Drop the identifier and the raw category/date text; only the price, the seasonal flags
# and the category indicator columns remain.
data = data.drop(columns=['product_id', 'category', 'purchase_date'])
data.head()

Unnamed: 0,final_price,fitr,kbir,school,category_Beauty,category_Books,category_Clothing,category_Electronics,category_Home & Kitchen,category_Sports,category_Toys
2920,89.13,0,0,0,False,False,False,False,True,False,False
1062,280.19,0,0,0,True,False,False,False,False,False,False
2459,162.22,0,0,0,False,False,False,False,False,True,False
2953,27.62,0,0,0,False,False,False,True,False,False,False
3173,120.66,0,0,0,False,False,True,False,False,False,False


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Separate the regression target from the predictors.
y = data['final_price']               # target: the final price
X = data.drop(columns='final_price')  # predictors: every remaining column

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

In [None]:
# X mixes integer flag columns with boolean indicator columns, so a plain `.values`
# yields an object-dtype array. Ask for a numeric array explicitly; float32 is the
# precision XGBoost stores feature values in.
X_train_np = X_train.to_numpy(dtype=np.float32)  # training features as a float matrix
X_test_np = X_test.to_numpy(dtype=np.float32)    # test features as a float matrix

In [None]:
# Baseline: XGBoost regressor with a squared-error objective and otherwise default settings.
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train_np, y_train)  # Fit on the NumPy feature matrix

In [None]:
# Evaluate the baseline model on the held-out rows.
y_pred = model.predict(X_test_np)                                     # Predict prices for the test set
rmse = np.sqrt(mean_squared_error(y_test, y_pred))                # RMSE = square root of the mean squared error
print(f"Root Mean Squared Error: {rmse}")                          # Report the error in price units

Root Mean Squared Error: 120.40572465647625


In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive hyper-parameter grid: 3 learning rates x 8 depths (2..9) x 49 tree counts
# (100..4900 in steps of 100) = 1,176 candidates.
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': range(2, 10),
    'n_estimators': range(100, 5000, 100),
}

# 5-fold cross-validation over the grid (1,176 x 5 = 5,880 fits), scored by negative RMSE
# so that a larger score means a smaller error.
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
                           param_grid=param_grid,
                           scoring='neg_root_mean_squared_error',
                           cv=5)

grid_search.fit(X_train_np, y_train)

# Keep the winning parameter combination and the estimator fitted with it.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [None]:
# Persist the tuned model to JSON (relative to the current working directory).
best_model.save_model('xgboost_model.json')

In [None]:
# Show the winning hyper-parameters.
# NOTE(review): in the recorded output every value is the smallest one offered by the grid.
best_params

{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 100}

In [None]:
# prompt: use the saved model 'xgboost_model.json' and use it to make a prediction

import xgboost as xgb
import pandas as pd
import numpy as np

# Load the saved model
model = xgb.XGBRegressor()
model.load_model('xgboost_model.json')

# The model was fitted on a plain NumPy array, so features are matched by position only.
# The input must therefore provide exactly the ten training columns, in training order:
# the three seasonal flags followed by the seven category indicators.
# Sample input (replace with your actual data): a Beauty purchase in the run-up to Eid al-Adha.
input_data = pd.DataFrame({
    'fitr': [0],
    'kbir': [1],
    'school': [0],
    'category_Beauty': [1],
    'category_Books': [0],
    'category_Clothing': [0],
    'category_Electronics': [0],
    'category_Home & Kitchen': [0],
    'category_Sports': [0],
    'category_Toys': [0],
})


# Make predictions
X_input_np = input_data.values
predictions = model.predict(X_input_np)

print("Predictions:", predictions)

In [None]:
# Evaluate the grid-search winner on the same held-out rows as the baseline.
y_pred = best_model.predict(X_test_np)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 119.36517687799139


In [None]:
# Only names from sklearn submodules were imported earlier (`from sklearn... import ...`),
# which does not bind `sklearn` itself; import the package before reading its version.
import sklearn

sklearn.__version__

NameError: name 'sklearn' is not defined

In [None]:
import xgboost
# Record the installed XGBoost version alongside the results.
xgboost.__version__

'2.1.2'

## Test: holiday lookup on the online retail dataset

In [None]:
# Load the online retail CSV from the working directory.
# NOTE: this rebinds `data`, replacing the e-commerce frame used in the section above.
data = pd.read_csv('online_retail.csv')
data.head()

Unnamed: 0,index,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [None]:
# Keep the United Kingdom rows and the six columns of interest in one selection, then summarise.
data = data.loc[
    data['Country'] == 'United Kingdom',
    ['InvoiceNo', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'Country'],
]
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32906 entries, 0 to 34741
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    32906 non-null  object 
 1   Description  32789 non-null  object 
 2   Quantity     32906 non-null  int64  
 3   InvoiceDate  32906 non-null  object 
 4   UnitPrice    32906 non-null  float64
 5   Country      32906 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 1.8+ MB


In [None]:
# Summary statistics of the numeric columns (Quantity and UnitPrice).
data.describe()

Unnamed: 0,Quantity,UnitPrice
count,32906.0,32906.0
mean,7.869933,6.370098
std,63.958385,158.115048
min,-9360.0,0.0
25%,1.0,1.28
50%,2.0,2.51
75%,6.0,4.25
max,2880.0,13541.33


In [None]:
# Display the rows with a strictly positive unit price.
# NOTE(review): the result is only displayed, not assigned, so `data` still contains
# the rows with UnitPrice == 0 - confirm whether a filter was intended.
data[data['UnitPrice']>0]

Unnamed: 0,InvoiceNo,Description,Quantity,InvoiceDate,UnitPrice,Country
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,United Kingdom
1,536365,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,United Kingdom
2,536365,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,United Kingdom
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,United Kingdom
4,536365,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,United Kingdom
...,...,...,...,...,...,...
34737,539423,RED RETROSPOT CAKE STAND,3,12/17/2010 14:24,10.95,United Kingdom
34738,539423,RED RETROSPOT MUG,6,12/17/2010 14:24,2.95,United Kingdom
34739,539423,DANISH ROSE FOLDING CHAIR,6,12/17/2010 14:24,7.95,United Kingdom
34740,539423,FELTCRAFT HAIRBAND PINK AND PURPLE,12,12/17/2010 14:24,0.85,United Kingdom


In [None]:
pip install --upgrade holidays



In [None]:
import holidays

def extract_date(date_string, dayfirst=False):
  """
  Parse a date-like value into a ``datetime.date``.

  Args:
    date_string: Text (or any other scalar ``pd.to_datetime`` accepts) to parse.
    dayfirst: Forwarded to ``pd.to_datetime``. Pass ``True`` for day-first text
      such as ``"12-11-2024"`` meaning 12 November; the default keeps pandas'
      month-first reading of ambiguous values.

  Returns:
    The calendar date, or ``None`` when the value cannot be parsed or is
    missing (``None`` / ``NaN`` / ``NaT``).
  """
  try:
    parsed = pd.to_datetime(date_string, dayfirst=dayfirst)
  except ValueError:
    return None
  # Missing input parses to NaT instead of raising; report it as None as well so
  # callers only ever have to handle a single "no date" value.
  if pd.isna(parsed):
    return None
  return parsed.date()

# Add a 'Date' column holding the calendar date of each invoice (time of day dropped).
# InvoiceDate is month-first text (e.g. "12/17/2010 14:24"), which extract_date's default
# month-first parsing handles.
data['Date'] = data['InvoiceDate'].apply(extract_date)
data.head()


NameError: name 'data' is not defined

In [None]:
# Holiday calendar for country code 'MA' with no fixed years and the default language.
# NOTE(review): the Arabic holiday names in the lookup output further down suggest the
# last recorded run passed language='ar' - confirm which variant is intended.
ma_holidays = holidays.country_holidays('MA')# language='ar')

In [None]:
# List every holiday currently held in `ma_holidays`, ordered by date.
for date, name in sorted(ma_holidays.items()):
    print(date, name)

In [None]:
# Look up the holiday name for every invoice date (None when the day is not a holiday),
# then show only the rows that matched one.
isHoliday = data['Date'].map(ma_holidays.get)
isHoliday.dropna()

Unnamed: 0,Date
14022,(تقدير) رأس السنة الهجرية
14023,(تقدير) رأس السنة الهجرية
14024,(تقدير) رأس السنة الهجرية
14025,(تقدير) رأس السنة الهجرية
14026,(تقدير) رأس السنة الهجرية
...,...
16980,(تقدير) رأس السنة الهجرية
16981,(تقدير) رأس السنة الهجرية
16982,(تقدير) رأس السنة الهجرية
16983,(تقدير) رأس السنة الهجرية
