In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Read the raw sales workbook and preview its first five rows.
sales_data = pd.read_excel(io='datas.xlsx')
print(sales_data.head(n=5))

  order_date      city  unit  unit_price  total_price  customer_order_number  \
0 2024-01-01   Kocaeli     1       62.00         62.0                      1   
1 2024-01-01  İstanbul     1      135.00        135.0                      1   
2 2024-01-01     Kilis     4       33.75        135.0                      1   
3 2024-01-01     Kilis     1       36.00         36.0                      1   
4 2024-01-01  İstanbul     1       63.00         63.0                      2   

   age      sex  
0  NaN  unknown  
1  NaN  unknown  
2  NaN  unknown  
3  NaN  unknown  
4  NaN  unknown  


In [3]:
# Read the per-month sales/inflation workbook and preview its first five rows.
total_sales_inflation_data = pd.read_excel(io='total_sales_inflation.xlsx')
print(total_sales_inflation_data.head(n=5))

   month  total_sales  inflation_rate
0      1      2135379           49.38
1      2      2158681           51.97
2      3      2493503           61.78
3      4      2252183           71.60
4      5      2595595           75.45


In [4]:
# Build the monthly modelling table `final_data` from the row-level sales data.
# (pandas is already imported as `pd` in the notebook's import cell.)

# Derive calendar keys from the order timestamp so rows can be grouped per month.
sales_data['order_date'] = pd.to_datetime(sales_data['order_date'])
sales_data['year'] = sales_data['order_date'].dt.year
sales_data['month'] = sales_data['order_date'].dt.month

# One row per (year, month). Named aggregation labels every output column
# directly, so the counted 'order_date' column needs no follow-up rename.
monthly_totals = (
    sales_data
    .groupby(['year', 'month'])
    .agg(
        unit=('unit', 'sum'),                  # total units sold in the month
        unit_price=('unit_price', 'mean'),     # mean unit price over the month's rows
        total_price=('total_price', 'sum'),    # monthly revenue
        total_orders=('order_date', 'count'),  # rows with a non-null order_date
    )
    .reset_index()
)

# Attach the inflation figures by month number (default inner join).
# `validate='many_to_one'` makes pandas raise a MergeError if the inflation
# table ever contains the same month twice, instead of silently duplicating
# the monthly rows.
# NOTE(review): the join key is `month` only; if the sales data spans more than
# one year, equal month numbers from different years share one inflation row.
merged_data = pd.merge(
    monthly_totals,
    total_sales_inflation_data,
    on='month',
    validate='many_to_one',
)

# Keep the modelling columns, add the average number of units per order row,
# and rename the revenue column by label (not by position) so a change in the
# column selection can never mislabel the data.
final_data = (
    merged_data[['month', 'unit', 'unit_price', 'total_price', 'inflation_rate', 'total_orders']]
    .assign(unit_per_order=lambda frame: frame['unit'] / frame['total_orders'])
    .rename(columns={'total_price': 'total_sales'})
)

# Show the resulting monthly table.
print(final_data)


   month   unit  unit_price  total_sales  inflation_rate  total_orders  \
0      1  34697   66.918457   2135379.51           49.38         22652   
1      2  29974   76.413234   2158681.92           51.97         20002   
2      3  34295   78.751181   2493503.42           61.78         22747   
3      4  32692   78.716175   2252183.53           71.60         18900   
4      5  42824   69.836219   2595595.95           75.45         24340   
5      6  37548   67.042899   2169907.92           69.80         20676   
6      7  37528   83.271076   2409347.71           68.50         20175   
7      8  30205  105.253619   2223324.68           67.07         14837   
8      9  30079   78.809952   1873008.03           64.86         14944   

   unit_per_order  
0        1.531741  
1        1.498550  
2        1.507671  
3        1.729735  
4        1.759408  
5        1.816019  
6        1.860124  
7        2.035789  
8        2.012781  


In [5]:
# Feature matrix (six monthly predictors) and target vector (monthly revenue).
X = final_data.loc[
    :,
    ['month', 'unit', 'unit_price', 'inflation_rate', 'total_orders', 'unit_per_order'],
].to_numpy()
y = final_data['total_sales'].to_numpy()

# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Standardise using statistics learned from the training rows only, then apply
# the same transform to the held-out rows.
# NOTE(review): the later cells in this notebook fit and predict on the
# unscaled `X`/`y`, so this split and the scaled arrays are not consumed there.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [55]:
# Base regressor whose hyperparameters are tuned by the search below.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Discrete candidate values for each tuned hyperparameter.
param_distributions = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    max_depth=[3, 5, 7, 9, 11],
    n_estimators=[100, 500, 1000, 2000],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.3, 0.5, 0.7],
)

# 50 randomly sampled combinations, each scored by 3-fold cross-validated
# negative mean squared error; the seed fixes which combinations are drawn.
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=1,
)

# The search is fit on the full X / y arrays (not on the train split).
random_search.fit(X, y)

# Report the best combination found.
print(f'Best parameters found: {random_search.best_params_}')

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters found: {'subsample': 0.6, 'n_estimators': 500, 'max_depth': 11, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.6}


In [6]:
# Final regressor with fixed hyperparameter values.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    max_depth=11,
    learning_rate=0.1,
    gamma=0.1,
    subsample=0.6,
    colsample_bytree=0.6,
)

# Fit on the complete X / y arrays (every row, not only the training split).
xgb_model.fit(X, y)

In [7]:
# Make predictions with the fitted model for every row of X.
# NOTE(review): in this notebook the model is fit on this same X, so these are
# in-sample predictions rather than predictions on held-out rows.
y_pred = xgb_model.predict(X)


In [11]:
# Error and goodness-of-fit of y_pred measured against y.
# NOTE(review): in this notebook y_pred comes from predicting on the same X the
# model was fit on, so these are in-sample figures and do not estimate
# performance on unseen data.
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)
rmse = mse ** 0.5  # root of the mean squared error, in the units of y

print(f'Final RMSE: {rmse:.2f}')
print(f'R² Score: {r2:.2f}')


Final RMSE: 4.90
R² Score: 1.00
