In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Read the Excel workbook at `file_path` into the working DataFrame `df`.
file_path = "month5_cleaned.xlsx"
df = pd.read_excel(io=file_path)

# Emit a confirmation line followed by a preview of the first five rows.
print(
    "Dataset loaded successfully. Here are the first few rows:",
    df.head(),
    sep="\n",
)

Dataset loaded successfully. Here are the first few rows:
                  Kargo Firmasi      Siparis Tarihi          Alici  \
0  Trendyol Express Marketplace 2024-05-01 00:00:00  DAMLA ALTUNOK   
1  Trendyol Express Marketplace 2024-05-01 00:00:00  DAMLA ALTUNOK   
2  Trendyol Express Marketplace 2024-05-01 00:02:00     Gonul Erdi   
3  Trendyol Express Marketplace 2024-05-01 00:02:00    cihan cetin   
4  Trendyol Express Marketplace 2024-05-01 00:03:00     Esra Artan   

           il      ilce                                           Urun Adi  \
0    Istanbul   Avcilar  7 Metre Tel Yaprak Serit Yapay Tac Sarmasik Ci...   
1    Istanbul   Avcilar  230 Cm Large Yaprak Yapay Hedera Helix Sarmasi...   
2       Izmir   Torbali  7 Metre Tel Yaprak Serit Yapay Tac Sarmasik Ci...   
3  Kirklareli  Babaeski  Dekoratif Yesil Yaprakli Yapay Sarmasik Ve 3 M...   
4      Manisa   Salihli  7 Metre Tel Yaprak Serit Yapay Tac Sarmasik Ci...   

           Marka  Adet  Birim Fiyat  Satis Tutari Mü

In [8]:
# Report how many missing values each column has before any rows are removed.
print("Checking for missing values:")
print(df.isnull().sum())

# Drop every row that has a missing value in any column. `df` is rebound to
# the result instead of being mutated with inplace=True.
df = df.dropna()

print("\nChecking data types:")
print(df.dtypes)

# Encode 'Cinsiyet' as integer codes: Belirtilmemis = 0, Erkek = 1, Kadin = 2.
# Any label that is not a key of this mapping becomes NaN.
gender_mapping = {'Belirtilmemis': 0, 'Erkek': 1, 'Kadin': 2}

# Only encode while the column still holds the raw labels. Mapping the
# already-encoded integers again (e.g. when this cell is re-run) would turn
# every value into NaN, because the integers are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(df['Cinsiyet']):
    df['Cinsiyet'] = df['Cinsiyet'].map(gender_mapping)

print("\nData after preprocessing:")
print(df.head())

Checking for missing values:
Kargo Firmasi                0
Siparis Tarihi               0
Alici                        0
il                           0
ilce                         0
Urun Adi                     0
Marka                        0
Adet                         0
Birim Fiyat                  0
Satis Tutari                 0
Müsteri Siparis Adedi        0
Yas                      11783
Cinsiyet                     0
dtype: int64

Checking data types:
Kargo Firmasi                    object
Siparis Tarihi           datetime64[ns]
Alici                            object
il                               object
ilce                             object
Urun Adi                         object
Marka                            object
Adet                              int64
Birim Fiyat                     float64
Satis Tutari                    float64
Müsteri Siparis Adedi            object
Yas                              object
Cinsiyet                         object
dtype: object

In [9]:
# Print the distinct values of each categorical column, one heading per column.
unique_reports = [
    ("Unique values in 'Cinsiyet':", 'Cinsiyet'),
    ("\nUnique values in 'Yas':", 'Yas'),
]
for heading, column in unique_reports:
    print(heading)
    print(df[column].unique())


Unique values in 'Cinsiyet':
[2 1 0]

Unique values in 'Yas':
['21-30' '41-50' '71+' '0-20' '31-40' '51-60' '61-70']


In [10]:
# Integer codes for the age-bracket labels in 'Yas', in ascending bracket order.
age_mapping = {
    '0-20': 0,
    '21-30': 1,
    '31-40': 2,
    '41-50': 3,
    '51-60': 4,
    '61-70': 5,
    '71+': 6
}

# Only encode while the column still holds the raw bracket labels. Mapping the
# already-encoded integers again (e.g. when this cell is re-run) would turn
# every value into NaN, because the integers are not keys of `age_mapping`.
if not pd.api.types.is_numeric_dtype(df['Yas']):
    df['Yas'] = df['Yas'].map(age_mapping)

print("\nData after converting 'Yas' to numerical values:")
print(df.head())


Data after converting 'Yas' to numerical values:
                  Kargo Firmasi      Siparis Tarihi            Alici  \
0  Trendyol Express Marketplace 2024-05-01 00:00:00    DAMLA ALTUNOK   
1  Trendyol Express Marketplace 2024-05-01 00:00:00    DAMLA ALTUNOK   
3  Trendyol Express Marketplace 2024-05-01 00:02:00      cihan cetin   
6  Trendyol Express Marketplace 2024-05-01 00:05:00  Sovket Adibelli   
7  Trendyol Express Marketplace 2024-05-01 00:06:00  Kubra Yildizhan   

           il         ilce                                           Urun Adi  \
0    Istanbul      Avcilar  7 Metre Tel Yaprak Serit Yapay Tac Sarmasik Ci...   
1    Istanbul      Avcilar  230 Cm Large Yaprak Yapay Hedera Helix Sarmasi...   
3  Kirklareli     Babaeski  Dekoratif Yesil Yaprakli Yapay Sarmasik Ve 3 M...   
6      Ankara      Cankaya  Dekoratif Yesil Yaprakli Yapay Sarmasik Ve 3 M...   
7    Istanbul  Zeytinburnu  7.5 Metre Tel Yaprak Serit Yapay Tac Sarmasik ...   

            Marka  Adet  Birim

Unnamed: 0,shipping_company,order_date,city,brand,company_order,customer_order_count,age,sex,age_group,age_group_0-20,age_group_21-30,age_group_31-40,age_group_41-50,age_group_51-60,age_group_61-70,age_group_71+,age_group_Unknown,sex_Female,sex_Male,sex_Not Specified
0,Trendyol Express Marketplace,2024-01-01 00:01:00,Kocaeli,Unikum,Hayır,1.Sipariş,Unknown,Not Specified,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,Trendyol Express Marketplace,2024-01-01 00:04:00,İstanbul,Çiçek,Hayır,1.Sipariş,Unknown,Not Specified,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,PTT Kargo Marketplace,2024-01-01 00:07:00,Kilis,casadilegno,Hayır,1.Sipariş,Unknown,Not Specified,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,PTT Kargo Marketplace,2024-01-01 00:07:00,Kilis,RDT Shop,Hayır,1.Sipariş,Unknown,Not Specified,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,Trendyol Express Marketplace,2024-01-01 00:14:00,İstanbul,BizimModa,Hayır,2.Sipariş,Unknown,Not Specified,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [11]:
# Model inputs are the encoded 'Yas' and 'Cinsiyet' columns; the value to
# predict is 'Adet'.
feature_columns = ['Yas', 'Cinsiyet']
target_column = 'Adet'

X = df[feature_columns]
y = df[target_column]

# Preview the first rows of the feature matrix and of the target.
print("\nFeatures (X):", X.head(), sep="\n")
print("\nTarget (y):", y.head(), sep="\n")



Features (X):
   Yas  Cinsiyet
0    1         2
1    1         2
3    1         2
6    3         2
7    6         2

Target (y):
0    3
1    2
3    2
6    1
7    1
Name: Adet, dtype: int64


Unnamed: 0,age_group,sex,cluster
0,Unknown,Not Specified,2
1,Unknown,Not Specified,2
2,Unknown,Not Specified,2
3,Unknown,Not Specified,2
4,Unknown,Not Specified,2


In [12]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the number of rows that ended up in each partition.
print(
    "\nData split complete:",
    f"Training set size: {X_train.shape[0]} rows",
    f"Test set size: {X_test.shape[0]} rows",
    sep="\n",
)


Data split complete:
Training set size: 10045 rows
Test set size: 2512 rows


In [13]:
# Build a 100-tree random forest regressor with a fixed seed and fit it on the
# training split; `fit` returns the fitted estimator itself.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

print("\nModel training complete.")


Model training complete.


In [14]:
# Predict on the held-out test split with the fitted random forest.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# Take the square root of the MSE explicitly. The `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in later
# releases, so relying on it fails on current versions; this form works on
# every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print("\nModel evaluation results:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


Model evaluation results:
Mean Absolute Error (MAE): 1.0493693758637808
Root Mean Squared Error (RMSE): 1.6829419441941391




In [23]:
# One-row example input expressed with the integer codes used for training:
# 'Yas' code 2 stands for '31-40', 'Cinsiyet' code 1 stands for 'Erkek'.
example_row = {'Yas': 2, 'Cinsiyet': 1}
new_data = pd.DataFrame([example_row])

# Ask the fitted `model` for the product count of that single row.
predicted_count = model.predict(new_data)
print("\nPredicted product count for the new data:", predicted_count[0])


Predicted product count for the new data: 2.3377381523064504


In [41]:
from xgboost import XGBRegressor

# Gradient-boosted regressor trained on the same split as the random forest.
# Each tree is fit on 80% of the rows (subsample) and 80% of the columns
# (colsample_bytree); the fixed seed keeps that sampling repeatable.
xgb_model = XGBRegressor(
    random_state=42,
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8
)

xgb_model.fit(X_train, y_train)

# Predict on the held-out test split.
xgb_y_pred = xgb_model.predict(X_test)

xgb_mae = mean_absolute_error(y_test, xgb_y_pred)
# Take the square root of the MSE explicitly. The `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in later
# releases, so relying on it fails on current versions; this form works on
# every version.
xgb_rmse = mean_squared_error(y_test, xgb_y_pred) ** 0.5

print("\nUpdated XGBoost model evaluation results with new hyperparameters:")
print(f"Mean Absolute Error (MAE): {xgb_mae}")
print(f"Root Mean Squared Error (RMSE): {xgb_rmse}")



Updated XGBoost model evaluation results with new hyperparameters:
Mean Absolute Error (MAE): 1.0482709230795788
Root Mean Squared Error (RMSE): 1.6821456550946459




In [29]:
# Assemble the error metrics of both models as one row per model.
comparison = pd.DataFrame(
    [
        ("Random Forest", mae, rmse),
        ("XGBoost", xgb_mae, xgb_rmse),
    ],
    columns=[
        "Model",
        "Mean Absolute Error (MAE)",
        "Root Mean Squared Error (RMSE)",
    ],
)

# Show the side-by-side table under its heading.
print("\nModel Performance Comparison:", comparison, sep="\n")


Model Performance Comparison:
           Model  Mean Absolute Error (MAE)  Root Mean Squared Error (RMSE)
0  Random Forest                   1.049369                        1.682942
1        XGBoost                   1.048443                        1.682312


In [44]:
# Example input for prediction: a single row expressed with the integer codes
# assigned to 'Yas' and 'Cinsiyet' in the preprocessing cells. This rebinds
# `new_data`, replacing the frame built for the random forest example.
new_data = pd.DataFrame({
    'Yas': [1],  # code 1 is '21-30' in age_mapping
    'Cinsiyet': [2]  # code 2 is 'Kadin' in the 'Cinsiyet' encoding
})

# Predict the product count using XGBoost
xgb_predicted_count = xgb_model.predict(new_data)
print("\nPredicted product count with XGBoost for the new data:", xgb_predicted_count[0])



Predicted product count with XGBoost for the new data: 1.7183068
