<a href="https://colab.research.google.com/github/ZerXXX0/sales-prediction/blob/main/MLQ_Catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# Load data
df_train = pd.read_csv("train_final.csv")
df_test = pd.read_csv("test_final.csv")

# Kolom yang tidak dipakai
drop_cols = ['Unnamed: 0', 'TransactionID', 'MemberID']

# Siapkan fitur dan label
X = df_train.drop(columns=drop_cols + ['next_buy'])
y = df_train['next_buy']

# Isi missing values
X.fillna(X.median(), inplace=True)
df_test_clean = df_test.drop(columns=drop_cols)
df_test_clean.fillna(X.median(), inplace=True)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Inisialisasi dan training CatBoost
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='Logloss',
    eval_metric='AUC',
    auto_class_weights='Balanced',
    random_seed=42,
    verbose=100
)

model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

# Evaluasi
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)[:, 1]

print("ROC AUC:", roc_auc_score(y_val, y_prob))
print(classification_report(y_val, y_pred))

# Prediksi pada test data
test_predictions = model.predict(df_test_clean)



0:	test: 0.8189872	best: 0.8189872 (0)	total: 77.5ms	remaining: 1m 17s
100:	test: 0.8643446	best: 0.8643446 (100)	total: 2.9s	remaining: 25.8s
200:	test: 0.8725364	best: 0.8725364 (200)	total: 5.75s	remaining: 22.8s
300:	test: 0.8822213	best: 0.8822213 (300)	total: 8.64s	remaining: 20.1s
400:	test: 0.8921685	best: 0.8921685 (400)	total: 13.3s	remaining: 19.8s
500:	test: 0.8998191	best: 0.8998191 (500)	total: 16.2s	remaining: 16.1s
600:	test: 0.9071610	best: 0.9071610 (600)	total: 19s	remaining: 12.6s
700:	test: 0.9127549	best: 0.9127549 (700)	total: 21.9s	remaining: 9.34s
800:	test: 0.9177076	best: 0.9177076 (800)	total: 26.5s	remaining: 6.59s
900:	test: 0.9218838	best: 0.9218838 (900)	total: 29.4s	remaining: 3.23s
999:	test: 0.9256168	best: 0.9256168 (999)	total: 32.2s	remaining: 0us

bestTest = 0.9256168218
bestIteration = 999

ROC AUC: 0.9256168218025295
              precision    recall  f1-score   support

           0       0.98      0.78      0.87     21512
           1       0.

In [3]:
df_test['next_buy_predicted'] = test_predictions
df_test[['TransactionID', 'next_buy_predicted']].to_csv("prediksi_next_buy.csv", index=False)


In [5]:
df_test.to_csv("testResult.csv")

In [6]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,TransactionID,MemberID,Qty,PricePerUnit,NoOfChild,YoungestKidDOB_encoded,City_encoded,Source_encoded,FK_PRODUCT_ID_encoded,FK_PROD_GRAM_ID_encoded,next_buy_predicted
0,0,5f57263c9b0b2b3e3d7c404510dd59060999115a,c2a630e3d0dc77dac0f63424a9ae1438,1,165300.0,1,6759,0.133523,0.135032,0.016952,0.016976,0
1,1,b0c438ae346e794fef83c63a9ce17d6df73da233,3ecf7484c08418953e967a20de37051b,1,165300.0,1,6929,0.022044,0.135032,0.016952,0.016976,0
2,2,7e2498fc518f0b1489982848a29dc2677ec31367,97bbd6c99a862f20657d9b2b1c77b2c8,1,165300.0,2,6925,0.173841,0.135032,0.016952,0.016976,0
3,3,aaa52daf4ca3b8befcbe01362d8219f67293bf45,3ce072ff9c6f2f4b7c95dbc08324a24d,1,,1,6377,0.130226,0.205367,0.010196,0.010148,0
4,4,4ec62459fb8d83062284e32f3c7b5af125d242b1,ab0b0de2a1c85a40b5d58644aef745c0,2,85500.0,2,5612,0.192486,0.135032,0.010196,0.010148,0


In [8]:
# prompt: drop semua kolom kecuali memberID dan next_buy

# Siapkan fitur dan label
X = df_train[['MemberID']]
y = df_train['next_buy_predicted']

df_test_clean = df_test[['MemberID']]

# No need to fill missing values as MemberID is likely not missing

# Train-test split is not needed for this simplified scenario

# Training the model is not relevant if only memberID and next_buy are kept

# Evaluation is not relevant

# Prediction is not relevant

# Only keep MemberID and next_buy in df_train
df_train = df_train[['MemberID', 'next_buy']]

# Keep MemberID and the original next_buy (if it exists) and the predicted one in df_test
# If 'next_buy' doesn't exist in df_test initially, this will keep MemberID and 'next_buy_predicted'
df_test = df_test[['MemberID', 'next_buy', 'next_buy_predicted']]

df_test.to_csv("testResult.csv")
df_test.head()

KeyError: 'next_buy_predicted'

In [13]:
submission_df = pd.read_csv("/content/testResult.csv")
submit_df = pd.read_csv("https://raw.githubusercontent.com/ZerXXX0/sales-prediction/refs/heads/main/dataset/sample_submission.csv")

In [14]:
submission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21098 entries, 0 to 21097
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0.1             21098 non-null  int64  
 1   Unnamed: 0               21098 non-null  int64  
 2   TransactionID            21098 non-null  object 
 3   MemberID                 21098 non-null  object 
 4   Qty                      21098 non-null  int64  
 5   PricePerUnit             17917 non-null  float64
 6   NoOfChild                21098 non-null  int64  
 7   YoungestKidDOB_encoded   21098 non-null  int64  
 8   City_encoded             21066 non-null  float64
 9   Source_encoded           20914 non-null  float64
 10  FK_PRODUCT_ID_encoded    21098 non-null  float64
 11  FK_PROD_GRAM_ID_encoded  21098 non-null  float64
 12  next_buy_predicted       21098 non-null  int64  
dtypes: float64(5), int64(6), object(2)
memory usage: 2.1+ MB


In [15]:
# prompt: drop semua kolom kecuali memberid dan next)buy_predicted dan rename next_buy_pedicted menjadi next_buy

# Keep only 'MemberID' and 'next_buy_predicted'
submission_df = submission_df[['MemberID', 'next_buy_predicted']]

# Rename 'next_buy_predicted' to 'next_buy'
submission_df = submission_df.rename(columns={'next_buy_predicted': 'next_buy'})

submission_df.info()
print(submission_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21098 entries, 0 to 21097
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   MemberID  21098 non-null  object
 1   next_buy  21098 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 329.8+ KB
                           MemberID  next_buy
0  c2a630e3d0dc77dac0f63424a9ae1438         0
1  3ecf7484c08418953e967a20de37051b         0
2  97bbd6c99a862f20657d9b2b1c77b2c8         0
3  3ce072ff9c6f2f4b7c95dbc08324a24d         0
4  ab0b0de2a1c85a40b5d58644aef745c0         0


In [17]:
submit_df = pd.read_csv("https://raw.githubusercontent.com/ZerXXX0/sales-prediction/refs/heads/main/dataset/sample_submission.csv")

# First, create a new dataframe from submission_df with duplicate MemberIDs removed.
# We keep the 'last' entry for each member.
submission_df_unique = submission_df.drop_duplicates(subset=['MemberID'], keep='last')

# Now, create the lookup map from this de-duplicated dataframe.
# This will succeed because the 'MemberID' index is now unique.
next_buy_lookup = submission_df_unique.set_index('MemberID')['next_buy']

# Proceed with the map operation as before. This will now work correctly.
submit_df['next_buy'] = submit_df['MemberID'].map(next_buy_lookup)

In [18]:


submit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6381 entries, 0 to 6380
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   MemberID  6381 non-null   object
 1   next_buy  6381 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 99.8+ KB


In [19]:
submit_df.to_csv("submission.csv")

In [20]:
import pandas as pd

# Load file
df = pd.read_csv("submission.csv")

# Hapus kolom 'Unnamed: 0'
df = df.drop(columns=['Unnamed: 0'])

# Simpan kembali
df.to_csv("submission_cleaned.csv")
