<a href="https://colab.research.google.com/github/ZerXXX0/sales-prediction/blob/main/MLQ_LGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

# Load the training and test sets from the project repository.
train = pd.read_csv('https://raw.githubusercontent.com/ZerXXX0/sales-prediction/refs/heads/main/dataset/train_final.csv')
test = pd.read_csv('https://raw.githubusercontent.com/ZerXXX0/sales-prediction/refs/heads/main/dataset/test_final.csv')

# Separate features from the target. The identifier columns and 'Unnamed: 0'
# (presumably a saved row index - TODO confirm) are not used as features.
drop_cols = ['Unnamed: 0', 'TransactionID', 'MemberID']
X = train.drop(columns=drop_cols + ['next_buy'])
y = train['next_buy']

# Split BEFORE imputing. Fitting the imputer on the full frame would let the
# validation rows influence the column means used to fill the training rows
# (preprocessing leakage), which makes the validation score optimistic.
# test_size and random_state are unchanged from the earlier version.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the column means from the training split only, then apply them to both splits.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_val = imputer.transform(X_val_raw)

# Kept so that any code still referring to the fully imputed matrix keeps working.
X_imputed = imputer.transform(X)

# Invert the binary target: after this, class 1 corresponds to next_buy == 0.
# NOTE(review): the prediction cell stores the model output in a column named
# 'next_buy' without undoing this inversion - confirm that this is intended.
y_train = 1 - y_train
y_val = 1 - y_val

In [48]:
# Fit a LightGBM classifier with library-default hyper-parameters; only the
# seed is pinned so that repeated runs give the same model.
lgbm = lgb.LGBMClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out split and print per-class precision / recall / F1.
y_pred = lgbm.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_pred))



[LightGBM] [Info] Number of positive: 86060, number of negative: 18623
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005750 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 982
[LightGBM] [Info] Number of data points in the train set: 104683, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.822101 -> initscore=1.530647
[LightGBM] [Info] Start training from score 1.530647
              precision    recall  f1-score   support

           0       0.80      0.37      0.51      4671
           1       0.88      0.98      0.93     21500

    accuracy                           0.87     26171
   macro avg       0.84      0.68      0.72     26171
weighted avg       0.86      0.87      0.85     26171





In [61]:
import joblib

# Persist the fitted classifier in the working directory. joblib.dump returns
# the list of files it wrote, which the notebook echoes as the cell output.
joblib.dump(value=lgbm, filename='lgbm_model_best.pkl')

['lgbm_model_best.pkl']

In [49]:
# Select exactly the columns the model was trained on, in training order.
# Picking them by name (instead of dropping the id columns and passing a bare
# array) keeps the feature names attached so scikit-learn can validate them,
# and lets this cell be re-run after 'next_buy' has been added to `test`.
X_test = test[list(X.columns)]
X_test_imputed = imputer.transform(X_test)
test_preds = lgbm.predict(X_test_imputed)

# Add the predictions to the dataframe.
# NOTE(review): the model was fitted on the inverted target (1 - next_buy), so
# these values are in the inverted encoding - confirm that is what is wanted.
test['next_buy'] = test_preds



In [50]:
# Inspect the test frame (columns, dtypes, non-null counts) now that it carries the predictions.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21098 entries, 0 to 21097
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               21098 non-null  int64  
 1   TransactionID            21098 non-null  object 
 2   MemberID                 21098 non-null  object 
 3   Qty                      21098 non-null  int64  
 4   PricePerUnit             17917 non-null  float64
 5   NoOfChild                21098 non-null  int64  
 6   YoungestKidDOB_encoded   21098 non-null  int64  
 7   City_encoded             21066 non-null  float64
 8   Source_encoded           20914 non-null  float64
 9   FK_PRODUCT_ID_encoded    21098 non-null  float64
 10  FK_PROD_GRAM_ID_encoded  21098 non-null  float64
 11  next_buy                 21098 non-null  int64  
dtypes: float64(5), int64(5), object(2)
memory usage: 1.9+ MB


In [51]:
# Reduce the test frame to the member identifier and its prediction; every
# other column (row index, transaction id and all feature columns) is discarded.
# Selecting the columns to keep, rather than listing the ones to drop, gives the
# same two-column frame and does not raise a KeyError when the cell is re-run
# on the already reduced frame.
test = test[['MemberID', 'next_buy']]

# Display information about the reduced test dataframe
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21098 entries, 0 to 21097
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   MemberID  21098 non-null  object
 1   next_buy  21098 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 329.8+ KB


In [52]:
# Write the per-transaction predictions (MemberID, next_buy) to disk.
# index=False keeps the RangeIndex out of the file; otherwise it is written as
# an unnamed first column that reads back as 'Unnamed: 0'. This also matches
# how the submission file is exported further down.
test.to_csv("test_final.csv", index=False)

In [53]:
# Load the submission template from the project repository.
submission_sample = pd.read_csv('https://raw.githubusercontent.com/ZerXXX0/sales-prediction/refs/heads/main/dataset/sample_submission.csv')

In [54]:
# Inspect the template's schema (columns, dtypes, non-null counts) before filling it in.
submission_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6381 entries, 0 to 6380
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MemberID  6381 non-null   object 
 1   next_buy  0 non-null      float64
dtypes: float64(1), object(1)
memory usage: 99.8+ KB


In [55]:
# Collapse `test` to a single row per member. Where a MemberID occurs on more
# than one row, keep='last' retains its final occurrence.
submission_df_unique = test.drop_duplicates(subset='MemberID', keep='last')

# Turn the de-duplicated predictions into a lookup Series indexed by MemberID.
# The index is unique at this point, which the mapping step below relies on.
next_buy_lookup = submission_df_unique.set_index('MemberID').loc[:, 'next_buy']

# Fill the template's next_buy column by looking up every MemberID.
# A MemberID that is absent from the lookup would come out as NaN.
submission_sample = submission_sample.assign(
    next_buy=submission_sample['MemberID'].map(next_buy_lookup)
)

In [56]:
# Re-inspect the template after the mapping step to check the non-null count and dtype of next_buy.
submission_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6381 entries, 0 to 6380
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   MemberID  6381 non-null   object
 1   next_buy  6381 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 99.8+ KB


In [57]:
# Preview the first rows of the filled submission frame.
submission_sample.head()

Unnamed: 0,MemberID,next_buy
0,c2a630e3d0dc77dac0f63424a9ae1438,1
1,3ecf7484c08418953e967a20de37051b,1
2,97bbd6c99a862f20657d9b2b1c77b2c8,1
3,3ce072ff9c6f2f4b7c95dbc08324a24d,1
4,ab0b0de2a1c85a40b5d58644aef745c0,0


In [58]:
# Write the submission file; index=False leaves the row index out of the CSV.
submission_sample.to_csv("submit_LGBM.csv", index=False)

In [59]:
# Schema check of the frame that was just exported (repeats the earlier info() cell).
submission_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6381 entries, 0 to 6380
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   MemberID  6381 non-null   object
 1   next_buy  6381 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 99.8+ KB


In [60]:
# Preview of the frame that was just exported (repeats the earlier head() cell).
submission_sample.head()

Unnamed: 0,MemberID,next_buy
0,c2a630e3d0dc77dac0f63424a9ae1438,1
1,3ecf7484c08418953e967a20de37051b,1
2,97bbd6c99a862f20657d9b2b1c77b2c8,1
3,3ce072ff9c6f2f4b7c95dbc08324a24d,1
4,ab0b0de2a1c85a40b5d58644aef745c0,0
