In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import GridSearchCV

In [15]:
df = pd.read_csv('cleaned_df.csv')

print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109880 entries, 0 to 109879
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   order_id                  109880 non-null  object 
 1   customer_unique_id        109880 non-null  object 
 2   product_id                109880 non-null  object 
 3   seller_id                 109880 non-null  object 
 4   order_date                109880 non-null  object 
 5   price                     109880 non-null  float64
 6   freight_value             109880 non-null  float64
 7   customer_zip_code_prefix  109880 non-null  int64  
 8   customer_city             109880 non-null  object 
 9   customer_state            109880 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 8.4+ MB
None


Unnamed: 0,order_id,customer_unique_id,product_id,seller_id,order_date,price,freight_value,customer_zip_code_prefix,customer_city,customer_state
0,e481f51cbdc54678b7cc49136f2d6af7,7c396fd4830fd04220f754e42b4e5bff,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,2017-10-02 10:56:33,29.99,8.72,3149,sao paulo,SP
1,53cdb2fc8bc7dce0b6741e2150273451,af07308b275d755c9edb36a90c618231,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,2018-07-24 20:41:37,118.7,22.76,47813,barreiras,BA
2,47770eb9100c2d0c44946d9cf07ec65d,3a653a41f6f9fc3d2a113cf8398680e8,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,2018-08-08 08:38:49,159.9,19.22,75265,vianopolis,GO
3,949d5b44dbf5de918fe9c16f97b45f8a,7c142cf63193a1473d2e66489a9ae977,d0b61bfb1de832b15ba9d266ca96e5b0,66922902710d126a0e7d26b0e3805106,2017-11-18 19:28:06,45.0,27.2,59296,sao goncalo do amarante,RN
4,ad21c59c0840e6cb83a9ceb5573f8159,72632f0f9dd73dfee390c9b22eb56dd6,65266b2da20d04dbe00c5c2d3bb7859e,2c9e548be18521d1c43cde1c582c6de8,2018-02-13 21:18:39,19.9,8.72,9195,santo andre,SP


In [16]:
mapper = {
    'order_date': 'first',
    'price': ['sum', 'mean'],
    'freight_value': ['sum', 'mean'],
    'customer_state': 'first'
}
df['order_date'] = pd.to_datetime(df['order_date']).dt.to_period('D')
unique_invoice = df.groupby(['order_id', 'customer_unique_id']).agg(mapper).reset_index()
unique_invoice.columns = pd.Index([f'{e[0]}_' + e[1] if (e[1] != '') and (e[1] != 'first') else e[0] for e in unique_invoice.columns.tolist()])

print(unique_invoice.info())
unique_invoice.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96211 entries, 0 to 96210
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype    
---  ------              --------------  -----    
 0   order_id            96211 non-null  object   
 1   customer_unique_id  96211 non-null  object   
 2   order_date          96211 non-null  period[D]
 3   price_sum           96211 non-null  float64  
 4   price_mean          96211 non-null  float64  
 5   freight_value_sum   96211 non-null  float64  
 6   freight_value_mean  96211 non-null  float64  
 7   customer_state      96211 non-null  object   
dtypes: float64(4), object(3), period[D](1)
memory usage: 5.9+ MB
None


Unnamed: 0,order_id,customer_unique_id,order_date,price_sum,price_mean,freight_value_sum,freight_value_mean,customer_state
0,00010242fe8c5a6d1ba2dd792cb16214,871766c5855e863f6eccc05f988b23cb,2017-09-13,58.9,58.9,13.29,13.29,RJ
1,00018f77f2f0320c557190d7a144bdd3,eb28e67c4c0b83846050ddfb8a35d051,2017-04-26,239.9,239.9,19.93,19.93,SP
2,000229ec398224ef6ca0657da4fc703e,3818d81c6709e39d06b2738a8d3a2474,2018-01-14,199.0,199.0,17.87,17.87,MG
3,00024acbcdf0a6daa1e931b038114c75,af861d436cfc08b2c2ddefd0ba074622,2018-08-08,12.99,12.99,12.79,12.79,SP
4,00042b26cf59d7ce69dfabb4e55b4fd9,64b576fb70d441e8f1b2d7d446e483c5,2017-02-04,199.9,199.9,18.14,18.14,SP


In [17]:
unique_invoice['time_diff'] = unique_invoice.groupby('customer_unique_id')['order_date'].diff()

unique_invoice['does_purchase_2_times'] = unique_invoice['time_diff'].apply(lambda x: 1 if x <= pd.Timedelta(days=30) else 0)
unique_invoice = unique_invoice.drop(['time_diff'], axis=1)

print(unique_invoice.info())
unique_invoice.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96211 entries, 0 to 96210
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype    
---  ------                 --------------  -----    
 0   order_id               96211 non-null  object   
 1   customer_unique_id     96211 non-null  object   
 2   order_date             96211 non-null  period[D]
 3   price_sum              96211 non-null  float64  
 4   price_mean             96211 non-null  float64  
 5   freight_value_sum      96211 non-null  float64  
 6   freight_value_mean     96211 non-null  float64  
 7   customer_state         96211 non-null  object   
 8   does_purchase_2_times  96211 non-null  int64    
dtypes: float64(4), int64(1), object(3), period[D](1)
memory usage: 6.6+ MB
None


Unnamed: 0,order_id,customer_unique_id,order_date,price_sum,price_mean,freight_value_sum,freight_value_mean,customer_state,does_purchase_2_times
0,00010242fe8c5a6d1ba2dd792cb16214,871766c5855e863f6eccc05f988b23cb,2017-09-13,58.9,58.9,13.29,13.29,RJ,0
1,00018f77f2f0320c557190d7a144bdd3,eb28e67c4c0b83846050ddfb8a35d051,2017-04-26,239.9,239.9,19.93,19.93,SP,0
2,000229ec398224ef6ca0657da4fc703e,3818d81c6709e39d06b2738a8d3a2474,2018-01-14,199.0,199.0,17.87,17.87,MG,0
3,00024acbcdf0a6daa1e931b038114c75,af861d436cfc08b2c2ddefd0ba074622,2018-08-08,12.99,12.99,12.79,12.79,SP,0
4,00042b26cf59d7ce69dfabb4e55b4fd9,64b576fb70d441e8f1b2d7d446e483c5,2017-02-04,199.9,199.9,18.14,18.14,SP,0


In [18]:
# Order reviews dataset
df_reviews = pd.read_csv('data/olist_order_reviews_dataset.csv')
print(df_reviews.info())

df_reviews.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   review_id                99224 non-null  object
 1   order_id                 99224 non-null  object
 2   review_score             99224 non-null  int64 
 3   review_comment_title     11568 non-null  object
 4   review_comment_message   40977 non-null  object
 5   review_creation_date     99224 non-null  object
 6   review_answer_timestamp  99224 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB
None


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [19]:
df_reviews['review_creation_date'] = pd.to_datetime(df_reviews['review_creation_date'])
df_reviews['review_answer_timestamp'] = pd.to_datetime(df_reviews['review_answer_timestamp'])
df_reviews['diff_response'] = df_reviews['review_answer_timestamp'] - df_reviews['review_creation_date']
df_reviews['diff_response_hr'] = round(df_reviews['diff_response'].dt.total_seconds() / 3600, 0)

df_reviews.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,diff_response,diff_response_hr
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59,0 days 21:46:59,22.0
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13,1 days 03:05:13,27.0
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17,2018-02-18 14:36:24,1 days 14:36:24,39.0
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21,2017-04-21 22:02:06,0 days 22:02:06,22.0
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01,2018-03-02 10:26:53,1 days 10:26:53,34.0


In [20]:
mapper = {
    'review_score': 'mean',
    'diff_response_hr': 'mean'
}

reviews_groupby = df_reviews.groupby('order_id').agg(mapper).reset_index()

print(reviews_groupby.info())
reviews_groupby.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98673 entries, 0 to 98672
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   order_id          98673 non-null  object 
 1   review_score      98673 non-null  float64
 2   diff_response_hr  98673 non-null  float64
dtypes: float64(2), object(1)
memory usage: 2.3+ MB
None


Unnamed: 0,order_id,review_score,diff_response_hr
0,00010242fe8c5a6d1ba2dd792cb16214,5.0,35.0
1,00018f77f2f0320c557190d7a144bdd3,4.0,60.0
2,000229ec398224ef6ca0657da4fc703e,5.0,16.0
3,00024acbcdf0a6daa1e931b038114c75,4.0,17.0
4,00042b26cf59d7ce69dfabb4e55b4fd9,5.0,35.0


In [21]:
# Order payments dataset
df_payments = pd.read_csv("data/olist_order_payments_dataset.csv")

print(df_payments.info())
df_payments.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB
None


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


In [22]:
def mode(x):
    return x.mode()[0]
mapper = {
    'payment_type': mode,
    'payment_value': ['mean', 'sum'],
    'payment_sequential': 'mean',
    'payment_installments': ['mean', 'sum']
}

payment_groupby = df_payments.groupby('order_id').agg(mapper).reset_index()
payment_groupby.columns = pd.Index([f'{e[0]}_' + e[1] if (e[1] != '') and (e[1] != 'first') else e[0] for e in payment_groupby.columns.tolist()])
print(payment_groupby.info())
payment_groupby.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99440 entries, 0 to 99439
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   order_id                   99440 non-null  object 
 1   payment_type_mode          99440 non-null  object 
 2   payment_value_mean         99440 non-null  float64
 3   payment_value_sum          99440 non-null  float64
 4   payment_sequential_mean    99440 non-null  float64
 5   payment_installments_mean  99440 non-null  float64
 6   payment_installments_sum   99440 non-null  int64  
dtypes: float64(4), int64(1), object(2)
memory usage: 5.3+ MB
None


Unnamed: 0,order_id,payment_type_mode,payment_value_mean,payment_value_sum,payment_sequential_mean,payment_installments_mean,payment_installments_sum
0,00010242fe8c5a6d1ba2dd792cb16214,credit_card,72.19,72.19,1.0,2.0,2
1,00018f77f2f0320c557190d7a144bdd3,credit_card,259.83,259.83,1.0,3.0,3
2,000229ec398224ef6ca0657da4fc703e,credit_card,216.87,216.87,1.0,5.0,5
3,00024acbcdf0a6daa1e931b038114c75,credit_card,25.78,25.78,1.0,2.0,2
4,00042b26cf59d7ce69dfabb4e55b4fd9,credit_card,218.04,218.04,1.0,3.0,3


In [23]:
dataset = pd.merge(unique_invoice, reviews_groupby, on='order_id', how='left')
dataset = pd.merge(dataset, payment_groupby, on='order_id', how='left')

print(dataset.info())
dataset.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96211 entries, 0 to 96210
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype    
---  ------                     --------------  -----    
 0   order_id                   96211 non-null  object   
 1   customer_unique_id         96211 non-null  object   
 2   order_date                 96211 non-null  period[D]
 3   price_sum                  96211 non-null  float64  
 4   price_mean                 96211 non-null  float64  
 5   freight_value_sum          96211 non-null  float64  
 6   freight_value_mean         96211 non-null  float64  
 7   customer_state             96211 non-null  object   
 8   does_purchase_2_times      96211 non-null  int64    
 9   review_score               95568 non-null  float64  
 10  diff_response_hr           95568 non-null  float64  
 11  payment_type_mode          96211 non-null  object   
 12  payment_value_mean         96211 non-null  float64  
 13  payment_value_su

Unnamed: 0,order_id,customer_unique_id,order_date,price_sum,price_mean,freight_value_sum,freight_value_mean,customer_state,does_purchase_2_times,review_score,diff_response_hr,payment_type_mode,payment_value_mean,payment_value_sum,payment_sequential_mean,payment_installments_mean,payment_installments_sum
0,00010242fe8c5a6d1ba2dd792cb16214,871766c5855e863f6eccc05f988b23cb,2017-09-13,58.9,58.9,13.29,13.29,RJ,0,5.0,35.0,credit_card,72.19,72.19,1.0,2.0,2
1,00018f77f2f0320c557190d7a144bdd3,eb28e67c4c0b83846050ddfb8a35d051,2017-04-26,239.9,239.9,19.93,19.93,SP,0,4.0,60.0,credit_card,259.83,259.83,1.0,3.0,3
2,000229ec398224ef6ca0657da4fc703e,3818d81c6709e39d06b2738a8d3a2474,2018-01-14,199.0,199.0,17.87,17.87,MG,0,5.0,16.0,credit_card,216.87,216.87,1.0,5.0,5
3,00024acbcdf0a6daa1e931b038114c75,af861d436cfc08b2c2ddefd0ba074622,2018-08-08,12.99,12.99,12.79,12.79,SP,0,4.0,17.0,credit_card,25.78,25.78,1.0,2.0,2
4,00042b26cf59d7ce69dfabb4e55b4fd9,64b576fb70d441e8f1b2d7d446e483c5,2017-02-04,199.9,199.9,18.14,18.14,SP,0,5.0,35.0,credit_card,218.04,218.04,1.0,3.0,3


In [24]:
dataset = dataset.fillna(0)

print(dataset.info())
dataset.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96211 entries, 0 to 96210
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype    
---  ------                     --------------  -----    
 0   order_id                   96211 non-null  object   
 1   customer_unique_id         96211 non-null  object   
 2   order_date                 96211 non-null  period[D]
 3   price_sum                  96211 non-null  float64  
 4   price_mean                 96211 non-null  float64  
 5   freight_value_sum          96211 non-null  float64  
 6   freight_value_mean         96211 non-null  float64  
 7   customer_state             96211 non-null  object   
 8   does_purchase_2_times      96211 non-null  int64    
 9   review_score               96211 non-null  float64  
 10  diff_response_hr           96211 non-null  float64  
 11  payment_type_mode          96211 non-null  object   
 12  payment_value_mean         96211 non-null  float64  
 13  payment_value_su

Unnamed: 0,order_id,customer_unique_id,order_date,price_sum,price_mean,freight_value_sum,freight_value_mean,customer_state,does_purchase_2_times,review_score,diff_response_hr,payment_type_mode,payment_value_mean,payment_value_sum,payment_sequential_mean,payment_installments_mean,payment_installments_sum
0,00010242fe8c5a6d1ba2dd792cb16214,871766c5855e863f6eccc05f988b23cb,2017-09-13,58.9,58.9,13.29,13.29,RJ,0,5.0,35.0,credit_card,72.19,72.19,1.0,2.0,2
1,00018f77f2f0320c557190d7a144bdd3,eb28e67c4c0b83846050ddfb8a35d051,2017-04-26,239.9,239.9,19.93,19.93,SP,0,4.0,60.0,credit_card,259.83,259.83,1.0,3.0,3
2,000229ec398224ef6ca0657da4fc703e,3818d81c6709e39d06b2738a8d3a2474,2018-01-14,199.0,199.0,17.87,17.87,MG,0,5.0,16.0,credit_card,216.87,216.87,1.0,5.0,5
3,00024acbcdf0a6daa1e931b038114c75,af861d436cfc08b2c2ddefd0ba074622,2018-08-08,12.99,12.99,12.79,12.79,SP,0,4.0,17.0,credit_card,25.78,25.78,1.0,2.0,2
4,00042b26cf59d7ce69dfabb4e55b4fd9,64b576fb70d441e8f1b2d7d446e483c5,2017-02-04,199.9,199.9,18.14,18.14,SP,0,5.0,35.0,credit_card,218.04,218.04,1.0,3.0,3


In [25]:
# Time splitting 
n_days = 90
max_date = dataset['order_date'].max()
cutoff = max_date - pd.to_timedelta(n_days, unit='d')

temp_in = dataset[dataset['order_date'] <= cutoff]
temp_out = dataset[dataset['order_date'] > cutoff]

In [34]:
print(temp_out.info())
temp_out.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18609 entries, 3 to 96210
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype    
---  ------                     --------------  -----    
 0   order_id                   18609 non-null  object   
 1   customer_unique_id         18609 non-null  object   
 2   order_date                 18609 non-null  period[D]
 3   price_sum                  18609 non-null  float64  
 4   price_mean                 18609 non-null  float64  
 5   freight_value_sum          18609 non-null  float64  
 6   freight_value_mean         18609 non-null  float64  
 7   customer_state             18609 non-null  object   
 8   does_purchase_2_times      18609 non-null  int64    
 9   review_score               18609 non-null  float64  
 10  diff_response_hr           18609 non-null  float64  
 11  payment_type_mode          18609 non-null  object   
 12  payment_value_mean         18609 non-null  float64  
 13  payment_value_su

Unnamed: 0,order_id,customer_unique_id,order_date,price_sum,price_mean,freight_value_sum,freight_value_mean,customer_state,does_purchase_2_times,review_score,diff_response_hr,payment_type_mode,payment_value_mean,payment_value_sum,payment_sequential_mean,payment_installments_mean,payment_installments_sum
3,00024acbcdf0a6daa1e931b038114c75,af861d436cfc08b2c2ddefd0ba074622,2018-08-08,12.99,12.99,12.79,12.79,SP,0,4.0,17.0,credit_card,25.78,25.78,1.0,2.0,2
7,000576fe39319847cbb9d288c5617fa6,fda4476abb6307ab3c415b7e6d026526,2018-07-04,810.0,810.0,70.75,70.75,SP,0,5.0,44.0,credit_card,880.75,880.75,1.0,10.0,10
9,0005f50442cb953dcd1d21e1fb923495,0782c41380992a5a533489063df0eef6,2018-07-02,53.99,53.99,11.4,11.4,SP,0,4.0,23.0,credit_card,65.39,65.39,1.0,1.0,1
11,00063b381e2406b52ad429470734ebd5,3fb97204945ca0c01bcf3eee6031c5f1,2018-07-27,45.0,45.0,12.98,12.98,SP,0,5.0,24.0,credit_card,57.98,57.98,1.0,5.0,5
12,0006ec9db01a64e59a68b2c340bf65a7,7ed0ea20347f67fe61d1c99fdf8556ae,2018-07-24,74.0,74.0,23.32,23.32,RJ,0,5.0,47.0,credit_card,97.32,97.32,1.0,4.0,4


In [33]:
# Feature engineering

targets = temp_out\
            [['customer_unique_id', 'price_sum']]\
            .groupby('customer_unique_id')\
            .agg({'price_sum': sum})\
            .rename({'price_sum': 'spend_90_total'}, axis=1)\
            .assign(spend_90_flag = 1)

targets

Unnamed: 0_level_0,spend_90_total,spend_90_flag
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000e309254ab1fc5ba99dd469d36bdb4,59.90,1
000ec5bff359e1c0ad76a81a45cb598f,14.96,1
000fbf0473c10fc1ab6f8d2d286ce20c,285.80,1
0015752e079902b12cd00b9b7596276b,59.80,1
00172711b30d52eea8b313a7f2cced02,74.50,1
...,...,...
ffee94d548cef05b146d825a7648dab4,27.90,1
fff22793223fe80c97a8fd02ac5c6295,66.00,1
fff3e1d7bc75f11dc7670619b2e61840,54.85,1
fff5eb4918b2bf4b2da476788d42051c,1050.00,1


In [37]:
data = temp_out.merge(targets,
                  left_index=True,
                  right_index=True,
                  how='left')\
            .fillna(0)

data['spend_90_flag'].value_counts()

0.0    18609
Name: spend_90_flag, dtype: int64

In [None]:
# # Encoding category
# from sklearn.preprocessing import OneHotEncoder

# encoder_state = OneHotEncoder(drop='first', sparse_output=False)
# encoder_payment_type = OneHotEncoder(sparse_output=False)
# # transform data
# onehot_state = encoder_state.fit_transform(dataset['customer_state'].values.reshape(-1, 1))
# column_state = encoder_state.get_feature_names_out(['customer_state'])

# onehot_payment_type = encoder_payment_type.fit_transform(dataset['payment_type_mode'].values.reshape(-1, 1))
# column_payment_type = encoder_payment_type.get_feature_names_out(['payment_type_mode'])

# # print("Encoded data:\n", onehot)
# # print("Column names:", column_names)

# encoded_state = pd.DataFrame(onehot_state, columns=column_state)
# encoded_payment_type = pd.DataFrame(onehot_payment_type, columns=column_payment_type)

# dataset = pd.concat([dataset, encoded_state, encoded_payment_type], axis=1)

# print(dataset.info())
# dataset.head()