## **Import Libraries**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

## **Import Input Files**

In [2]:
from google.colab import files
uploaded = files.upload()

## **Functions For Input file Processing**

In [3]:
def order_reviews():
    olist_order_reviews_dataset = pd.read_csv("olist_order_reviews_dataset.csv")
    olist_order_reviews_dataset['review_creation_date'] = pd.to_datetime(
        olist_order_reviews_dataset['review_creation_date'], errors='coerce')
    olist_order_reviews_dataset['review_answer_timestamp'] = pd.to_datetime(
        olist_order_reviews_dataset['review_answer_timestamp'], errors='coerce')
    olist_order_reviews_dataset['review_creation_date_derived'] = olist_order_reviews_dataset['review_creation_date'].dt.date
    olist_order_reviews_dataset['review_answer_timestamp_derived'] = olist_order_reviews_dataset['review_answer_timestamp'].dt.date

    idx = olist_order_reviews_dataset.groupby('order_id')['review_creation_date'].idxmax()
    df = olist_order_reviews_dataset.loc[idx]
    df.drop_duplicates(inplace=True)
    return df

def customers():
    data = pd.read_csv("olist_customers_dataset.csv")
    data['customer_zip_code_prefix'] = data['customer_zip_code_prefix'].astype(str)
    data['customer_zip_code_prefix'] = data['customer_zip_code_prefix'].apply(lambda x: x.zfill(5) if len(x) == 4 else x)
    data.drop_duplicates(inplace=True)
    return data

def orders():
    data = pd.read_csv("olist_orders_dataset.csv")
    return data

def order_items():
    order_items_df = pd.read_csv('olist_order_items_dataset.csv')
    order_items_df.drop_duplicates(inplace=True)
    return order_items_df

def product_category():
    df = pd.read_csv("olist_products_dataset.csv")
    df_translation = pd.read_csv("product_category_name_translation.csv")
    df_updated = df.copy()
    df_updated['product_category_name'] = df_updated['product_category_name'].fillna('unknown product category')
    translation_dict = dict(zip(df_translation['product_category_name'], df_translation['product_category_name_english']))
    df_updated['product_category_name_english'] = df_updated['product_category_name'].map(translation_dict)
    df_updated['product_category_name_english'] = df_updated['product_category_name_english'].fillna('unknown product category')
    return df_updated

def payment():
    df = pd.read_csv("olist_order_payments_dataset.csv")
    return df

def seller():
    df = pd.read_csv("olist_sellers_dataset.csv")
    return df


# Read and Assign Variables to the Input Files

In [4]:
order_items_df = order_items()
customers_df = customers()
reviews_df = order_reviews()
product_df = product_category()
orders_df = orders()
payment_df = payment()
seller_df = seller()

  olist_order_reviews_dataset['review_creation_date'] = pd.to_datetime(
  olist_order_reviews_dataset['review_answer_timestamp'] = pd.to_datetime(


## **Merging Datasets**

In [5]:
df1 = pd.merge(customers_df, orders_df, on='customer_id', how='left')
df2 = pd.merge(df1, order_items_df, on='order_id', how='left')
df3 = pd.merge(df2, product_df, on='product_id', how='left')
df4 = pd.merge(df3, reviews_df, on='order_id', how='left')
df5 = pd.merge(df4, payment_df, on='order_id', how='left')
df6 = pd.merge(df5, seller_df, on='seller_id', how='left')
df6.head(25)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,...,review_answer_timestamp,review_creation_date_derived,review_answer_timestamp_derived,payment_sequential,payment_type,payment_installments,payment_value,seller_zip_code_prefix,seller_city,seller_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,00e7ee1b050b8499577073aeb2a297a1,delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-23 10:47:57,...,2017-05-30 22:34:00,2017-05-26,2017-05-30,1.0,credit_card,2.0,146.87,8577.0,itaquaquecetuba,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,29150127e6685892b6eab3eec79f59c7,delivered,2018-01-12 20:48:24,2018-01-12 20:58:32,2018-01-15 17:14:59,...,NaT,,,1.0,credit_card,8.0,335.48,88303.0,itajai,SC
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,b2059ed67ce144a36e2aa97d2c9e9ad2,delivered,2018-05-19 16:07:45,2018-05-20 16:19:10,2018-06-11 14:31:00,...,2018-06-15 12:10:00,2018-06-15,2018-06-15,1.0,credit_card,7.0,157.73,8577.0,itaquaquecetuba,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,951670f92359f4fe4a63112aa7306eba,delivered,2018-03-13 16:06:38,2018-03-13 17:29:19,2018-03-27 23:22:42,...,2018-04-02 18:36:00,2018-03-29,2018-04-02,1.0,credit_card,1.0,173.3,8577.0,itaquaquecetuba,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,6b7d50bd145f6fc7f33cebabd7e49d0f,delivered,2018-07-29 09:51:30,2018-07-29 10:10:09,2018-07-30 15:16:00,...,2018-08-17 01:59:00,2018-08-10,2018-08-17,1.0,credit_card,8.0,252.25,14940.0,ibitinga,SP
5,879864dab9bc3047522c92c82e1212b8,4c93744516667ad3b8f1fb645a3116a4,89254,jaragua do sul,SC,5741ea1f91b5fbab2bd2dc653a5b5099,delivered,2017-09-14 18:14:31,2017-09-14 18:25:11,2017-09-18 21:27:40,...,2017-10-01 21:52:00,2017-09-29,2017-10-01,1.0,debit_card,1.0,282.21,5141.0,pirituba,SP
6,fd826e7cf63160e536e0908c76c3f441,addec96d2e059c80c30fe6871d30d177,4534,sao paulo,SP,36e694cf4cbc2a4803200c35e84abdc4,delivered,2018-02-19 14:38:35,2018-02-19 14:50:37,2018-02-20 00:03:39,...,2018-02-26 13:53:00,2018-02-21,2018-02-26,1.0,credit_card,1.0,22.77,4102.0,sao paulo,SP
7,5e274e7a0c3809e14aba7ad5aae0d407,57b2a98a409812fe9618067b6b8ebe4f,35182,timoteo,MG,1093c8304c7a003280dd34598194913d,delivered,2017-11-16 19:29:02,2017-11-16 19:55:41,2017-11-22 16:46:33,...,2017-11-28 17:14:00,2017-11-28,2017-11-28,1.0,credit_card,3.0,36.01,4142.0,sao paulo,SP
8,5adf08e34b2e993982a47070956c5c65,1175e95fb47ddff9de6b2b06188f7e0d,81560,curitiba,PR,1ebeea841c590e86a14a0d7a48e7d062,delivered,2018-01-18 12:35:44,2018-01-18 12:56:32,2018-01-18 23:25:35,...,NaT,,,1.0,debit_card,1.0,39.1,21210.0,rio de janeiro,RJ
9,4b7139f34592b3a31687243a302fa75b,9afe194fb833f79e300e37e580171f22,30575,belo horizonte,MG,7433cbcc783205509d66a5260da5b574,delivered,2018-01-08 11:22:34,2018-01-08 11:35:27,2018-01-11 01:00:40,...,2018-01-16 19:29:00,2018-01-14,2018-01-16,1.0,credit_card,1.0,122.47,80330.0,curitiba,PR


## **Information on Final Dataframe**

In [6]:
df6.shape

(118434, 42)

In [7]:
df6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118434 entries, 0 to 118433
Data columns (total 42 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   customer_id                      118434 non-null  object        
 1   customer_unique_id               118434 non-null  object        
 2   customer_zip_code_prefix         118434 non-null  object        
 3   customer_city                    118434 non-null  object        
 4   customer_state                   118434 non-null  object        
 5   order_id                         118434 non-null  object        
 6   order_status                     118434 non-null  object        
 7   order_purchase_timestamp         118434 non-null  object        
 8   order_approved_at                118258 non-null  object        
 9   order_delivered_carrier_date     116360 non-null  object        
 10  order_delivered_customer_date    115037 non-

In [8]:
df6.isnull().sum()

Unnamed: 0,0
customer_id,0
customer_unique_id,0
customer_zip_code_prefix,0
customer_city,0
customer_state,0
order_id,0
order_status,0
order_purchase_timestamp,0
order_approved_at,176
order_delivered_carrier_date,2074


In [9]:
null_df = df6.isnull().any(axis=1)
df6[null_df]

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,...,review_answer_timestamp,review_creation_date_derived,review_answer_timestamp_derived,payment_sequential,payment_type,payment_installments,payment_value,seller_zip_code_prefix,seller_city,seller_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,00e7ee1b050b8499577073aeb2a297a1,delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-23 10:47:57,...,2017-05-30 22:34:00,2017-05-26,2017-05-30,1.0,credit_card,2.0,146.87,8577.0,itaquaquecetuba,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,09790,sao bernardo do campo,SP,29150127e6685892b6eab3eec79f59c7,delivered,2018-01-12 20:48:24,2018-01-12 20:58:32,2018-01-15 17:14:59,...,NaT,,,1.0,credit_card,8.0,335.48,88303.0,itajai,SC
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,01151,sao paulo,SP,b2059ed67ce144a36e2aa97d2c9e9ad2,delivered,2018-05-19 16:07:45,2018-05-20 16:19:10,2018-06-11 14:31:00,...,2018-06-15 12:10:00,2018-06-15,2018-06-15,1.0,credit_card,7.0,157.73,8577.0,itaquaquecetuba,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,08775,mogi das cruzes,SP,951670f92359f4fe4a63112aa7306eba,delivered,2018-03-13 16:06:38,2018-03-13 17:29:19,2018-03-27 23:22:42,...,2018-04-02 18:36:00,2018-03-29,2018-04-02,1.0,credit_card,1.0,173.30,8577.0,itaquaquecetuba,SP
5,879864dab9bc3047522c92c82e1212b8,4c93744516667ad3b8f1fb645a3116a4,89254,jaragua do sul,SC,5741ea1f91b5fbab2bd2dc653a5b5099,delivered,2017-09-14 18:14:31,2017-09-14 18:25:11,2017-09-18 21:27:40,...,2017-10-01 21:52:00,2017-09-29,2017-10-01,1.0,debit_card,1.0,282.21,5141.0,pirituba,SP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118428,7fe2e80252a9ea476f950ae8f85b0f8f,4b5820135d360a45552b5163835b1d89,35500,divinopolis,MG,c9aea907527f5d4f5fb44955c18a79fb,delivered,2017-12-17 23:13:41,2017-12-18 00:17:27,2017-12-20 18:52:08,...,2018-01-05 22:33:00,2018-01-05,2018-01-05,1.0,credit_card,1.0,75.88,4773.0,sao paulo,SP
118429,17ddf5dd5d51696bb3d7c6291687be6f,1a29b476fee25c95fbafc67c5ac95cf8,03937,sao paulo,SP,6760e20addcf0121e9d58f2f1ff14298,delivered,2018-04-07 15:48:17,2018-04-07 16:08:45,2018-04-11 02:08:36,...,2018-04-28 11:15:00,2018-04-14,2018-04-28,1.0,credit_card,6.0,88.78,17400.0,garca,SP
118430,e7b71a9017aa05c9a7fd292d714858e8,d52a67c98be1cf6a5c84435bd38d095d,06764,taboao da serra,SP,9ec0c8947d973db4f4e8dcf1fbfa8f1b,delivered,2018-04-04 08:20:22,2018-04-04 08:35:12,2018-04-05 18:42:35,...,2018-04-16 10:36:00,2018-04-12,2018-04-16,1.0,credit_card,3.0,129.06,14802.0,araraquara,SP
118432,56b18e2166679b8a959d72dd06da27f9,73c2643a0a458b49f58cea58833b192e,92120,canoas,RS,e31ec91cea1ecf97797787471f98a8c2,delivered,2017-11-03 21:08:33,2017-11-03 21:31:20,2017-11-06 18:24:41,...,2017-11-19 23:34:00,2017-11-17,2017-11-19,1.0,credit_card,2.0,711.07,14840.0,guariba,SP


In [10]:
not_null_df = ~df6.isna().any(axis=1)
df6[not_null_df]

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,...,review_answer_timestamp,review_creation_date_derived,review_answer_timestamp_derived,payment_sequential,payment_type,payment_installments,payment_value,seller_zip_code_prefix,seller_city,seller_state
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,6b7d50bd145f6fc7f33cebabd7e49d0f,delivered,2018-07-29 09:51:30,2018-07-29 10:10:09,2018-07-30 15:16:00,...,2018-08-17 01:59:00,2018-08-10,2018-08-17,1.0,credit_card,8.0,252.25,14940.0,ibitinga,SP
16,a7c125a0a07b75146167b7f04a7f8e98,5c2991dbd08bbf3cf410713c4de5a0b5,22750,rio de janeiro,RJ,72bab69c50432c6f94d8b50a5f84b69a,delivered,2018-08-13 23:45:05,2018-08-13 23:55:13,2018-08-15 12:11:00,...,2018-09-01 03:09:00,2018-08-18,2018-09-01,1.0,credit_card,2.0,136.93,4273.0,sao paulo,SP
29,167bd30a409e3e4127df5a9408ebd394,9c0096673baf55453a50073f12d1a37f,74310,goiania,GO,d10eafa208521dc30796e6a26099b6a6,delivered,2018-07-14 10:53:11,2018-07-14 11:05:18,2018-07-17 17:32:00,...,2018-07-26 20:09:00,2018-07-26,2018-07-26,1.0,credit_card,1.0,62.39,14940.0,ibitinga,SP
43,4c06b42fbf7b97ab10779cda5549cd1c,07d190f123147d9e89d4b922543d7948,65075,sao luis,MA,997fd4b37386d10b57d4ac8cdec011a6,delivered,2018-07-08 09:59:19,2018-07-08 10:30:15,2018-07-25 15:12:00,...,2018-08-14 11:40:00,2018-08-09,2018-08-14,1.0,credit_card,6.0,315.75,8577.0,itaquaquecetuba,SP
50,19cecb194f54e614b70d971306a9931b,d251c190ca75786e9ab937982d60d1d4,30320,belo horizonte,MG,14282bc70be9bdda515182fb1ce62af4,delivered,2018-04-18 14:18:09,2018-04-19 02:52:02,2018-04-20 00:47:44,...,2018-04-27 20:35:00,2018-04-27,2018-04-27,1.0,boleto,1.0,50.92,38408.0,uberlandia,MG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118312,16113da7eb5959b1085b146216d09bca,2268a3851575e0ab1a7578c25eab79c1,29650,santa teresa,ES,ca4ba0b98a93b132627b97a29c658637,delivered,2018-03-25 23:15:31,2018-03-25 23:30:23,2018-03-27 14:24:50,...,2018-05-04 04:59:00,2018-05-04,2018-05-04,1.0,credit_card,1.0,75.28,11704.0,praia grande,SP
118364,cb38d00c9e5573f32758d4d3dfa382ab,2dec450164b26f0f45ccbee550cd9091,05363,sao paulo,SP,5a509a6537686bc8e701a4461631e965,delivered,2018-05-21 07:28:32,2018-05-22 08:16:33,2018-05-22 13:43:00,...,2018-06-06 23:11:00,2018-06-05,2018-06-06,1.0,boleto,1.0,103.65,87114.0,sarandi,PR
118366,d2a592f7064d3ef0f34bd7a8d95e4458,54038fdafde7c49533a306f961281d9c,24350,niteroi,RJ,4f88a7f264d48920c4f0762d160212ca,delivered,2018-08-02 12:07:00,2018-08-02 13:05:18,2018-08-14 11:39:00,...,2018-08-17 08:45:00,2018-08-17,2018-08-17,1.0,credit_card,5.0,246.44,20930.0,rio de janeiro,RJ
118367,d2a592f7064d3ef0f34bd7a8d95e4458,54038fdafde7c49533a306f961281d9c,24350,niteroi,RJ,4f88a7f264d48920c4f0762d160212ca,delivered,2018-08-02 12:07:00,2018-08-02 13:05:18,2018-08-14 11:39:00,...,2018-08-17 08:45:00,2018-08-17,2018-08-17,1.0,credit_card,5.0,246.44,20930.0,rio de janeiro,RJ


In [11]:
duplicate_df = df6[df6.duplicated()]
duplicate_df.head(10)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,...,review_answer_timestamp,review_creation_date_derived,review_answer_timestamp_derived,payment_sequential,payment_type,payment_installments,payment_value,seller_zip_code_prefix,seller_city,seller_state


## **Feature Engineering**

In [12]:
df6['order_delivered_customer_date'] = pd.to_datetime(df6['order_delivered_customer_date'], errors='coerce')
df6['order_estimated_delivery_date'] = pd.to_datetime(df6['order_estimated_delivery_date'], errors='coerce')

df6['order_delivered_customer_date_derived'] = df6['order_delivered_customer_date'].dt.date
df6['order_estimated_delivery_date_derived'] = df6['order_estimated_delivery_date'].dt.date
df6['delivery_date_difference'] = (pd.to_datetime(df6['order_estimated_delivery_date_derived']) - pd.to_datetime(df6['order_delivered_customer_date_derived'])).dt.days

df6['order_delivery_date_difference'] = df6['delivery_date_difference'].apply(lambda x: '-1' if x < 0 else ('0' if x == 0 else '1'))

df6['total_purchase_count'] = df6.groupby('customer_unique_id')['order_id'].transform('count')
df6['review_score_binary'] = df6['review_score'].apply(lambda x: 1 if x >= 4 else 0)
df6['payment_installments_binary'] = df6['payment_installments'].apply(lambda x: 1 if x==0 or x==1 else 0)
df6['total_purchase_value'] = df6.groupby('order_id')['payment_value'].transform('sum')

# df6.drop_duplicates(inplace=True)
df6.head(20)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,...,seller_city,seller_state,order_delivered_customer_date_derived,order_estimated_delivery_date_derived,delivery_date_difference,order_delivery_date_difference,total_purchase_count,review_score_binary,payment_installments_binary,total_purchase_value
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,00e7ee1b050b8499577073aeb2a297a1,delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-23 10:47:57,...,itaquaquecetuba,SP,2017-05-25,2017-06-05,11.0,1,1,1,0,146.87
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,29150127e6685892b6eab3eec79f59c7,delivered,2018-01-12 20:48:24,2018-01-12 20:58:32,2018-01-15 17:14:59,...,itajai,SC,2018-01-29,2018-02-06,8.0,1,1,0,0,335.48
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,b2059ed67ce144a36e2aa97d2c9e9ad2,delivered,2018-05-19 16:07:45,2018-05-20 16:19:10,2018-06-11 14:31:00,...,itaquaquecetuba,SP,2018-06-14,2018-06-13,-1.0,-1,1,1,0,157.73
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,951670f92359f4fe4a63112aa7306eba,delivered,2018-03-13 16:06:38,2018-03-13 17:29:19,2018-03-27 23:22:42,...,itaquaquecetuba,SP,2018-03-28,2018-04-10,13.0,1,1,1,1,173.3
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,6b7d50bd145f6fc7f33cebabd7e49d0f,delivered,2018-07-29 09:51:30,2018-07-29 10:10:09,2018-07-30 15:16:00,...,ibitinga,SP,2018-08-09,2018-08-15,6.0,1,1,1,0,252.25
5,879864dab9bc3047522c92c82e1212b8,4c93744516667ad3b8f1fb645a3116a4,89254,jaragua do sul,SC,5741ea1f91b5fbab2bd2dc653a5b5099,delivered,2017-09-14 18:14:31,2017-09-14 18:25:11,2017-09-18 21:27:40,...,pirituba,SP,2017-09-28,2017-10-04,6.0,1,2,1,1,282.21
6,fd826e7cf63160e536e0908c76c3f441,addec96d2e059c80c30fe6871d30d177,4534,sao paulo,SP,36e694cf4cbc2a4803200c35e84abdc4,delivered,2018-02-19 14:38:35,2018-02-19 14:50:37,2018-02-20 00:03:39,...,sao paulo,SP,2018-02-20,2018-03-05,13.0,1,1,1,1,22.77
7,5e274e7a0c3809e14aba7ad5aae0d407,57b2a98a409812fe9618067b6b8ebe4f,35182,timoteo,MG,1093c8304c7a003280dd34598194913d,delivered,2017-11-16 19:29:02,2017-11-16 19:55:41,2017-11-22 16:46:33,...,sao paulo,SP,2017-11-27,2017-12-08,11.0,1,1,0,0,36.01
8,5adf08e34b2e993982a47070956c5c65,1175e95fb47ddff9de6b2b06188f7e0d,81560,curitiba,PR,1ebeea841c590e86a14a0d7a48e7d062,delivered,2018-01-18 12:35:44,2018-01-18 12:56:32,2018-01-18 23:25:35,...,rio de janeiro,RJ,2018-01-26,2018-02-20,25.0,1,2,0,1,39.1
9,4b7139f34592b3a31687243a302fa75b,9afe194fb833f79e300e37e580171f22,30575,belo horizonte,MG,7433cbcc783205509d66a5260da5b574,delivered,2018-01-08 11:22:34,2018-01-08 11:35:27,2018-01-11 01:00:40,...,curitiba,PR,2018-01-13,2018-02-05,23.0,1,1,1,1,122.47


## **Selecting Necessary Features for ML Model**

In [13]:
df7 = df6[['customer_state', 'order_status', 'price', 'payment_type',
           'payment_installments_binary', 'total_purchase_value', 'total_purchase_count',
           'review_score_binary', 'delivery_date_difference']]

In [14]:
df7.isnull().sum()

Unnamed: 0,0
customer_state,0
order_status,0
price,830
payment_type,3
payment_installments_binary,0
total_purchase_value,0
total_purchase_count,0
review_score_binary,0
delivery_date_difference,3397


In [15]:
print(df7.shape)
with_null_rows = len(df7)
with_null_rows

(118434, 9)


118434

## **Dropping Rows with NULL/NA Values**

In [16]:
df7.dropna(inplace=True)
print(df7.shape)
without_null_rows = len(df7)
print("\n")
without_null_rows

(115034, 9)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df7.dropna(inplace=True)


115034

In [17]:
print(f"TOTAL ROWS DROPPED: {with_null_rows-without_null_rows}")

TOTAL ROWS DROPPED: 3400


## **Encode categorical variables**

In [18]:
categorical_columns = ["customer_state", "order_status", "payment_type"]
for col in categorical_columns:
    le = LabelEncoder()
    df7[col] = le.fit_transform(df7[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df7[col] = le.fit_transform(df7[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df7[col] = le.fit_transform(df7[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df7[col] = le.fit_transform(df7[col])


## **Splitting Data for ML Model**

In [19]:
X = df7.drop(columns=["review_score_binary"])
y = df7["review_score_binary"]

## **Standardize numerical features**

In [20]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## **SMOTE for Oversampling Low Review Score Class i.e. Level 0**

In [21]:
smote = SMOTE(random_state=1234)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

## **Splitting the Data into Train and Test**

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=1234, stratify=y_resampled)

## **Initialising ML Model**

In [23]:
# model = LogisticRegression(random_state=1234)
# model.fit(X_train, y_train)
# y_pred_logistic = model.predict(X_test)

# gb_model = GradientBoostingClassifier(random_state=1234)
# gb_model.fit(X_train, y_train)
# y_pred_gb = gb_model.predict(X_test)

rf_model = RandomForestClassifier(random_state=1234)
rf_model.fit(X_train, y_train)

y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

## **Function to Print the Output Model Observation**

In [24]:
def evaluate_model(y_test, y_pred, model_name):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print("Confusion Matrix:")
    print("True negatives, False positives")
    print(conf_matrix)
    print("False negatives, True positives")
    print("\n")
    print("Classification Report:\n", classification_report(y_test, y_pred))

## **Evaluate models**

In [25]:
# evaluate_model(y_test, y_pred_logistic, "Logistic Regression (with SMOTE)")
evaluate_model(y_test, y_test_pred_rf, "Random Forest Classifier (with SMOTE)")
# evaluate_model(y_test, y_pred_gb, "Gradient Boosting Classifier")

Model: Random Forest Classifier (with SMOTE)
Accuracy: 0.77
Precision: 0.75
Recall: 0.80
F1 Score: 0.77
Confusion Matrix:
True negatives, False positives
[[17425  6330]
 [ 4820 18935]]
False negatives, True positives


Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.73      0.76     23755
           1       0.75      0.80      0.77     23755

    accuracy                           0.77     47510
   macro avg       0.77      0.77      0.77     47510
weighted avg       0.77      0.77      0.77     47510

