In [49]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler

In [50]:
# Load the raw session-level dataset from the working directory.
df = pd.read_csv("shop_smart_ecommerce.csv")

In [51]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [24]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [52]:
# Target: the boolean `Revenue` flag cast to 0/1 integers.
y = df['Revenue'].astype(int)
# Features: every column of `df` except the target.
x = df.drop(columns='Revenue')

In [53]:
# Partition the feature columns by dtype: int64/float64 columns go to the
# numeric group, object/bool columns go to the categorical group.
num_cols, cat_cols = (
    x.select_dtypes(include=dtype_group).columns
    for dtype_group in (['int64', 'float64'], ['object', 'bool'])
)

In [54]:
# Re-display the first rows of `df`; the cells since the previous preview
# only derive new objects (x, y, num_cols, cat_cols) and do not modify `df`.
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [55]:

# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [58]:
# Preprocessing step for the pipeline. NOTE: despite the `clf` name this is a
# transformer, not a classifier; the name is kept because the pipeline cell
# references it.
clf = ColumnTransformer(
    [
        # Standardise the int64/float64 feature columns.
        ('scale', StandardScaler(), num_cols),
        # One-hot encode the object/bool feature columns.
        # `drop='first'` is intentionally NOT used: combined with
        # handle_unknown='ignore', a category unseen during fit is encoded as
        # all zeros, which would be indistinguishable from the dropped
        # baseline level. Keeping every level avoids that ambiguity.
        ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols),
    ],
    # Any column not listed in num_cols/cat_cols is forwarded unchanged.
    remainder='passthrough',
)

In [82]:
# Depth-limited decision tree with a minimum leaf size; class weights are
# balanced and the seed is fixed so repeated fits give the same tree.
model = DecisionTreeClassifier(
    max_depth=4,
    min_samples_leaf=25,
    class_weight='balanced',
    random_state=42,
)

In [83]:
# Chain preprocessing ('trf') and the tree ('dt') into one estimator so the
# transformers are fitted only on whatever data `fit` receives.
pipe = Pipeline([('trf', clf), ('dt', model)])

In [84]:
# Fit the preprocessing and the tree on the training split only.
pipe.fit(x_train, y=y_train)

In [85]:

# Predict on both splits so train and test scores can be compared directly.
y_pred = pipe.predict(x_test)
y_pred_x_train = pipe.predict(x_train)

# Each metric is labelled with its split; previously the paired lines printed
# identical labels, so train and test could not be told apart in the output.
print('test accuracy', accuracy_score(y_test, y_pred))
print('train accuracy', accuracy_score(y_train, y_pred_x_train))
print('test f1_score', f1_score(y_test, y_pred))
print('train f1_score', f1_score(y_train, y_pred_x_train))

accuracy 0.8329278183292782
accuracy 0.8337388483373885
f1_score 0.6341030195381883
f1_score 0.620194534506716


HYPERPARAMETER TUNING

In [86]:
from sklearn.model_selection import GridSearchCV

In [87]:
# Candidate settings for the tree; the `dt__` prefix routes each parameter to
# the 'dt' step of the pipeline.
# NOTE(review): the recorded run selected the smallest value of both ranges,
# i.e. the edge of this grid; consider extending the ranges downward.
param_grid = {
    'dt__max_depth': [4, 5, 6, 8],
    'dt__min_samples_leaf': [25, 30, 35],
}

# 5-fold cross-validated search over the whole pipeline, scored by F1.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='f1', cv=5)
grid.fit(x_train, y_train)

print("Best F1:", grid.best_score_)
print("Best params:", grid.best_params_)

Best F1: 0.6249923454355131
Best params: {'dt__max_depth': 4, 'dt__min_samples_leaf': 25}
