In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [4]:
df = pd.read_csv('Ecommerce_and_Retail.csv')
df.head()

Unnamed: 0,Order_ID,Customer_ID,Product_Category,Purchase_Amount,Order_Rating,Repeat_Customer
0,ORD00001,CUST00103,Clothing,444.5,4.0,0.0
1,,CUST00436,Electronics,727.71,3.0,1.0
2,ORD00003,,Home Decor,,4.0,0.0
3,ORD00004,CUST00271,Home Decor,,5.0,1.0
4,,CUST00107,Electronics,,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Order_ID          900 non-null    object 
 1   Customer_ID       900 non-null    object 
 2   Product_Category  900 non-null    object 
 3   Purchase_Amount   900 non-null    float64
 4   Order_Rating      900 non-null    float64
 5   Repeat_Customer   900 non-null    float64
dtypes: float64(3), object(3)
memory usage: 47.0+ KB


In [6]:
df.isnull().sum()

Unnamed: 0,0
Order_ID,100
Customer_ID,100
Product_Category,100
Purchase_Amount,100
Order_Rating,100
Repeat_Customer,100


In [7]:
df.drop('Order_ID', axis = 1, inplace=True)

In [8]:
mean = df['Purchase_Amount'].mean()
median = df['Purchase_Amount'].median()

print(mean)
print(median)

df['Purchase_Amount'].fillna(df['Purchase_Amount'].mean(), inplace = True)

492.43154444444446
486.71000000000004


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Purchase_Amount'].fillna(df['Purchase_Amount'].mean(), inplace = True)


In [9]:
mean = df['Order_Rating'].mean()
median = df['Order_Rating'].median()

print(mean)
print(median)

df['Order_Rating'].fillna(df['Order_Rating'].mean(), inplace = True)

2.997777777777778
3.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Order_Rating'].fillna(df['Order_Rating'].mean(), inplace = True)


In [10]:
df['Product_Category'].value_counts()

Unnamed: 0_level_0,count
Product_Category,Unnamed: 1_level_1
Electronics,197
Home Decor,187
Clothing,185
Beauty,174
Books,157


In [11]:
# # filling missing values
# # Product_Category (Electronics)

# df['Product_Category'].fillna('Electronics', inplace=True)

In [12]:
# # filling missing values
# # Product_Category (Home Decor)

# df['Product_Category'].fillna('Home Decor', inplace=True)

In [13]:
# # filling missing values
# # Product_Category (Clothing)

# df['Product_Category'].fillna('Clothing', inplace=True)

In [14]:
# # filling missing values
# # Product_Category (Beauty)

# df['Product_Category'].fillna('Beauty', inplace=True)

In [15]:
# # filling missing values
# # Product_Category (Books)

df['Product_Category'].fillna('Books', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Product_Category'].fillna('Books', inplace=True)


In [16]:
# # filling missing values
# # Customer_ID (0.0)

df['Repeat_Customer'].fillna(df['Repeat_Customer'].mode(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Repeat_Customer'].fillna(df['Repeat_Customer'].mode(), inplace=True)


In [17]:
# # filling missing values
# # Repeat_Customer (0.0)

# df['Repeat_Customer'].fillna(0, inplace=True)

In [18]:
# # filling missing values
# # Repeat_Customer (1)

df['Repeat_Customer'].fillna(1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Repeat_Customer'].fillna(1, inplace=True)


In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Product_Category'] = le.fit_transform(df['Product_Category'])
df['Customer_ID'] = le.fit_transform(df['Customer_ID'])

In [20]:
df['Repeat_Customer'] = df['Repeat_Customer'].astype(int)
df

Unnamed: 0,Customer_ID,Product_Category,Purchase_Amount,Order_Rating,Repeat_Customer
0,79,2,444.500000,4.000000,0
1,354,3,727.710000,3.000000,1
2,407,4,492.431544,4.000000,0
3,224,4,492.431544,5.000000,1
4,82,3,492.431544,2.997778,1
...,...,...,...,...,...
995,182,2,678.820000,2.997778,0
996,195,4,492.431544,1.000000,1
997,143,3,456.360000,5.000000,0
998,220,1,930.160000,4.000000,0


In [21]:
corr = df.corrwith(df['Repeat_Customer'])
corr

Unnamed: 0,0
Customer_ID,-0.033773
Product_Category,0.02343
Purchase_Amount,-0.000795
Order_Rating,-0.021971
Repeat_Customer,1.0


In [22]:
df['Repeat_Customer'].value_counts()

Unnamed: 0_level_0,count
Repeat_Customer,Unnamed: 1_level_1
0,630
1,370


In [23]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X = df.drop('Repeat_Customer', axis=1)
y = df['Repeat_Customer']
X_resampled, y_resampled = smote.fit_resample(X, y)

In [24]:
print(X_resampled.shape)
print(y_resampled.shape)

(1260, 4)
(1260,)


In [25]:
X_tarin, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [26]:
model = LogisticRegression()
model.fit(X_tarin, y_train)

In [27]:
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1])

In [28]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.5158730158730159
[[64 66]
 [56 66]]
              precision    recall  f1-score   support

           0       0.53      0.49      0.51       130
           1       0.50      0.54      0.52       122

    accuracy                           0.52       252
   macro avg       0.52      0.52      0.52       252
weighted avg       0.52      0.52      0.52       252



In [29]:
tree_model = DecisionTreeClassifier()
tree_model.fit(X_tarin, y_train)

In [30]:
y_tpred = tree_model.predict(X_test)
y_tpred

array([0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0])

In [31]:
print(accuracy_score(y_test, y_tpred))
print(confusion_matrix(y_test, y_tpred))
print(classification_report(y_test, y_tpred))

0.6428571428571429
[[83 47]
 [43 79]]
              precision    recall  f1-score   support

           0       0.66      0.64      0.65       130
           1       0.63      0.65      0.64       122

    accuracy                           0.64       252
   macro avg       0.64      0.64      0.64       252
weighted avg       0.64      0.64      0.64       252

