In [1]:
import pandas as pd
import numpy as np

file_path = 'online_retail_II.xlsx'
df = pd.read_excel(file_path)
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      525461 non-null  object        
 1   StockCode    525461 non-null  object        
 2   Description  522533 non-null  object        
 3   Quantity     525461 non-null  int64         
 4   InvoiceDate  525461 non-null  datetime64[ns]
 5   Price        525461 non-null  float64       
 6   Customer ID  417534 non-null  float64       
 7   Country      525461 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 32.1+ MB
None
  Invoice StockCode                          Description  Quantity  \
0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
1  489434    79323P                   PINK CHERRY LIGHTS        12   
2  489434    79323W                  WHITE CHERRY LIGHTS        12   
3  489434     22041    

In [2]:
# Data Cleaning
df = df.dropna(subset=['Customer ID'])
df = df[df['Quantity'] > 0]
df['TotalPrice'] = df['Quantity'] * df['Price']

df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['YearMonth'] = df['InvoiceDate'].apply(lambda x: x.strftime('%Y-%m'))
df['Month'] = df['InvoiceDate'].dt.month
df['Year'] = df['InvoiceDate'].dt.year

customer_df = df.groupby('Customer ID').agg({
    'Invoice': 'nunique',
    'Quantity': 'sum',
    'TotalPrice': 'sum',
    'InvoiceDate': ['min', 'max']
}).reset_index()

customer_df.columns = ['CustomerID', 'NumInvoices', 'TotalQuantity', 'TotalPrice', 'FirstPurchase', 'LastPurchase']

customer_df['Recency'] = (customer_df['LastPurchase'].max() - customer_df['LastPurchase']).dt.days
customer_df['Frequency'] = customer_df['NumInvoices']
customer_df['Monetary'] = customer_df['TotalPrice']

print(customer_df.head())


   CustomerID  NumInvoices  TotalQuantity  TotalPrice       FirstPurchase  \
0     12346.0           11             70      372.86 2009-12-14 08:34:00   
1     12347.0            2            828     1323.32 2010-10-31 14:20:00   
2     12348.0            1            373      222.16 2010-09-27 14:59:00   
3     12349.0            3            993     2671.14 2010-04-29 13:20:00   
4     12351.0            1            261      300.93 2010-11-29 15:23:00   

         LastPurchase  Recency  Frequency  Monetary  
0 2010-06-28 13:53:00      164         11    372.86  
1 2010-12-07 14:57:00        2          2   1323.32  
2 2010-09-27 14:59:00       73          1    222.16  
3 2010-10-28 08:23:00       42          3   2671.14  
4 2010-11-29 15:23:00       10          1    300.93  


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

customer_df['ReturningCustomer'] = (customer_df['Frequency'] > 1).astype(int)

X = customer_df[['Recency', 'Frequency', 'Monetary']]
y = customer_df['ReturningCustomer']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Returning Customer Model Evaluation")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

customer_df['RepeatPurchase'] = (customer_df['TotalQuantity'] > customer_df['TotalQuantity'].mean()).astype(int)
y_repeat = customer_df['RepeatPurchase']

X_train_repeat, X_test_repeat, y_train_repeat, y_test_repeat = train_test_split(X, y_repeat, test_size=0.2, random_state=42)

model_repeat = LogisticRegression()
model_repeat.fit(X_train_repeat, y_train_repeat)

y_pred_repeat = model_repeat.predict(X_test_repeat)

print("Repeat Purchase Model Evaluation")
print("Accuracy:", accuracy_score(y_test_repeat, y_pred_repeat))
print("Precision:", precision_score(y_test_repeat, y_pred_repeat))
print("Recall:", recall_score(y_test_repeat, y_pred_repeat))
print("F1 Score:", f1_score(y_test_repeat, y_pred_repeat))

print(classification_report(y_test_repeat, y_pred_repeat))


Returning Customer Model Evaluation
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       284
           1       1.00      1.00      1.00       579

    accuracy                           1.00       863
   macro avg       1.00      1.00      1.00       863
weighted avg       1.00      1.00      1.00       863

Repeat Purchase Model Evaluation
Accuracy: 0.9432213209733488
Precision: 0.9315068493150684
Recall: 0.7771428571428571
F1 Score: 0.8473520249221184
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       688
           1       0.93      0.78      0.85       175

    accuracy                           0.94       863
   macro avg       0.94      0.88      0.91       863
weighted avg       0.94      0.94      0.94       863



In [7]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

df['TotalPrice'] = (df['TotalPrice'] - df['TotalPrice'].mean()) / df['TotalPrice'].std()
df_filtered = df[['Customer ID', 'StockCode', 'TotalPrice']]
reader = Reader(rating_scale=(df_filtered['TotalPrice'].min(), df_filtered['TotalPrice'].max()))

data = Dataset.load_from_df(df_filtered, reader)
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
trainset = data.build_full_trainset()
algo.fit(trainset)

def get_recommendations(customer_id, top_n=10):
    items = df['StockCode'].unique()
    est_ratings = []
    for item in items:
        est_rating = algo.predict(customer_id, item).est
        est_ratings.append((item, est_rating))
    est_ratings.sort(key=lambda x: x[1], reverse=True)
    top_recommendations = est_ratings[:top_n]
    
    recommendations = []
    for stock_code, rating in top_recommendations:
        product_name = df[df['StockCode'] == stock_code]['Description'].values[0]
        recommendations.append((product_name, rating))
    
    return recommendations

# Example: Get recommendations for a customer
print(get_recommendations(12346))

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6186  1.0432  0.7704  0.6018  0.9207  0.7909  0.1711  
MAE (testset)     0.1585  0.1614  0.1585  0.1623  0.1619  0.1605  0.0017  
Fit time          1.51    1.53    1.78    1.55    1.61    1.60    0.10    
Test time         0.19    0.18    0.19    0.19    0.25    0.20    0.03    
[('VINTAGE BLUE KITCHEN CABINET', 2.7398447380650484), ('ROTATING SILVER ANGELS T-LIGHT HLDR', 2.679505091524357), ('VINTAGE RED KITCHEN CABINET', 2.511150036428856), ('Manual', 1.7469766581724302), ('PINK PAPER PARASOL ', 1.4184566574886437), ('SMALL FAIRY CAKE FRIDGE MAGNETS', 1.3521500606708203), ('GIANT SEVENTEEN DRAWER SIDEBOARD', 1.3005088667239615), ('AFGHAN SLIPPER SOCK PAIR', 1.0407059539202475), ('FRENCH STYLE WALL DRESSER', 0.9487807620265926), ('BLUE KASHMIRI COFFEE TABLE ', 0.8824401272920581)]


In [6]:
!pip install scikit-surprise

[0mCollecting scikit-surprise
  Using cached scikit_surprise-1.1.4-cp311-cp311-macosx_14_0_arm64.whl
[0mInstalling collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [15]:
 import joblib
from sklearn.linear_model import LogisticRegression

joblib.dump(model, 'customer_return_model.pkl')
joblib.dump(model_repeat, 'repeat_purchase_model.pkl')

['repeat_purchase_model.pkl']