In [1]:
import numpy as np
import pandas as pd

In [2]:
pip install ucimlrepo



In [3]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
online_retail = fetch_ucirepo(id=352)

# data (as pandas dataframes)
X = online_retail.data.features
y = online_retail.data.targets

# metadata
print(online_retail.metadata)

# variable information
print(online_retail.variables)


{'uci_id': 352, 'name': 'Online Retail', 'repository_url': 'https://archive.ics.uci.edu/dataset/352/online+retail', 'data_url': 'https://archive.ics.uci.edu/static/public/352/data.csv', 'abstract': 'This is a transactional data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail.', 'area': 'Business', 'tasks': ['Classification', 'Clustering'], 'characteristics': ['Multivariate', 'Sequential', 'Time-Series'], 'num_instances': 541909, 'num_features': 6, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': None, 'index_col': ['InvoiceNo', 'StockCode'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2015, 'last_updated': 'Mon Oct 21 2024', 'dataset_doi': '10.24432/C5BW33', 'creators': ['Daqing Chen'], 'intro_paper': {'ID': 361, 'type': 'NATIVE', 'title': 'Data mining for the online retail industry: A case study of RFM model-based customer segmenta

In [4]:
data = pd.DataFrame(online_retail.data.features)

In [5]:
data

Unnamed: 0,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...
541904,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.0,France
541905,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.10,12680.0,France
541906,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.0,France
541907,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France


In [6]:
data.isna().any()

Unnamed: 0,0
Description,True
Quantity,False
InvoiceDate,False
UnitPrice,False
CustomerID,True
Country,False


In [8]:
import pandas as pd

# Data Preprocessing

# Drop rows with missing descriptions
data_clean = data.dropna(subset=['Description'])

# Remove negative Quantity and UnitPrice values (assume returns and errors)
data_clean = data_clean[(data_clean['Quantity'] > 0) & (data_clean['UnitPrice'] > 0)]

# Convert InvoiceDate to datetime
data_clean['InvoiceDate'] = pd.to_datetime(data_clean['InvoiceDate'])

# Fill missing CustomerID with a placeholder (-1) since it's categorical
data_clean['CustomerID'].fillna(-1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_clean['CustomerID'].fillna(-1, inplace=True)


In [9]:
# Compute probability distributions
quantity_probs = data_clean['Quantity'].value_counts(normalize=True)
unitprice_probs = data_clean['UnitPrice'].value_counts(normalize=True)
country_probs = data_clean['Country'].value_counts(normalize=True)

# Compute Conditional Probability Tables (CPTs)

# CPT: Given a country, what is the probability of purchasing a certain quantity?
cpt_country_quantity = data_clean.groupby('Country')['Quantity'].value_counts(normalize=True).unstack(fill_value = 0)

# CPT: Given a customer, what is the probability of total purchase?
data_clean['TotalPurchase'] = data_clean['Quantity'] * data_clean['UnitPrice']
cpt_customer_purchase = data_clean.groupby('CustomerID')['TotalPurchase'].sum()

# CPT: Given a product description, what is the probability of its unit price?
cpt_description_price = data_clean.groupby('Description')['UnitPrice'].value_counts(normalize=True).unstack(fill_value = 0)

Unnamed: 0_level_0,TotalPurchase
CustomerID,Unnamed: 1_level_1
-1.0,1755276.64
12346.0,77183.60
12347.0,4310.00
12348.0,1797.24
12349.0,1757.55
...,...
18280.0,180.60
18281.0,80.82
18282.0,178.05
18283.0,2094.88


In [12]:
!pip install hmmlearn > /dev/null 2>&1

In [None]:
import numpy as np
import pandas as pd
from hmmlearn import hmm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Assume your data is in a DataFrame `data_clean`
# First, create the hidden states based on Invoice Date
data_clean['InvoiceDate'] = pd.to_datetime(data_clean['InvoiceDate'])

# Example: Create "quarter" hidden states based on the Invoice Date
data_clean['Quarter'] = data_clean['InvoiceDate'].dt.to_period('Q')

# Encode 'Quarter' as the hidden states (the time periods we want to track)
label_encoder_quarter = LabelEncoder()
data_clean['QuarterEncoded'] = label_encoder_quarter.fit_transform(data_clean['Quarter'])

# Example: Create Total Purchase as observation (you can use other features too)
data_clean['TotalPurchase'] = data_clean['Quantity'] * data_clean['UnitPrice']

# Now we define the observations, which are related to purchases
observations = data_clean[['TotalPurchase']].values  # You could also add other features

# Normalize the observations (important for Gaussian HMM)
scaler = StandardScaler()
observations_scaled = scaler.fit_transform(observations)

# Now, create the hidden states based on the 'QuarterEncoded' (or another time feature)
hidden_states = data_clean['QuarterEncoded'].values

# Step 1: Define the HMM Model with an appropriate number of hidden states
n_hidden_states = len(label_encoder_quarter.classes_) + 5  # Number of quarters
model = hmm.GaussianHMM(n_components=n_hidden_states, covariance_type="full", n_iter=1000)

# Step 2: Fit the model
model.fit(observations_scaled)

# Step 3: Predict hidden states based on observations
predicted_states = model.predict(observations_scaled)

# Step 4: Check model parameters (transition and emission probabilities)
print("Transition Matrix:")
print(model.transmat_)

print("Emission Matrix (means of each state):")
print(model.means_)

# Step 5: Evaluate the model
# We can check how the predicted states match the actual quarter-based hidden states
accuracy = np.sum(predicted_states == hidden_states) / len(hidden_states)
print(f"Prediction Accuracy: {accuracy:.4f}")

# Optionally: Visualize the results (for example, how hidden states vary over time)
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.plot(predicted_states, label="Predicted Hidden States (Quarter)", alpha=0.7)
plt.plot(hidden_states, label="Actual Hidden States (Quarter)", alpha=0.5)
plt.legend()
plt.title("Hidden States Over Time")
plt.show()

# Step 6: Analyze customer behavior over time
data_clean['PredictedQuarter'] = label_encoder_quarter.inverse_transform(predicted_states)

# Inspect the results
print(data_clean[['CustomerID', 'InvoiceDate', 'TotalPurchase', 'PredictedQuarter']].head())


In [18]:
log_likelihood = model.score(observations)
print("Log-Likelihood: ", log_likelihood)


Log-Likelihood:  -28552901338.779343


In [19]:
accuracy = np.sum(hidden_states == states) / len(states)
print("Prediction Accuracy: ", accuracy)

Prediction Accuracy:  0.0011205348384467954
