In [29]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import pearsonr

In [40]:
df= pd.read_excel('sample1.xlsx')
df.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [41]:
#calculating mutual information
# Select the target variable and predictor variables
target_variable = "Quantity"
predictor_variables = ["StockCode", "Description", "UnitPrice", "CustomerID"]

In [42]:
# Subset the DataFrame to include only the relevant columns
data = df[[target_variable] + predictor_variables]

# Drop any rows with missing values
data = data.dropna()

# Convert the categorical variables to numeric labels
for variable in predictor_variables:
    data[variable] = data[variable].astype('category').cat.codes

# Separate the target variable from the predictor variables
X = data[predictor_variables]
y = data[target_variable]

# Calculate chi-square scores
chi_scores = np.array(chi2(X, y)[1])  # Use the p-values from chi2 test

# Print the chi-square scores for each predictor variable
for variable, score in zip(predictor_variables, chi_scores):
    print(f"Chi-Square p-value for {variable}: {score}")

Chi-Square p-value for StockCode: 5.507066916944055e-24
Chi-Square p-value for Description: 1.2237974663978213e-06
Chi-Square p-value for UnitPrice: 5.206291980965743e-15
Chi-Square p-value for CustomerID: 7.753594988280206e-06


 Used chi function instead of mutaul information to find the dependent variables, since they are categorical and not numerical.
  calculated the chi-square scores for each predictor variable,indicating the relevance of each variable with respect to the categorical target variable (e.g., "Quantity").
  Higher chi-square scores means a stronger association between the predictor and target variables. (According to our research question - they are needed to identify sales behavior)
  So description is not related to quanity(which we set as a target variable to identify sales behavior)

In [43]:
# Calculate the mutual information scores
mi_scores = np.array(mutual_info_regression(X, y))

# Calculate the correlation coefficients
correlation_scores = np.array([abs(pearsonr(X[variable], y)[0]) for variable in predictor_variables])

# Define the weights for each measure
chi_weight = 1.0
mi_weight = 0.5
correlation_weight = 0.2

# Calculate the relevance scores
relevance_scores = chi_weight * chi_scores + mi_weight * mi_scores + correlation_weight * correlation_scores

# Print the relevance scores for each predictor variable
for variable, score in zip(predictor_variables, relevance_scores):
    print(f"Relevance score for {variable}: {score}")

Relevance score for StockCode: 0.11889343187655863
Relevance score for Description: 0.0008127222122056888
Relevance score for UnitPrice: 0.23414804127703498
Relevance score for CustomerID: 0.17641567678683806


Now let's try to improve the relevance score
1) Introducing new coloumn sales data.

In [48]:
df_new = df.copy()
df_new['TotalSales'] = df_new.groupby('CustomerID')['Quantity'].transform('sum')
# Calculate average unit price per stock code
df_new['AvgUnitPrice'] = df_new.groupby('StockCode')['UnitPrice'].transform('mean')

# Extract month, day, and hour from InvoiceDate
df_new['InvoiceMonth'] = df_new['InvoiceDate'].dt.month
df_new['InvoiceDay'] = df_new['InvoiceDate'].dt.day
df_new['InvoiceHour'] = df_new['InvoiceDate'].dt.hour


# Calculate the number of unique products purchased per customer
df_new['UniqueProducts'] = df_new.groupby('CustomerID')['StockCode'].transform('nunique')

# Print the updated DataFrame with the new features
print(df_new.head(15))




    InvoiceNo StockCode                          Description  Quantity  \
0      536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1      536365     71053                  WHITE METAL LANTERN         6   
2      536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3      536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4      536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   
5      536365     22752         SET 7 BABUSHKA NESTING BOXES         2   
6      536365     21730    GLASS STAR FROSTED T-LIGHT HOLDER         6   
7      536366     22633               HAND WARMER UNION JACK         6   
8      536366     22632            HAND WARMER RED POLKA DOT         6   
9      536367     84879        ASSORTED COLOUR BIRD ORNAMENT        32   
10     536367     22745           POPPY'S PLAYHOUSE BEDROOM          6   
11     536367     22748            POPPY'S PLAYHOUSE KITCHEN         6   
12     536367     22749    FELTCRAFT P

In [49]:
#calculating mutual information
# Select the target variable and predictor variables
target_variable = "Quantity"
predictor_variables = ["StockCode", "Description", "UnitPrice", "CustomerID"]

In [50]:
# Subset the DataFrame to include only the relevant columns
data = df_new[[target_variable] + predictor_variables]

# Drop any rows with missing values
data = data.dropna()

# Convert the categorical variables to numeric labels
for variable in predictor_variables:
    data[variable] = data[variable].astype('category').cat.codes

# Separate the target variable from the predictor variables
X = data[predictor_variables]
y = data[target_variable]

# Calculate chi-square scores
chi_scores = np.array(chi2(X, y)[1])  # Use the p-values from chi2 test

# Print the chi-square scores for each predictor variable
for variable, score in zip(predictor_variables, chi_scores):
    print(f"Chi-Square p-value for {variable}: {score}")

Chi-Square p-value for StockCode: 5.507066916944055e-24
Chi-Square p-value for Description: 1.2237974663978213e-06
Chi-Square p-value for UnitPrice: 5.206291980965743e-15
Chi-Square p-value for CustomerID: 7.753594988280206e-06


In [51]:
# Calculate the mutual information scores
mi_scores = np.array(mutual_info_regression(X, y))

# Calculate the correlation coefficients
correlation_scores = np.array([abs(pearsonr(X[variable], y)[0]) for variable in predictor_variables])

# Define the weights for each measure
chi_weight = 1.0
mi_weight = 0.5
correlation_weight = 0.2

# Calculate the relevance scores
relevance_scores = chi_weight * chi_scores + mi_weight * mi_scores + correlation_weight * correlation_scores

# Print the relevance scores for each predictor variable
for variable, score in zip(predictor_variables, relevance_scores):
    print(f"Relevance score for {variable}: {score}")

Relevance score for StockCode: 0.09059645924135082
Relevance score for Description: 0.011526369668114326
Relevance score for UnitPrice: 0.24214996490604507
Relevance score for CustomerID: 0.1606637735799456
