In [1]:
# Import necessary libraries
import pandas as pd

from mlxtend.frequent_patterns import apriori, association_rules

# 1. Introduction

## 1.1 Business Problem

Armut, recognized as Turkey's foremost online service platform, serves as a nexus connecting service providers with
those in need. It streamlines access to services such as cleaning, renovation, and moving through a simple interface on
computers or smartphones. The intention is to employ Association Rule Learning to develop a product recommendation
system based on the dataset encompassing users who have availed services and the corresponding service categories.

## 1.2 Dataset Story

The dataset comprises information about the services customers have acquired and the respective service categories.
Additionally, it includes the date and time details for each service received.

## 1.3 Features

- `UserId` - Distinct customer identifier
- `ServiceId` - Anonymized services associated with each category. For instance, within the cleaning category, a service
  could be upholstery cleaning. `ServiceId` may appear across different categories, signifying diverse services under
  distinct categories or a service with `CategoryId` = 7 and `ServiceId` = 4 could be radiator cleaning, whereas a
  service with `CategoryId` = 2 and `ServiceId` = 4 might be furniture assembly
- `CategoryId` - Anonymized categories. For instance; cleaning, moving, renovation
- `CreateDate` - The date on which the service was purchased

# 2. Data Handling

## 2.1 Loading Data

In [2]:
# Load the dataset
logs = pd.read_csv('armut_data.csv', parse_dates=['CreateDate'])

In [3]:
# Display the first few rows
logs.head()

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate
0,25446,4,5,2017-08-06 16:11:00
1,22948,48,5,2017-08-06 16:12:00
2,10618,0,8,2017-08-06 16:13:00
3,7256,9,4,2017-08-06 16:14:00
4,25446,48,5,2017-08-06 16:16:00


## 2.2 Inspecting Data

In [4]:
# Display basic statistics about the dataset
logs.describe()

Unnamed: 0,UserId,ServiceId,CategoryId
count,162523.0,162523.0,162523.0
mean,13089.803862,21.64114,4.325917
std,7325.81606,13.774405,3.129292
min,0.0,0.0,0.0
25%,6953.0,13.0,1.0
50%,13139.0,18.0,4.0
75%,19396.0,32.0,6.0
max,25744.0,49.0,11.0


In [5]:
# Display information about the dataset
logs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162523 entries, 0 to 162522
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   UserId      162523 non-null  int64         
 1   ServiceId   162523 non-null  int64         
 2   CategoryId  162523 non-null  int64         
 3   CreateDate  162523 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(3)
memory usage: 5.0 MB


In [6]:
# Identify columns with null values
logs.isnull().any()

UserId        False
ServiceId     False
CategoryId    False
CreateDate    False
dtype: bool

In [7]:
# Get the number of unique values in each column
logs.nunique()

UserId         24826
ServiceId         50
CategoryId        12
CreateDate    117510
dtype: int64

## 2.3 Data Cleaning and Preprocessing

In [8]:
# Create Service column by combining ServiceId and CategoryId
logs['Service'] = [str(log[1]) + '_' + str(log[2]) for log in logs.values]

In [9]:
# Create CartId column by combining UserId and CreateDate
logs['CartId'] = [str(log[0]) + '_' + str(log[3].strftime('%Y-%m')) for log in logs.values]

In [10]:
# Display updated DataFrame
logs.head()

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service,CartId
0,25446,4,5,2017-08-06 16:11:00,4_5,25446_2017-08
1,22948,48,5,2017-08-06 16:12:00,48_5,22948_2017-08
2,10618,0,8,2017-08-06 16:13:00,0_8,10618_2017-08
3,7256,9,4,2017-08-06 16:14:00,9_4,7256_2017-08
4,25446,48,5,2017-08-06 16:16:00,48_5,25446_2017-08


In [11]:
# Create invoices pivot table
invoices = logs.groupby(['CartId', 'Service'])['Service'].count().unstack().applymap(
    lambda value: 1 if value > 0 else 0)

# Display pivot table
invoices.head()

Service,0_8,10_9,11_11,12_7,13_11,14_7,15_1,16_8,17_5,18_4,...,46_4,47_7,48_5,49_1,4_5,5_11,6_7,7_3,8_5,9_4
CartId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0_2017-08,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
0_2017-09,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
0_2018-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
0_2018-04,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10000_2017-08,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


# 3. Data Analysis

In [12]:
# Convert the invoices DataFrame to boolean type
invoices_bool = invoices.astype(bool)

# Find frequent item sets using Apriori algorithm with a minimum support of 0.01
frequent_itemsets = apriori(invoices_bool, min_support=0.01, use_colnames=True)

# Generate association rules based on support with a minimum threshold of 0.01
rules = association_rules(frequent_itemsets, metric='support', min_threshold=0.01)

# Display the first few rows of the generated association rules
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(13_11),(2_0),0.056627,0.130286,0.012819,0.226382,1.737574,0.005442,1.124216,0.449965
1,(2_0),(13_11),0.130286,0.056627,0.012819,0.098394,1.737574,0.005442,1.046325,0.488074
2,(2_0),(15_1),0.130286,0.120963,0.033951,0.260588,2.154278,0.018191,1.188833,0.616073
3,(15_1),(2_0),0.120963,0.130286,0.033951,0.280673,2.154278,0.018191,1.209066,0.609539
4,(33_4),(15_1),0.02731,0.120963,0.011233,0.411311,3.400299,0.007929,1.493211,0.725728


In [13]:
def generate_recommendations(rules, target, num_recommendations=1):
    """
    Generate recommendations based on association rules.

    Parameters:
    - rules (DataFrame): DataFrame containing association rules.
    - target (str): The service for which recommendations are sought.
    - num_recommendations (int): Number of recommendations to return.

    Returns:
    - list: List of recommended services.
    """
    # Sort rules by lift in descending order to prioritize more relevant recommendations
    sorted_rules = rules.sort_values('lift', ascending=False)

    # Initialize an empty list to store recommended services
    recommendation_list = []

    # Iterate over the antecedents (X) in the sorted rules
    for _, antecedents in sorted_rules['antecedents'].items():
        for service in antecedents:
            if service == target:
                recommendation_list.append(list(sorted_rules.iloc[_]['consequents']))

    # Flatten the recommendation list and remove duplicates using a set
    recommendation_list = list({item for item_list in recommendation_list for item in item_list})

    return recommendation_list[:num_recommendations]


# Generate recommendations for the given service
for service in logs['Service'].sample(5):
    recommendations = generate_recommendations(rules, service, 5)
    print(
        f"- For the service ['{service}'], {'recommendations are ' + str(recommendations) if recommendations != [] else 'there are no recommendations'}")


- For the service ['49_1'], recommendations are ['25_0']
- For the service ['2_0'], recommendations are ['22_0', '9_4', '2_0', '15_1', '38_4']
- For the service ['49_1'], recommendations are ['25_0']
- For the service ['29_0'], there are no recommendations
- For the service ['12_7'], there are no recommendations
