In [30]:
import warnings

# Silence DeprecationWarning only. A blanket "ignore" would also hide
# FutureWarning / UserWarning / RuntimeWarning messages that can point at real problems.
warnings.filterwarnings("ignore", category=DeprecationWarning)

  and should_run_async(code)


In [31]:
import numpy as np
import pandas as pd
import matplotlib as mtp

In [32]:
from google.colab import drive

# Attach Google Drive at /content/drive so files under it can be read by path.
drive.mount(
    '/content/drive'
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
# Read the invoice export from the mounted Drive folder into a DataFrame.
df = pd.read_csv(
    '/content/drive/MyDrive/invoices contained.csv',
)

# Sanity check: (rows, columns), a divider line, then the first five rows.
print(df.shape, '-'*100, df.head(), sep='\n')

(15522, 8)
----------------------------------------------------------------------------------------------------
   customer_id  order_date  invoice_id  course_id  free    amount  \
0         6293  1402-05-11       49587         33     1  40000000   
1    232379492  1402-05-11       49596          4     1  24900000   
2    232379492  1402-05-11       49596          5     1   9900000   
3    232390775  1402-05-11       49597          5     1   9900000   
4    232369891  1402-05-11       49600         33     1  40000000   

   discount_percent  payable_amount  
0                56             0.0  
1                56             0.0  
2                56             0.0  
3                56             0.0  
4                56             0.0  


In [34]:
# Course IDs that are excluded from the rest of the analysis.
courses_to_remove = [
    1, 3, 10, 11, 12,
    29, 30, 31, 32, 200,
]

# Keep only the rows whose course_id is not on the exclusion list.
df = df.loc[~df['course_id'].isin(courses_to_remove)]

In [35]:
# For each customer, count how many distinct order dates appear in df.
unique_order_date_counts = (
    df
    .groupby('customer_id')['order_date']
    .nunique()
)

# Order customers from most to fewest distinct order dates and keep the first five.
top_5_customers = unique_order_date_counts.sort_values(ascending=False).iloc[:5]

# Show those five customer IDs together with their distinct-date counts.
print(top_5_customers)

customer_id
232370734    5
232333596    5
232375939    5
232370666    4
232370665    4
Name: order_date, dtype: int64


In [36]:
# Number of distinct customer IDs present in df.
unique_customer_count = df.loc[:, 'customer_id'].nunique()

# Report the total.
print("Total unique customer IDs:", unique_customer_count)

Total unique customer IDs: 6616


In [37]:
# Rows whose 'free' flag is 0, i.e. the courses that were NOT given away for free.
not_free_courses = df.loc[df['free'].eq(0)]

# Row count per course ID among those non-free rows (value_counts sorts most frequent first).
course_counts = not_free_courses['course_id'].value_counts()

# Keep the five most frequent course IDs and print them under a heading.
top_5_not_free_courses = course_counts.iloc[:5]
print("Top 5 Courses Not Freely Given to Customers:", top_5_not_free_courses, sep='\n')

Top 5 Courses Not Freely Given to Customers:
18    2202
35    2089
36    2064
21    1111
22    1110
Name: course_id, dtype: int64


In [38]:
# Number of distinct course IDs present in df.
unique_course_count = df.loc[:, 'course_id'].nunique()

# Report the total.
print("Total unique course IDs:", unique_course_count)

Total unique course IDs: 24


In [39]:
# Keep only the rows that are not free (free == 0).
df_paid = df.loc[df['free'].eq(0)]

# One row per customer, holding the array of distinct course IDs found in their paid rows.
transaction_list = (
    df_paid
    .groupby('customer_id')['course_id']
    .unique()
    .reset_index()
)

# Preview the first five customers, then report how many customers have a transaction.
print(transaction_list.head(5))
print('-'*100)
print(len(transaction_list))

   customer_id      course_id
0            5            [2]
1            6            [2]
2           30       [21, 22]
3           72         [9, 2]
4           87  [4, 6, 2, 17]
----------------------------------------------------------------------------------------------------
6611


In [40]:
# Reduce the transaction list to just the 'course_id' column (drops 'customer_id').
# `transaction_list` is a DataFrame the first time this cell runs and a Series afterwards,
# so only select the column while it is still a DataFrame: re-running the cell is then a
# no-op instead of a KeyError on the already-reduced Series.
if isinstance(transaction_list, pd.DataFrame):
    transaction_list = transaction_list['course_id']

# Print the first few rows of the transaction list (now only containing 'course_id')
print(transaction_list.head(5))

0              [2]
1              [2]
2         [21, 22]
3           [9, 2]
4    [4, 6, 2, 17]
Name: course_id, dtype: object


In [None]:
!pip install mlxtend


In [44]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [45]:
# Grid search over Apriori / association-rule parameters.
#
# For every (min_support, min_threshold, metric) combination the association rules are
# mined and sorted by the chosen metric. The combination that is kept is the one that
# yields the MOST rules under the 'lift' metric (see the selection rule below).
# Reads `transaction_list` (one collection of course IDs per customer) and leaves the
# winner in best_recommendation_package / best_metrics / best_min_support /
# best_min_threshold.

# Define a range of values to search for min_support, min_threshold, and metric
min_support_range = np.linspace(0.01, 0.1, 10)  # Adjust the range as needed
min_threshold_range = [0.8, 0.85, 0.9, 0.95]  # Adjust the range as needed
metric_list = ['lift', 'confidence']  # Experiment with other metrics if needed

best_recommendation_package = None
best_metrics = None
best_min_support = None
best_min_threshold = None

# One-hot encode the transactions ONCE: the encoding does not depend on any of the
# searched parameters, so rebuilding it in every iteration was pure overhead.
# `.notna()` gives a boolean frame (True = the course is in that customer's list),
# which is the input type mlxtend recommends for apriori.
oht = transaction_list.apply(lambda x: pd.Series(1, index=x)).notna()

for min_support in min_support_range:
    # Frequent itemsets depend only on min_support, so mine them once per support value
    # instead of once per (threshold, metric) pair.
    frequent_itemsets = apriori(oht, min_support=min_support, use_colnames=True)

    for min_threshold in min_threshold_range:
        for chosen_metric in metric_list:
            # Generate association rules with the chosen metric
            rules = association_rules(frequent_itemsets, metric=chosen_metric, min_threshold=min_threshold)

            # Sort the rules by the chosen metric in descending order
            sorted_rules = rules.sort_values(by=[chosen_metric], ascending=False)

            # First 4 distinct courses appearing in the antecedents of the sorted rules
            top_courses = sorted_rules['antecedents'].explode().unique()[:4]

            # Selection rule: the very first combination seeds the incumbent; afterwards
            # only a 'lift' run that produces strictly MORE rules replaces it.
            # NOTE(review): this does not optimise confidence, lift or support values
            # themselves - confirm that "most rules" is the intended criterion.
            if best_recommendation_package is None or (chosen_metric == 'lift' and sorted_rules.shape[0] > best_metrics.shape[0]):
                best_recommendation_package = top_courses
                best_metrics = sorted_rules
                best_min_support = min_support
                best_min_threshold = min_threshold

# Print the best recommendation package and its associated metrics
print("Best Recommendation Package (Top 4 Courses):")
print(best_recommendation_package)
print('-'*100)
print("\nBest Metrics (Rules):")
print(best_metrics)
print('-'*100)
print("\nBest min_support:", best_min_support)
print('-'*100)
print("Best min_threshold:", best_min_threshold)

Best Recommendation Package (Top 4 Courses):
[24 22 21 23]
----------------------------------------------------------------------------------------------------

Best Metrics (Rules):
   antecedents consequents  antecedent support  consequent support   support  \
69    (24, 22)    (21, 23)            0.010135            0.010588  0.010135   
70    (21, 23)    (24, 22)            0.010588            0.010135  0.010135   
68    (24, 21)    (22, 23)            0.010286            0.010740  0.010135   
71    (22, 23)    (24, 21)            0.010740            0.010286  0.010135   
58    (36, 21)    (18, 22)            0.031463            0.027227  0.019362   
..         ...         ...                 ...                 ...       ...   
38        (18)    (36, 22)            0.331266            0.031765  0.019362   
3         (36)        (18)            0.311299            0.331266  0.146574   
2         (18)        (36)            0.331266            0.311299  0.146574   
46        (18)   