# Assignment 3b code

## 1. Read datasets

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Load the train and test transaction tables from the shared data folder.
train_csv = '../data/Groceries data train.csv'
test_csv = '../data/Groceries data test.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [None]:
df_train.head()

In [None]:
df_test.head()

In [None]:
print(df_train.shape)
print(df_test.shape)

In [None]:
df_train.info()

In [None]:
df_test.info()

In [None]:
# Print how many distinct members each split contains
# (recorded outputs: train 3872, test 3566).
for split in (df_train, df_test):
    print(split['Member_number'].nunique())

In [None]:
# Measure how many members appear in both the train and the test split.
train_members = set(df_train['Member_number'].unique())
test_members = set(df_test['Member_number'].unique())
intersection = train_members & test_members

common_count = len(intersection)
print("number of common members: ", common_count)  # 3540
# Share of train members that show up again in the test split.
print("Repetition rate: ", common_count / len(train_members))  # 0.9142561983471075

In [None]:
# print first 5 members in the intersection
print(list(intersection)[:5])

In [None]:
# Collapse every member's rows into one de-duplicated list of purchased items
# (one row per member, columns: Member_number, itemDescription).
train_data = (
    df_train.groupby('Member_number')['itemDescription']
    .apply(lambda bought: list(set(bought)))
    .reset_index()
)
test_data = (
    df_test.groupby('Member_number')['itemDescription']
    .apply(lambda bought: list(set(bought)))
    .reset_index()
)

In [None]:
# show first 5 members in the train data
train_data.head()

In [None]:
# show first 5 members in the test data
test_data.head()

## 2. Exploratory analysis

In [None]:
# count the number of items purchased by all members in the df_train
# show the result in a graph in descending order
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the 15 most frequent item descriptions in the train split.
top_train_items = df_train['itemDescription'].value_counts().index[:15]
plt.figure(figsize=(20, 6))
plt.title("Top15 items in Train dataset")
ax = sns.countplot(x='itemDescription', data=df_train, order=top_train_items)

# Write each bar's count slightly above the bar, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f"{bar_height:.0f}",
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

In [None]:
# Bar chart of the 15 most frequent item descriptions in the test split,
# ordered from most to least frequent.
top_test_items = df_test['itemDescription'].value_counts().index[:15]
plt.figure(figsize=(20, 6))
plt.title("Top15 items in Test dataset")
ax = sns.countplot(x='itemDescription', data=df_test, order=top_test_items)

# Write each bar's count slightly above the bar, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f"{bar_height:.0f}",
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

In [None]:
# Count train rows per value of the day_of_week column.
transactions_by_day = df_train.groupby('day_of_week').size()

# Draw the counts as a bar chart.
# NOTE(review): the tick labels presume exactly seven groups sorted Mon..Sun —
# confirm the day_of_week encoding and that no day is missing.
day_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
plt.figure(figsize=(10, 6))
transactions_by_day.plot(kind='bar')
plt.title('Transactions(train) Grouped by Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Number of Transactions')
plt.xticks(range(len(day_labels)), day_labels)
plt.show()

In [None]:
# Count test rows per value of the day_of_week column.
transactions_by_day = df_test.groupby('day_of_week').size()

# Draw the counts as a bar chart.
# NOTE(review): the tick labels presume exactly seven groups sorted Mon..Sun —
# confirm the day_of_week encoding and that no day is missing.
day_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
plt.figure(figsize=(10, 6))
transactions_by_day.plot(kind='bar')
plt.title('Transactions(test) Grouped by Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Number of Transactions')
plt.xticks(range(len(day_labels)), day_labels)
plt.show()

In [None]:
# Histogram of how many entries each member's item list holds (train data).
# x-axis: length of the item list, y-axis: number of members with that length.
train_basket_sizes = train_data['itemDescription'].apply(len)

plt.figure(figsize=(15, 6))
plt.hist(train_basket_sizes, bins=50)
# One tick per integer size, from 0 up to (but excluding) the largest size.
plt.xticks(range(max(train_basket_sizes)))
plt.title('Number of Items Purchased by Members in Train Data')
plt.xlabel('Number of Items Purchased')
plt.ylabel('Count of Members')

In [None]:
# Histogram of how many entries each member's item list holds (test data).
# x-axis: length of the item list, y-axis: number of members with that length.
test_basket_sizes = test_data['itemDescription'].apply(len)

plt.figure(figsize=(15, 6))
plt.hist(test_basket_sizes, bins=50)
# One tick per integer size, from 0 up to (but excluding) the largest size.
plt.xticks(range(max(test_basket_sizes)))
plt.title('Number of Items Purchased by Members in Test Data')
plt.xlabel('Number of Items Purchased')
plt.ylabel('Count of Members')

## 3. Frequent pattern mining

In [None]:
# Turn the per-member table into plain sorted item lists and keep only the
# baskets that contain more than one item.
# Note: this rebinds train_data from a DataFrame to a list of lists.
sorted_baskets = (sorted(items) for items in train_data['itemDescription'].tolist())
train_data = [basket for basket in sorted_baskets if len(basket) > 1]

In [None]:
# Show the container type and up to the first five baskets.
# Slicing (instead of indexing 0..4) avoids an IndexError when fewer than
# five baskets survive the filtering step.
print(type(train_data))
for basket in train_data[:5]:
    print(basket)

In [None]:
len(train_data)

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

# One-hot encode the baskets: one row per basket, one boolean column per item.
te = TransactionEncoder()
te_ary = te.fit_transform(train_data)
train_df = pd.DataFrame(te_ary, columns=te.columns_)
train_df

In [None]:
# Mine frequent itemsets with FP-Growth, then derive association rules
# filtered by a minimum confidence.
min_support = 0.002
min_confidence = 0.4

freq_items = fpgrowth(train_df, min_support=min_support, use_colnames=True)
rules = association_rules(freq_items, metric="confidence", min_threshold=min_confidence)
rules

In [None]:
# Print the smallest and largest value of Zhang's metric across the rules,
# then display summary statistics for every numeric rule column.
zhang_scores = rules['zhangs_metric']
print(zhang_scores.min())
print(zhang_scores.max())
rules.describe()

In [None]:
def predict_items(purchased_items, rule_table=None):
    """Suggest items implied by association rules for a set of purchases.

    A rule fires when every item of its antecedent is among
    ``purchased_items``; the items of its consequent that were not already
    purchased are then suggested, each at most once, in rule order.

    Args:
        purchased_items: Iterable of item names already bought.
        rule_table: Table indexable by ``"antecedents"`` and
            ``"consequents"``, each yielding a sequence of (frozen)sets in
            matching order, e.g. the DataFrame returned by
            ``association_rules``. Defaults to the notebook-level ``rules``.

    Returns:
        List of suggested item names without duplicates.
    """
    if rule_table is None:
        # Fall back to the rules mined earlier in the notebook.
        rule_table = rules

    # Sets give O(1) membership tests; the list keeps first-seen order.
    owned = set(purchased_items)
    already_suggested = set()
    predicted_items = []

    for antecedent, consequent in zip(rule_table["antecedents"], rule_table["consequents"]):
        if not antecedent.issubset(owned):
            continue
        for item in consequent:
            if item not in owned and item not in already_suggested:
                already_suggested.add(item)
                predicted_items.append(item)

    return predicted_items

### Example of using prediction function

In [None]:
# Run the rule-based predictor on one hand-written basket and print the result.
purchased_items = {
    'canned beer',
    'misc. beverages',
    'pastry',
    'pickled vegetables',
    'sausage',
    'soda',
    'yogurt',
}
predicted_items = predict_items(purchased_items)

# Items suggested by the rules that are not already in the basket.
print(predicted_items)

## 4. Collaborative filtering

In [None]:
# number of unique items in the dataset
df_train['itemDescription'].unique().size

In [None]:
# Count how often each member bought each item.
purchase_counts = df_train.groupby(['Member_number', 'itemDescription']).size().reset_index(name='Purchase_Count')

# Row and column labels of the member x item matrix.
members = df_train['Member_number'].unique()
items = df_train['itemDescription'].unique()

# Build the matrix in one vectorized pivot instead of writing it cell by cell:
# - reindex keeps the items in first-appearance column order,
# - missing (member, item) pairs become 0,
# - the result has an integer dtype rather than object,
# - rows are sorted by Member_number, which is also the index name.
new_df = (
    purchase_counts
    .pivot(index='Member_number', columns='itemDescription', values='Purchase_Count')
    .reindex(index=members, columns=items)
    .fillna(0)
    .astype(int)
    .rename_axis(index='Member_number', columns=None)
    .sort_index()
)
new_df.head()

### UV decomposition

In [None]:
import numpy as np

# UV decomposition function
def uv_decomposition(R, k, learning_rate, regularization, epochs=10, random_state=None):
    """Factorise ``R`` into ``U`` (users x k) and ``V`` (k x items) with SGD.

    Only entries of ``R`` that are greater than zero are treated as observed
    and contribute to the updates.

    Args:
        R: 2-D array-like of ratings/counts; values <= 0 are ignored.
        k: Target rank (number of latent factors).
        learning_rate: SGD step size.
        regularization: L2 penalty applied to the touched factor vectors.
        epochs: Number of passes over the observed entries (default 10).
        random_state: Optional seed for a reproducible initialisation. When
            ``None`` the global NumPy random state is used.

    Returns:
        Tuple ``(U, V)`` of NumPy arrays with shapes
        ``(num_users, k)`` and ``(k, num_items)``.
    """
    R = np.asarray(R, dtype=float)
    num_users, num_items = R.shape

    # Initialise both factors with uniform random values in [0, 1).
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    U = rng.rand(num_users, k)
    V = rng.rand(k, num_items)

    # Coordinates of observed entries, in row-major order.
    # This replaces a Python-level scan of every cell in every epoch.
    observed = np.argwhere(R > 0)

    for _ in range(epochs):
        for i, j in observed:
            # Snapshot the user factors so both updates use the values from
            # before this step (simultaneous gradient step).
            u_i = U[i, :].copy()
            v_j = V[:, j]
            error = R[i, j] - np.dot(u_i, v_j)
            U[i, :] += learning_rate * (error * v_j - regularization * u_i)
            V[:, j] += learning_rate * (error * u_i - regularization * v_j)

    # Return the decomposed matrices U and V
    return U, V

# RMSE calculation function
def rmse(R, U, V):
    """Return the RMSE between ``R`` and ``U @ V`` over the known entries.

    Known entries are the cells of ``R`` that are greater than zero.

    Args:
        R: 2-D array-like of actual values (array, nested list or DataFrame).
        U: Left factor, shape ``(rows of R, k)``.
        V: Right factor, shape ``(k, columns of R)``.

    Returns:
        The root mean squared error as a float, or NaN when ``R`` has no
        entry greater than zero.
    """
    # Coerce to a float array so boolean masking selects a flat vector of
    # known cells regardless of the input container.
    R = np.asarray(R, dtype=float)
    known = R > 0
    if not known.any():
        # Nothing to evaluate: report NaN explicitly instead of averaging an
        # empty selection.
        return float('nan')

    residuals = R[known] - np.dot(U, V)[known]
    return np.sqrt(np.mean(residuals ** 2))

In [None]:
# Convert the purchase counts DataFrame to a numpy array
R = new_df.to_numpy()
R

In [None]:
# Track the rank with the lowest RMSE seen so far.
best_k = None
best_rmse = float('inf')

# Perform UV decomposition for different values of k (ranks 1 through 7),
# each with learning rate 0.1 and regularization 0.1.
# NOTE(review): the RMSE is computed on the same matrix R that the factors
# were fitted on, so this selects by training error — consider a held-out set.
for k in range(1, 8):
    U, V = uv_decomposition(R, k, 0.1, 0.1)
    error = rmse(R, U, V)
    print(f"k: {k}, RMSE: {error}")
    # Strict comparison keeps the smallest k when two ranks tie.
    if error < best_rmse:
        best_rmse = error
        best_k = k

print(f"Best k: {best_k}, Best RMSE: {best_rmse}")

In [None]:
# Refit the factorisation on the train matrix using the selected rank.
U, V = uv_decomposition(R, best_k, 0.1, 0.1)

# Multiply the factors back together and label the result like new_df.
new_df_columns = new_df.columns
user_ratings = pd.DataFrame(U @ V, index=new_df.index, columns=new_df_columns)

# Keep scores only where the member has no recorded purchase (R == 0);
# every other cell is set to 0.
not_yet_purchased = R == 0
user_ratings = user_ratings.where(not_yet_purchased).fillna(0)
user_ratings  # predicted score of each member for each item

In [None]:
# Min-max scale the scores to the 0-1 range; min/max are taken per column,
# i.e. separately for every item.
# NOTE(review): a column whose values are all equal divides 0 by 0 — confirm
# whether the resulting NaN scores are acceptable downstream.
column_min = user_ratings.min()
column_span = user_ratings.max() - column_min
user_ratings = (user_ratings - column_min) / column_span
user_ratings

In [None]:
# Collect, for every member, the five highest-scored items as
# (item, score) pairs, best first.
a = [
    list(scores.sort_values(ascending=False).head(5).items())
    for _, scores in user_ratings.iterrows()
]

# One row per member, one column per rank; each cell holds an (item, score) tuple.
member_recommendations = pd.DataFrame(
    a,
    columns=["top_1", "top_2", "top_3", "top_4", "top_5"],
    index=user_ratings.index,
)
member_recommendations.head()

## 5. Recommendation methods

In [None]:
# Hit rate of the top-5 recommendations: a member counts as a hit when at
# least one recommended item appears among that member's test purchases.
itemsets = df_test.groupby("Member_number")["itemDescription"].apply(list).reset_index()

# Align by Member_number, not by row position: the recommendation table is
# indexed by train members while itemsets lists test members, so row i of one
# is generally a different member than row i of the other.
test_baskets = itemsets.set_index("Member_number")["itemDescription"]
common_members = member_recommendations.index.intersection(test_baskets.index)

count = 0
for member in common_members:
    bought = set(test_baskets.loc[member])
    # Each cell is an (item, score) tuple; only the item name is compared.
    recommended = (entry[0] for entry in member_recommendations.loc[member])
    if any(item in bought for item in recommended):
        count += 1

print("precision rate:")
# Only members present in both splits can be evaluated; guard the empty case.
raw_precision = count / len(common_members) if len(common_members) else 0.0
print(raw_precision)

In [None]:
df_test

In [None]:
# # output recomendation use pattern, use predict_items function
# # df_origin = pd.read_csv('../data/Groceries data train.csv')

# pattern_recommendation = new_df.copy()
# pattern_recommendation.drop(pattern_recommendation.columns, axis=1, inplace=True)

# # itemsets = df_origin.groupby("Member_number")["itemDescription"].apply(list).reset_index()
# item_array = []
# for i in range(itemsets.shape[0]):
#   current_itemset = itemsets.iloc[i, 1]
#   predicted_items = predict_items(current_itemset)
#   item_array.append(predicted_items)
# pattern_recommendation['recommendation_item'] = item_array
# pattern_recommendation.head(5)

### Simple combination

In [None]:
# combine association results and collaborative filtering ratings to recommend
def combine_recommendations(predicted_items, uv_top5):
    """Merge rule-based predictions with the collaborative-filtering top list.

    Items of ``uv_top5`` that also occur in ``predicted_items`` are
    recommended, ordered by descending rating. If the two sources share no
    item, the single best-rated item of ``uv_top5`` is recommended instead.

    Args:
        predicted_items: Iterable of item names suggested by the rules.
        uv_top5: Mapping of item name to rating (anything with ``.items()``).

    Returns:
        A list holding one inner list of recommended item names. The inner
        list is empty only when ``uv_top5`` itself is empty.
    """
    rule_items = set(predicted_items)

    # Item names ordered from highest to lowest rating (ties keep input order).
    ranked_items = [
        item
        for item, _ in sorted(uv_top5.items(), key=lambda pair: pair[1], reverse=True)
    ]

    agreed = [item for item in ranked_items if item in rule_items]
    # Fall back to the best-rated item; skip the fallback when there is none
    # so an empty uv_top5 no longer raises IndexError.
    if not agreed and ranked_items:
        agreed.append(ranked_items[0])

    return [agreed]

## 6. Results