In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#read training file
df = pd.read_csv('Groceries-data-train.csv')
df.head(5)


Unnamed: 0,Member_number,Date,itemDescription,year,month,day,day_of_week
0,3021,30/01/2015,frankfurter,2015,1,30,4
1,1292,24/10/2015,pork,2015,10,24,5
2,4206,4/04/2014,root vegetables,2014,4,4,4
3,4369,25/08/2015,onions,2015,8,25,1
4,1522,1/07/2014,waffles,2014,7,1,1


In [3]:
#see how many unique item in training file
items = df['itemDescription'].unique()
items.shape


(167,)

In [4]:
#put same-day-purchased items in one row
member_df = df.groupby(['Member_number', 'Date'], as_index=False).agg({
    'itemDescription': ', '.join,
    'year': 'first',  # Retain the year (or use any other appropriate method)
    'month': 'first',  # Retain the month
    'day': 'first',  # Retain the day
    'day_of_week': 'first'  # Retain the day of the week
})
member_df.head(5)



Unnamed: 0,Member_number,Date,itemDescription,year,month,day,day_of_week
0,1000,15/03/2015,"sausage, yogurt",2015,3,15,6
1,1000,24/06/2014,pastry,2014,6,24,1
2,1000,24/07/2015,"misc. beverages, canned beer",2015,7,24,4
3,1000,25/11/2015,sausage,2015,11,25,2
4,1000,27/05/2015,"soda, pickled vegetables",2015,5,27,2


In [5]:
#a member one row for items

# Group by Member_number and itemDescription, then count the occurrences
purchase_counts = df.groupby(['Member_number', 'itemDescription']).size().reset_index(name='Purchase_Count')


# Initialize an empty DataFrame with Member_number as rows and itemDescription as columns
members = df['Member_number'].unique()
items = df['itemDescription'].unique()

new_df = pd.DataFrame(index=members, columns=items).fillna(0)

# Populate the new DataFrame with purchase counts
for _, row in purchase_counts.iterrows():
    member = row['Member_number']
    item = row['itemDescription']
    count = row['Purchase_Count']
    new_df.at[member, item] = count

# Reset the index to make Member_number a column
new_df = new_df.reset_index().rename(columns={'index': 'Member_number'})



In [6]:
#sort by member-number
new_df.set_index('Member_number', inplace=True)
new_df = new_df.sort_index()
new_df.shape
new_df.head(5)

Unnamed: 0_level_0,frankfurter,pork,root vegetables,onions,waffles,cereals,yogurt,sausage,rolls/buns,chocolate,...,organic sausage,house keeping products,frozen fruits,bathroom cleaner,skin care,bags,rubbing alcohol,make up remover,preservation products,kitchen utensil
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0,0,0,0,0,0,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1001,1,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1003,0,0,1,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1004,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


# TF-IDF
Using tf-idf method to calculate how important an item for a member to see wether we can get better result

In [19]:
#calculate TF
tf = new_df.copy()
# Count the number of 0s in each row
tf = tf.apply(lambda row: row / (167 - (row == 0).sum()), axis=1)


  and should_run_async(code)


In [20]:
#caluculate IDF
columns = tf.columns
idf = []
for i in columns:
  idf.append(np.log10(3872 / (tf[i]!=0).sum())) # number of member / number of member purchse this item

  and should_run_async(code)


In [45]:
#combine TF and IDF
tf_idf = tf.apply(lambda row: row * idf, axis=1)
tf_idf.head(5)

  and should_run_async(code)


Unnamed: 0_level_0,frankfurter,pork,root vegetables,onions,waffles,cereals,yogurt,sausage,rolls/buns,chocolate,...,organic sausage,house keeping products,frozen fruits,bathroom cleaner,skin care,bags,rubbing alcohol,make up remover,preservation products,kitchen utensil
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0.0,0.0,0.0,0.0,0.0,0.0,0.098068,0.236864,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001,0.110394,0.0,0.0,0.0,0.0,0.0,0.0,0.092114,0.064418,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1003,0.0,0.0,0.154206,0.0,0.0,0.0,0.0,0.165805,0.115952,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044597,0.092077,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# UV decomposition

In [21]:
# build UV decomposition
def uv_decomposition(R, k, learning_rate, regularization):
    """
    Performs UV decomposition on the input matrix R, with a target rank of k, using stochastic gradient descent (SGD).
    Returns the decomposed matrices U and V.
    """
    # Initialize U and V with random values
    num_users, num_items = R.shape
    U = np.random.rand(num_users, k)
    V = np.random.rand(k, num_items)

    # Perform stochastic gradient descent to optimize U and V
    for epoch in range(10):
        for i in range(num_users):
            for j in range(num_items):
                if R[i, j] > 0:
                    error = R[i, j] - np.dot(U[i, :], V[:, j])
                    U[i, :] += learning_rate * (error * V[:, j] - regularization * U[i, :])
                    V[:, j] += learning_rate * (error * U[i, :] - regularization * V[:, j])

    # Return the decomposed matrices U and V
    return U, V

numpy_array = new_df.to_numpy()


  and should_run_async(code)


In [40]:
#To choose best k by RMSE
def calculate_rmse(R, U, V):
    predicted_R = np.dot(U, V)
    error = R - predicted_R
    error = error[R > 0]  # Only consider known values
    return np.sqrt(np.mean(error**2))

k_values = [1,2,3,4,5,6,7,8,9]

# Split the data into training and validation sets

best_k = None
best_rmse = float('inf')

for k in k_values:
    U, V = uv_decomposition(numpy_array, k, 0.1, 0.1)
    rmse = calculate_rmse(numpy_array, U, V)
    print(f"k={k}, RMSE={rmse}")
    if rmse < best_rmse:
        best_rmse = rmse
        best_k = k

print(f"Best k: {best_k} with RMSE: {best_rmse}")

  and should_run_async(code)


k=1, RMSE=0.29878288392241653
k=2, RMSE=0.2982847953509803
k=3, RMSE=0.29784849381336337
k=4, RMSE=0.2966691820852954
k=5, RMSE=0.29730149538829903
k=6, RMSE=0.2990599310125329
k=7, RMSE=0.2975384072859363
k=8, RMSE=0.29866949663947767
k=9, RMSE=0.3016184958774464
Best k: 4 with RMSE: 0.2966691820852954


In [41]:
# Perform UV decomposition on "raw training data"
U, V = uv_decomposition(numpy_array, k=best_k, learning_rate=0.1, regularization=0.1)

# Reconstruct the matrix using the decomposed matrices U and V
new_df_columns = new_df.columns
numpy_array_reconstructed = np.dot(U, V)
df_reconstructed = pd.DataFrame(numpy_array_reconstructed, columns=new_df_columns)
df_reconstructed.index = new_df.index

#remove the items customer already bought
assist_df = new_df.apply(lambda row: row.apply(lambda x: 0 if x != 0 else 1), axis=1)
df_reconstructed = assist_df*df_reconstructed


  and should_run_async(code)


In [42]:
#print recommendation
a=[]
for i in range(df_reconstructed.shape[0]):
  row = df_reconstructed.iloc[i]
  top_5_values = row.nlargest(5)
  top_5_columns = top_5_values.index
  b = []
  for ii in range(5):
      b.append((top_5_columns[ii], top_5_values[ii]))
  a.append(b)
member_recomendation = pd.DataFrame(index=new_df.index, columns=["top_1", "top_2", "top_3", "top_4", "top_5"], data = a)
member_recomendation.head(5)

  and should_run_async(code)


Unnamed: 0_level_0,top_1,top_2,top_3,top_4,top_5
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,"(margarine, 1.3082271198249733)","(preservation products, 1.2855658210153031)","(rolls/buns, 1.1935428039422735)","(other vegetables, 1.1934063485451725)","(soft cheese, 1.118494706519626)"
1001,"(margarine, 1.4492929350820847)","(yogurt, 1.356157625391391)","(other vegetables, 1.3478335716268381)","(preservation products, 1.3115445493391817)","(pip fruit, 1.250879945640325)"
1002,"(margarine, 1.1680219517835857)","(other vegetables, 1.127224420925145)","(yogurt, 1.0971583405388787)","(sausage, 1.0613581865000803)","(kitchen utensil, 1.0088289129188732)"
1003,"(margarine, 1.1422646924861186)","(preservation products, 1.1366702287234944)","(yogurt, 1.0591754131166573)","(other vegetables, 1.0140183870052553)","(pip fruit, 1.0106245135234002)"
1004,"(margarine, 1.291869962835066)","(yogurt, 1.2090056680251327)","(sausage, 1.1726059956854897)","(preservation products, 1.1715978829438383)","(soft cheese, 1.099217064150257)"


In [46]:
# Perform UV decomposition on "tf_idf training data"

tfidf_numpy_array = tf_idf.to_numpy()

U_, V_ = uv_decomposition(tfidf_numpy_array, k=best_k, learning_rate=0.1, regularization=0.1)

# Reconstruct the matrix R using the decomposed matrices U and V
tfidf_numpy_array_reconstructed = np.dot(U_, V_)
df_tfidf = pd.DataFrame(tfidf_numpy_array_reconstructed, columns=new_df_columns)
df_tfidf.index = new_df.index

#remove the items customer already bought
assist_df = new_df.apply(lambda row: row.apply(lambda x: 0 if x != 0 else 1), axis=1)
df_tfidf = assist_df*df_tfidf
df_tfidf.head(5)

  and should_run_async(code)


Unnamed: 0_level_0,frankfurter,pork,root vegetables,onions,waffles,cereals,yogurt,sausage,rolls/buns,chocolate,...,organic sausage,house keeping products,frozen fruits,bathroom cleaner,skin care,bags,rubbing alcohol,make up remover,preservation products,kitchen utensil
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0.066243,0.115196,0.073391,0.087642,0.114352,0.131872,0.0,0.0,0.052574,0.076618,...,0.185165,0.139342,0.382986,0.202231,0.118969,0.196687,0.250654,0.23373,0.153556,0.316161
1001,0.0,0.102036,0.060021,0.081747,0.100103,0.111255,0.056451,0.0,0.0,0.063526,...,0.155034,0.119073,0.359775,0.191708,0.086601,0.22646,0.226012,0.255122,0.135803,0.28372
1002,0.098027,0.163734,0.103453,0.141166,0.165556,0.187445,0.091878,0.117852,0.077715,0.108839,...,0.23248,0.199069,0.559979,0.308001,0.1394,0.347813,0.313232,0.374591,0.316911,0.396386
1003,0.097573,0.156734,0.0,0.137655,0.154353,0.193054,0.085914,0.0,0.0,0.115629,...,0.205877,0.194596,0.45694,0.249167,0.145471,0.234265,0.26333,0.281502,0.353015,0.341082
1004,0.034325,0.06219,0.038381,0.043626,0.061405,0.069032,0.034334,0.045984,0.0,0.0,...,0.108057,0.07408,0.214527,0.111087,0.067734,0.107372,0.149878,0.130353,0.054418,0.187711


In [47]:
#print recommendation
z=[]
for i in range(df_tfidf.shape[0]):
  row = df_tfidf.iloc[i]
  top_5_values = row.nlargest(5)
  top_5_columns = top_5_values.index
  y = []
  for ii in range(5):
      y.append((top_5_columns[ii], top_5_values[ii]))
  z.append(y)
tfidf_member_recomendation = pd.DataFrame(index=new_df.index, columns=["top_1", "top_2", "top_3", "top_4", "top_5"], data = z)
tfidf_member_recomendation.head(5)

  and should_run_async(code)


Unnamed: 0_level_0,top_1,top_2,top_3,top_4,top_5
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,"(organic products, 0.41577339077064596)","(frozen fruits, 0.38298578612968226)","(cooking chocolate, 0.3418215272706751)","(cream, 0.3224324472727082)","(kitchen utensil, 0.31616084561080776)"
1001,"(organic products, 0.3601222306025819)","(frozen fruits, 0.35977501364250297)","(kitchen utensil, 0.28371955865556825)","(cream, 0.26999796933167947)","(cooking chocolate, 0.26801940158080073)"
1002,"(organic products, 0.5898869347813257)","(frozen fruits, 0.5599793558096743)","(cream, 0.4342941373827597)","(cooking chocolate, 0.42340375266476027)","(kitchen utensil, 0.39638594081468087)"
1003,"(organic products, 0.5710613924306783)","(cooking chocolate, 0.48149860811635675)","(frozen fruits, 0.45693969867590084)","(nut snack, 0.3871458823406847)","(cream, 0.3866573982326153)"
1004,"(organic products, 0.22313307736057755)","(frozen fruits, 0.21452706616118888)","(kitchen utensil, 0.1877113356609454)","(cooking chocolate, 0.18641710951826942)","(cream, 0.18141040335210848)"


# Measure the performance.

In [22]:
df_test = pd.read_csv('Groceries-data-test.csv')
df_test.head(5)

  and should_run_async(code)


Unnamed: 0,Member_number,Date,itemDescription,year,month,day,day_of_week
0,3481,8/03/2015,candy,2015,3,8,6
1,1254,19/04/2015,white wine,2015,4,19,6
2,2835,28/01/2014,domestic eggs,2014,1,28,1
3,2854,2/08/2015,coffee,2015,8,2,6
4,4637,12/08/2014,bottled water,2014,8,12,1


In [23]:
#a member one row for items

# Group by Member_number and itemDescription, then count the occurrences
purchase_counts = df_test.groupby(['Member_number', 'itemDescription']).size().reset_index(name='Purchase_Count')


# Initialize an empty DataFrame with Member_number as rows and itemDescription as columns
members = df_test['Member_number'].unique()
items = df_test['itemDescription'].unique()

new_df_test = pd.DataFrame(index=members, columns=items).fillna(0)

# Populate the new DataFrame with purchase counts
for _, row in purchase_counts.iterrows():
    member = row['Member_number']
    item = row['itemDescription']
    count = row['Purchase_Count']
    new_df_test.at[member, item] = count

# Reset the index to make Member_number a column
new_df_test = new_df_test.reset_index().rename(columns={'index': 'Member_number'})

#sort by member-number
new_df_test.set_index('Member_number', inplace=True)
new_df_test = new_df_test.sort_index()
new_df_test.head(5)


  and should_run_async(code)


Unnamed: 0_level_0,candy,white wine,domestic eggs,coffee,bottled water,long life bakery product,soda,frankfurter,pip fruit,tropical fruit,...,frozen chicken,make up remover,brandy,cooking chocolate,bags,canned fruit,pudding powder,potato products,soap,honey
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1001,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1004,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#print recommendation
a=[]
for i in range(df_reconstructed.shape[0]):
  row = df_reconstructed.iloc[i]
  top_5_values = row.nlargest(5)
  top_5_columns = top_5_values.index
  b = []
  for ii in range(5):
      b.append((top_5_columns[ii], top_5_values[ii]))
  a.append(b)
member_recomendation = pd.DataFrame(index=new_df.index, columns=["top_1", "top_2", "top_3", "top_4", "top_5"], data = a)
member_recomendation.head(5)

Unnamed: 0_level_0,top_1,top_2,top_3,top_4,top_5
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,"(margarine, 1.2849648535978768)","(other vegetables, 1.201540411669832)","(kitchen utensil, 1.1045171861784642)","(soft cheese, 1.1004399112548309)","(pip fruit, 1.0995472685951406)"
1001,"(margarine, 1.4406389001002426)","(yogurt, 1.3540748292140286)","(other vegetables, 1.316506651009498)","(soft cheese, 1.2290707878622753)","(pip fruit, 1.2245451323414953)"
1002,"(margarine, 1.1832694214309516)","(other vegetables, 1.1448411516729162)","(yogurt, 1.1106298616568135)","(sausage, 1.0912656892046342)","(soft cheese, 1.021887175981817)"
1003,"(margarine, 1.1097777614379831)","(yogurt, 1.0256392647362138)","(other vegetables, 1.014346646920453)","(frozen chicken, 0.9813566082816138)","(pip fruit, 0.9660646845915389)"
1004,"(margarine, 1.2903247393140707)","(yogurt, 1.2299844235589805)","(sausage, 1.1711872057166286)","(preservation products, 1.1238265523242652)","(soda, 1.0997021050306057)"


In [None]:
#calculate precise rate : if item of recomendation is in test dataset => count +=1
itemsets = df_test.groupby("Member_number")["itemDescription"].apply(list).reset_index()
x = set(itemsets.iloc[0, 1])
count = 0
for i in range(itemsets.shape[0]):
  recomendation_item = member_recomendation.iloc[i, 0]
  if recomendation_item[0] in set(itemsets.iloc[i, 1]):
    count += 1

print("precise rate:")
raw_precise = count/itemsets.shape[0]
print(raw_precise)


precise rate:
0.044868199663488505


In [None]:
#print recommendation from tf-idf
z=[]
for i in range(df_tfidf.shape[0]):
  row = df_tfidf.iloc[i]
  top_5_values = row.nlargest(5)
  top_5_columns = top_5_values.index
  y = []
  for ii in range(5):
      y.append((top_5_columns[ii], top_5_values[ii]))
  z.append(y)
tfidf_member_recomendation = pd.DataFrame(index=new_df.index, columns=["top_1", "top_2", "top_3", "top_4", "top_5"], data = z)
tfidf_member_recomendation.head(5)

Unnamed: 0_level_0,top_1,top_2,top_3,top_4,top_5
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,"(cooking chocolate, 0.41080111598520475)","(baby cosmetics, 0.3572999467856195)","(frozen fruits, 0.3408977377871869)","(preservation products, 0.3275686062439422)","(organic products, 0.29429333828934856)"
1001,"(preservation products, 0.31425391465803953)","(cream, 0.2928758713022358)","(cooking chocolate, 0.26614323234164006)","(frozen chicken, 0.2284968965909826)","(organic products, 0.19939229226684532)"
1002,"(organic products, 0.6790411256115658)","(frozen fruits, 0.5570196363816463)","(preservation products, 0.5020743998732272)","(cream, 0.49559994415434333)","(cooking chocolate, 0.48110545643306435)"
1003,"(preservation products, 0.6104304546369154)","(organic products, 0.5909800009931706)","(cream, 0.5705579324389479)","(cooking chocolate, 0.5612526189917846)","(frozen chicken, 0.47835334463570583)"
1004,"(preservation products, 0.2008378640696126)","(cream, 0.18505071089871628)","(cooking chocolate, 0.1833504156013449)","(organic products, 0.1752087630848635)","(frozen chicken, 0.15751146213815245)"


In [None]:
#calculate precise rate : if item of recomendation is in test dataset => count +=1

itemsets = df_test.groupby("Member_number")["itemDescription"].apply(list).reset_index()
x = set(itemsets.iloc[0, 1])
count = 0
for i in range(itemsets.shape[0]):
  recomendation_item = tfidf_member_recomendation.iloc[i, 0]
  if recomendation_item[0] in set(itemsets.iloc[i, 1]):
    count += 1

print("precise rate:")
tfidf_precise = count/itemsets.shape[0]

print(tfidf_precise)


precise rate:
0.0008412787436904094


In [None]:
print(f"raw data rmse: {raw_precise}")
print(f"tf-idf data rmse: {tfidf_precise}")

raw data rmse: 0.044868199663488505
tf-idf data rmse: 0.0008412787436904094


*Don't do tf-idf to data will get better result.*

# Make recommendation from pattern(task 1)

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth

In [34]:
#from task 1

data = pd.read_csv('Groceries-data-train.csv')

# Group the data by 'Member_number' and 'Date', then aggregate the 'itemDescription' into lists
transactions_data = data.groupby(['Member_number'])['itemDescription'].apply(list).reset_index()

# Extract the list of transactions
transactions = transactions_data['itemDescription'].tolist()

sorted_transactions = [sorted(list(set(transaction))) for transaction in transactions]

transactions = sorted_transactions

transactions = [transaction for transaction in transactions if len(transaction) > 1]

# Display the first transaction as an example
transactions[0]

te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

  and should_run_async(code)


In [12]:
#from task 1
from mlxtend.frequent_patterns import apriori, association_rules

frequent_itemsets = apriori(df, min_support=0.002, use_colnames=True)

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.4)
rules.head(5)


  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Instant food products),(root vegetables),0.011318,0.175963,0.004581,0.404762,2.300263,0.002589,1.384382,0.571738
1,(Instant food products),(soda),0.011318,0.243331,0.00485,0.428571,1.761272,0.002097,1.324171,0.437176
2,(Instant food products),(whole milk),0.011318,0.359472,0.004581,0.404762,1.125991,0.000513,1.076087,0.113174
3,(UHT-milk),(whole milk),0.054702,0.359472,0.023713,0.433498,1.205929,0.004049,1.130671,0.180646
4,(artif. sweetener),(other vegetables),0.005659,0.292643,0.002425,0.428571,1.464483,0.000769,1.237874,0.31897


In [13]:
#from task 1

def predict_items(purchased_items):
    # create a list to store the predicted items
    predicted_items = []

    # iterate over the top rules
    for index, row in rules.iterrows():
        # get the items in the antecedent and consequent of the rule
        antecedent = row["antecedents"]
        consequent = row["consequents"]

        # check if all the items in the antecedent are in the purchased items
        if antecedent.issubset(purchased_items):
            # add the items in the consequent to the predicted items
            for item in consequent:
                if item not in purchased_items and item not in predicted_items:
                    predicted_items.append(item)

    # return the list of predicted items
    return predicted_items

  and should_run_async(code)


In [38]:
# make a prediction for a sample set of purchased items
purchased_items = {'canned beer',
 'misc. beverages',
 'pastry',
 'pickled vegetables',
 'sausage',
 'soda',
 'yogurt'}
predicted_items = predict_items(purchased_items)

# print the predicted items to the console
print(predicted_items)

  and should_run_async(code)


['whole milk', 'rolls/buns', 'other vegetables', 'pip fruit']


In [37]:
#output recomendation set
df_origin = pd.read_csv('Groceries-data-train.csv')

pattern_recommendation = new_df.copy()
pattern_recommendation.drop(pattern_recommendation.columns, axis=1, inplace=True)

itemsets = df_origin.groupby("Member_number")["itemDescription"].apply(list).reset_index()
item_array = []
for i in range(itemsets.shape[0]):
  current_itemset = itemsets.iloc[i, 1]
  predicted_items = predict_items(current_itemset)
  item_array.append(predicted_items)
pattern_recommendation['recommendation_item'] = item_array
pattern_recommendation.head(5)

  and should_run_async(code)


Unnamed: 0_level_0,recommendation_item
Member_number,Unnamed: 1_level_1
1000,"[whole milk, rolls/buns, other vegetables, pip..."
1001,"[other vegetables, yogurt, pip fruit]"
1002,"[rolls/buns, other vegetables, yogurt, soda]"
1003,"[whole milk, other vegetables]"
1004,"[soda, root vegetables, yogurt, bottled water,..."
