# Content-based Filtering Model

In [112]:
import csv
import random
import tensorflow as tf

Create the list of brand choices used for one-hot encoding the user's favorite brand.

In [113]:
# Container for the parsed user features; the CSV-loading cell fills it in.
user_dict = dict()
# Favorite-brand options; a brand's position in this list is its one-hot index.
brand_choice_list = [
  'Oppo', 'Asus', 'Infinix', 'Samsung', 'Vivo', 'Huawei',
  'Apple', 'Realme', 'Xiaomi', 'Poco',
  'lainnya/tidak ada',
]

Access user dataset from csv file.

In [114]:
# Parse the user CSV. `user_index` records the CSV row number of every kept user;
# the rating-loading cell uses it to match rating columns to users.
# newline='' is the mode the csv module documents for files passed to csv.reader.
with open('user_dataset_revised.csv', 'r', newline='') as file:
  user_dict['user'] = []
  user_dict['user_fav'] = []
  user_index = []
  csvFile = csv.reader(file)
  for i, line in enumerate(csvFile):
    # Skip every row that has a field equal to '0'
    # (presumably users who gave no information -- TODO confirm against the dataset).
    if '0' in line:
      continue
    user_index.append(i)
    # Columns 0-5 are the six integer-valued user features.
    user_dict['user'].append([int(value) for value in line[:6]])
    # Column 7 is the favorite brand name; store its position in brand_choice_list.
    user_dict['user_fav'].append(brand_choice_list.index(line[7]))

Convert the numeric user features to a tensor, one-hot encode the favorite brand, and concatenate the two into a single user feature matrix.

In [115]:
# Six numeric features per user, as a float32 matrix.
numeric_user_features = tf.convert_to_tensor(user_dict['user'])
user_feature1_6 = tf.cast(numeric_user_features, tf.float32)
# Favorite brand, one-hot encoded over all brand choices.
user_feature7 = tf.one_hot(user_dict['user_fav'], depth=len(brand_choice_list))
# Final user matrix: numeric columns first, then the one-hot brand columns.
user = tf.concat([user_feature1_6, user_feature7], axis=1)

User features are performance, camera, battery, software, RAM, storage, and favorite brand.

In [116]:
# Inspect the feature row of the first kept user.
user[0]

<tf.Tensor: shape=(17,), dtype=float32, numpy=
array([1., 1., 5., 1., 3., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
      dtype=float32)>

Create a brand list and os list for one hot encoding.

In [117]:
phone_dict = dict()
# Phone brands; a brand's position in this list is its one-hot index.
brand_list = [
  'Oppo', 'Asus', 'Infinix', 'Samsung', 'Vivo',
  'Huawei', 'Apple', 'Realme', 'Xiaomi', 'Poco',
]
# Operating-system families; position is the one-hot index.
os_list = ['iOS', 'EMUI', 'Android']

Access phone dataset from csv file.<br>
Preprocess the data: one-hot encode the categorical features and scale some of the numeric ones.<br>
Then concatenate all the phone data.

In [118]:
# Parse the phone CSV into one list per attribute; all lists share the same row order.
# newline='' is the mode the csv module documents for files passed to csv.reader.
with open('phone_dataset_cleaned.csv', 'r', newline='') as file:
  phone_dict = {
    'phone_brand': [], 'phone_name': [], 'phone_weight': [], 'phone_os': [],
    'phone_chipset': [], 'phone_memory': [], 'phone_ram': [], 'phone_camera': [],
    'phone_audio_jack': [], 'phone_battery': [], 'phone_charging': [],
    'phone_nfc': [], 'phone_price': [],
  }
  csvFile = csv.reader(file)
  next(csvFile)  # discard the first row (presumably the header -- confirm against the file)
  for line in csvFile:
    # Brand is stored as its position in brand_list (raises ValueError for an unknown brand).
    phone_dict['phone_brand'].append(brand_list.index(line[1]))
    phone_dict['phone_name'].append(line[2])
    phone_dict['phone_weight'].append(float(line[6]))

    # Collapse the OS string into one of the three families in os_list;
    # anything that is not iOS or EMUI is treated as Android.
    os_name = line[7]
    if os_name.startswith('iOS'):
      os_family = 'iOS'
    elif os_name.startswith('EMUI'):
      os_family = 'EMUI'
    else:
      os_family = 'Android'
    phone_dict['phone_os'].append(os_list.index(os_family))

    # Chipset stays a raw string here; a later cell converts it to an index.
    phone_dict['phone_chipset'].append(line[8])
    phone_dict['phone_memory'].append(float(line[9]))
    phone_dict['phone_ram'].append(float(line[10]))
    # Columns 11-14 are the four camera values, kept together as one row.
    phone_dict['phone_camera'].append([float(line[11]), float(line[12]), float(line[13]), float(line[14])])
    # Boolean columns are the literal text 'TRUE'; anything else counts as 0.0.
    phone_dict['phone_audio_jack'].append(1.0 if line[15] == 'TRUE' else 0.0)
    phone_dict['phone_battery'].append(float(line[16]))
    phone_dict['phone_charging'].append(float(line[17]))
    phone_dict['phone_nfc'].append(1.0 if line[19] == 'TRUE' else 0.0)
    phone_dict['phone_price'].append(float(line[20]))

In [119]:
# Encode each chipset name as its position in the sorted list of distinct names.
chipset_list = sorted(set(phone_dict['phone_chipset']))
# One dictionary lookup per phone instead of a linear list.index() scan.
chipset_position = {name: position for position, name in enumerate(chipset_list)}
phone_dict['phone_chipset'] = [chipset_position[name] for name in phone_dict['phone_chipset']]

In [120]:
def _column(values):
  """Convert a flat list of numbers into a tensor with a trailing axis of size 1."""
  return tf.expand_dims(tf.convert_to_tensor(values), 1)


def _max_scaled_column(values):
  """Like _column, but with every entry divided by the largest value in the list."""
  return _column(values) / max(values)


# Categorical attributes -> one-hot blocks.
phone_brand = tf.one_hot(phone_dict['phone_brand'], depth=len(brand_list))
phone_os = tf.one_hot(phone_dict['phone_os'], depth=len(os_list))
phone_chipset = tf.one_hot(phone_dict['phone_chipset'], depth=len(chipset_list))

# Numeric attributes divided by their maximum.
phone_weight = _max_scaled_column(phone_dict['phone_weight'])
phone_memory = _max_scaled_column(phone_dict['phone_memory'])
phone_ram = _max_scaled_column(phone_dict['phone_ram'])
phone_battery = _max_scaled_column(phone_dict['phone_battery'])
phone_price = _max_scaled_column(phone_dict['phone_price'])

# Attributes used exactly as loaded (no scaling applied).
phone_camera = tf.convert_to_tensor(phone_dict['phone_camera'])
phone_audio_jack = _column(phone_dict['phone_audio_jack'])
phone_charging = _column(phone_dict['phone_charging'])
phone_nfc = _column(phone_dict['phone_nfc'])

In [121]:
# The order of this list fixes the column layout of every phone feature row.
phone_feature_blocks = [
  phone_brand,
  phone_weight,
  phone_os,
  phone_chipset,
  phone_memory,
  phone_ram,
  phone_camera,
  phone_audio_jack,
  phone_battery,
  phone_charging,
  phone_nfc,
  phone_price,
]
phone = tf.concat(phone_feature_blocks, axis=1)

Phone features are brand, weight, os, chipset, memory, RAM, camera 1, camera 2, camera 3, selfie camera, audio jack, battery, charging, nfc, and price.

In [11]:
# Inspect the feature row of the first phone.
phone[0]

<tf.Tensor: shape=(75,), dtype=float32, numpy=
array([ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.33643124,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.5       ,
        0.5      

Create a rating dictionary and a rating list.

In [12]:
# NOTE(review): rating_dict is not referenced by any other cell visible in this notebook.
rating_dict = {}

In [13]:
# Rating matrix: one row per phone, one column per kept user.
# Sizes come from the loaded data rather than hard-coded counts, so the matrix
# keeps matching the datasets when they change. 0.0 marks "no rating" -- only
# entries greater than 0 are turned into training samples later.
num_phones = len(phone_dict['phone_name'])
num_users = len(user_index)
rating = [[0.0] * num_users for _ in range(num_phones)]

In [14]:
# Sanity check: the freshly created matrix holds 0.0.
rating[0][0]

0.0

Access rating dataset from csv file and only save the ratings from the users with information.

In [15]:
# Fill the rating matrix. CSV row i is written to rating[i]; CSV column j is kept
# only when j appears in user_index, and lands in that user's column of `rating`.
# newline='' is the mode the csv module documents for files passed to csv.reader.
with open('rating_dataset_cleaned.csv', 'r', newline='') as file:
  # CSV column number -> column in `rating`, built once so each cell costs one
  # dictionary lookup instead of two linear scans of user_index.
  rating_column = {csv_column: slot for slot, csv_column in enumerate(user_index)}
  csvFile = csv.reader(file)
  for i, line in enumerate(csvFile):
    for j, r in enumerate(line):
      slot = rating_column.get(j)
      if slot is not None:
        rating[i][slot] = float(r)

In [16]:
# Spot-check a few (phone row, user column) entries of the rating matrix.
for row, col in ((0, 0), (4, 1), (9, 0)):
  print(rating[row][col])

0.0
0.0
0.0


In [17]:
# Parallel lists: entry k of each list belongs to the same (phone, user, rating) sample.
phone_data, user_data, rating_data = [], [], []

Build the training samples: keep a (phone, user, rating) triple only for pairs where the user actually rated the phone.

In [18]:
# Start from empty lists so re-running this cell does not append duplicate samples.
phone_data, user_data, rating_data = [], [], []
for i, p in enumerate(phone):
  phone_ratings = rating[i]
  for j, u in enumerate(user):
    score = phone_ratings[j]
    # A rating of 0 means this user did not rate this phone.
    if score > 0:
      # p and u are the same rows as phone[i] and user[j]; reuse them directly.
      phone_data.append(p)
      user_data.append(u)
      rating_data.append(score)

In [19]:
# Number of (phone, user, rating) training samples collected.
len(phone_data)

5439

Shuffle the phone, user, and rating samples together, then convert them to tensors.

In [20]:
# Fixed seed so the shuffle -- and therefore the train/validation split -- is the
# same on every run. A private Random instance leaves the global RNG untouched.
SHUFFLE_SEED = 42

# Shuffle the three lists together so each sample keeps its own phone, user and rating.
combined_dataset = list(zip(phone_data, user_data, rating_data))
random.Random(SHUFFLE_SEED).shuffle(combined_dataset)
phone_dataset, user_dataset, rating_dataset = (list(column) for column in zip(*combined_dataset))

In [21]:
# Turn each shuffled Python list into a single tensor.
phone_dataset, user_dataset, rating_dataset = (
  tf.convert_to_tensor(samples)
  for samples in (phone_dataset, user_dataset, rating_dataset)
)

Split the data into training and validation sets: the first quarter is used for validation, the rest for training.

In [22]:
def _first_quarter(dataset):
  """Number of leading samples that make up one quarter of the dataset (rounded down)."""
  return len(dataset) // 4


# Training data is everything after the first quarter of the shuffled samples.
train_phone = phone_dataset[_first_quarter(phone_dataset):]
train_user = user_dataset[_first_quarter(user_dataset):]
train_rating = rating_dataset[_first_quarter(rating_dataset):]

In [23]:
def _first_quarter(dataset):
  """Number of leading samples that make up one quarter of the dataset (rounded down)."""
  return len(dataset) // 4


# Validation data is the first quarter of the shuffled samples.
val_phone = phone_dataset[:_first_quarter(phone_dataset)]
val_user = user_dataset[:_first_quarter(user_dataset)]
val_rating = rating_dataset[:_first_quarter(rating_dataset)]

Create a model with two inputs for user data and phone data, the output is the dot product of the two vectors from user_NN and phone_NN which is the predicted rating.

In [24]:
num_outputs = 128  # size of the embedding produced by each tower


def _build_tower(embedding_size):
  """Return a new five-layer ReLU MLP whose last layer has `embedding_size` units."""
  return tf.keras.models.Sequential([
      tf.keras.layers.Dense(2048, activation='relu'),
      tf.keras.layers.Dense(1024, activation='relu'),
      tf.keras.layers.Dense(512, activation='relu'),
      tf.keras.layers.Dense(256, activation='relu'),
      tf.keras.layers.Dense(embedding_size, activation='relu')
  ])


# Two towers with the same architecture but separate weights.
user_NN = _build_tower(num_outputs)
phone_NN = _build_tower(num_outputs)

# Input widths are read from the prepared feature matrices so they follow the data
# (the phone width depends on how many distinct chipsets the dataset contains).
# `shape` must be a tuple: `(17)` is just the integer 17, `(17,)` is a 1-tuple.
user_input = tf.keras.layers.Input(shape=(user.shape[1],))
user_vector = user_NN(user_input)
# Unit-length embeddings make the dot product below a cosine similarity.
normalized_user = tf.linalg.l2_normalize(user_vector, axis=1)

phone_input = tf.keras.layers.Input(shape=(phone.shape[1],))
phone_vector = phone_NN(phone_input)
normalized_phone = tf.linalg.l2_normalize(phone_vector, axis=1)

# Predicted rating = 5 * (user embedding . phone embedding).
output = tf.keras.layers.Dot(axes=1)([normalized_user,normalized_phone])
# NOTE(review): a Lambda layer wrapping a Python lambda may need safe_mode=False
# when the saved .keras file is loaded again -- confirm for the deployment target.
output = tf.keras.layers.Lambda(lambda x: x * 5)(output)

# Full rating model plus the two single-tower models, all sharing the same weights.
model = tf.keras.models.Model(inputs=[user_input,phone_input], outputs=output)
user_model = tf.keras.models.Model(inputs=user_input, outputs=normalized_user)
phone_model = tf.keras.models.Model(inputs=phone_input, outputs=normalized_phone)

In [25]:
# Layer-by-layer summary of the combined two-input rating model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 17)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 75)]                 0         []                            
                                                                                                  
 sequential (Sequential)     (None, 128)                  2824064   ['input_1[0][0]']             
                                                                                                  
 sequential_1 (Sequential)   (None, 128)                  2942848   ['input_2[0][0]']             
                                                                                              

In [26]:
# Summary of the user tower (user features -> normalized embedding).
user_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 17)]              0         
                                                                 
 sequential (Sequential)     (None, 128)               2824064   
                                                                 
 tf.math.l2_normalize (TFOp  (None, 128)               0         
 Lambda)                                                         
                                                                 
Total params: 2824064 (10.77 MB)
Trainable params: 2824064 (10.77 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
# Summary of the phone tower (phone features -> normalized embedding).
phone_model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 75)]              0         
                                                                 
 sequential_1 (Sequential)   (None, 128)               2942848   
                                                                 
 tf.math.l2_normalize_1 (TF  (None, 128)               0         
 OpLambda)                                                       
                                                                 
Total params: 2942848 (11.23 MB)
Trainable params: 2942848 (11.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
# Adam with a learning rate of 0.01, minimizing mean squared error on the ratings.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='mse')

Train the model.

In [29]:
# Inputs are passed in the same order as the model's inputs: user first, then phone.
model.fit(
    x=[train_user, train_phone],
    y=train_rating,
    epochs=15,
    validation_data=([val_user, val_phone], val_rating),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7f81c7d83160>

In [30]:
# Loss (MSE) on the held-out validation samples.
model.evaluate(x=[val_user, val_phone], y=val_rating)



0.297928124666214

In [31]:
# Save the combined rating model to 'model.keras'.
model.save('model.keras')

In [32]:
# Save the user tower on its own to 'user_model.keras'.
user_model.save('user_model.keras')

In [33]:
# Save the phone tower on its own to 'phone_model.keras'.
phone_model.save('phone_model.keras')

Predict rating.

In [34]:
# Wrap sample 5 in a list so each input becomes a batch of one.
sample_user = tf.convert_to_tensor([user_data[5]])
sample_phone = tf.convert_to_tensor([phone_data[5]])
model.predict([sample_user, sample_phone])



array([[4.388136]], dtype=float32)

In [35]:
# Actual rating for sample 5, for comparison with the prediction.
rating_data[5]

3.0

In [36]:
# Feature row of the user in sample 5.
user_data[5]

<tf.Tensor: shape=(17,), dtype=float32, numpy=
array([4., 5., 5., 4., 5., 5., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)>

Predict rating by dot product of user vector and phone vector.

In [37]:
# Embed sample 5 with each tower separately (batch of one each).
user_vector_pred = user_model.predict(tf.expand_dims(user_data[5], axis=0))
phone_vector_pred = phone_model.predict(tf.expand_dims(phone_data[5], axis=0))
# Dot product of the two embeddings, scaled by 5 like the full model's last layer.
embedding_similarity = tf.tensordot(user_vector_pred, phone_vector_pred, axes=(1,1))
embedding_similarity * 5



<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[4.388136]], dtype=float32)>

Preprocess data into input compatible with the model.

In [38]:
def preprocess_user(x):
  """Turn one raw user record into a batch of one for the user model.

  Args:
    x: Sequence whose last item is a brand name from `brand_choice_list` and
      whose preceding items are the numeric feature values.

  Returns:
    A float32 tensor with a leading batch axis of size 1: the numeric values
    followed by the one-hot encoded favorite brand.

  Raises:
    ValueError: If the brand name is not one of `brand_choice_list`.
  """
  favorite_brand = x[-1]
  try:
    brand_position = brand_choice_list.index(favorite_brand)
  except ValueError:
    # Same exception type as before, but with a message that names the valid options.
    raise ValueError(
        f'Unknown favorite brand {favorite_brand!r}; expected one of {brand_choice_list}'
    ) from None
  brand_one_hot = tf.one_hot(brand_position, len(brand_choice_list))
  numeric_features = tf.cast(tf.convert_to_tensor(x[:-1]), tf.float32)
  # Concatenate along the feature axis, then add the batch axis.
  return tf.expand_dims(tf.concat([numeric_features, brand_one_hot], 0), 0)

In [39]:
# Example raw user record, in the order preprocess_user expects.
a = [
  5,          # performance
  5,          # camera
  5,          # battery
  5,          # software
  5,          # RAM
  5,          # storage
  'Infinix',  # favorite brand
]
preprocessed_output = preprocess_user(a)

Create a user vector.

In [40]:
# Embed the example user with the user tower.
u_vector = user_model.predict(preprocessed_output)



In [41]:
# Check the shape of the user embedding.
u_vector.shape

(1, 128)

Create phone vectors.

In [42]:
# Embed every phone with the phone tower; rows follow the row order of `phone`.
p_vector = phone_model.predict(phone)



In [43]:
# Check the shape of the phone embeddings.
p_vector.shape

(96, 128)

Save phone vectors into csv file.

In [128]:
# Write one CSV row per phone embedding.
# newline='' stops the csv module's own line endings from being translated again,
# which otherwise produces blank rows between records on Windows.
with open('phone_vector.csv', 'w', newline='') as file:
  csvwriter = csv.writer(file)
  csvwriter.writerows(p_vector)

Predict phone ratings for a user.

In [44]:
# One predicted rating per phone: dot product of the user embedding with that
# phone's embedding, scaled by 5.
rating_pred_list = [
  tf.tensordot(u_vector, tf.expand_dims(phone_embedding, 0), axes=(1,1)) * 5
  for phone_embedding in p_vector
]

Get phone name.

In [45]:
# Reference to (not a copy of) the phone-name list; same row order as `phone` and p_vector.
phone_pred_list = phone_dict['phone_name']

Sort ratings to get top 10 recommended phones for a user.

In [46]:
# Despite its name, top_10_phone holds ALL phones, ordered from highest to lowest
# predicted rating; the top 10 are taken in a later cell.
top_10_phone = sorted(zip(rating_pred_list, phone_pred_list), reverse=True)
sorted_ratings, sorted_names = zip(*top_10_phone)
rating_pred_list = list(sorted_ratings)
phone_pred_list = list(sorted_names)

Print out top 10 recommended phones.

In [47]:
# Collect and print the best-rated phones. Slicing (rather than range(10)) keeps
# this working when fewer than 10 phones are available.
top_10_list = []
best_ranked = zip(phone_pred_list[:10], rating_pred_list[:10])
for rank, (name, score) in enumerate(best_ranked, start=1):
  top_10_list.append(name)
  print(rank, name, score.numpy())

1 Oppo Find N3 [[4.422674]]
2 Samsung Galaxy Z Fold5 [[4.4226365]]
3 Xiaomi 14 [[4.422406]]
4 Oppo Find X7 [[4.4222746]]
5 Huawei Pura 70 Ultra [[4.422265]]
6 iPhone 15 Pro Max [[4.4222627]]
7 iPhone 15 Pro [[4.422254]]
8 iPhone 14 Pro [[4.4221883]]
9 Vivo V30 Pro [[4.4221864]]
10 iPhone 14 Pro Max [[4.4221835]]


In [48]:
# Names of the recommended phones, best first.
top_10_list

['Oppo Find N3',
 'Samsung Galaxy Z Fold5',
 'Xiaomi 14',
 'Oppo Find X7',
 'Huawei Pura 70 Ultra',
 'iPhone 15 Pro Max',
 'iPhone 15 Pro',
 'iPhone 14 Pro',
 'Vivo V30 Pro',
 'iPhone 14 Pro Max']

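Calculate the squared Euclidean distance between the phone feature rows (the preprocessed `phone` tensor, not the learned embeddings) to find the 10 most similar phones for each phone.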

In [100]:
# For every phone, find the 10 other phones with the smallest squared distance
# between feature rows.
top_distance = []
top_distance_name = []
phone_names = phone_dict['phone_name']
for i, x in enumerate(phone):
  # Build (distance, name) pairs in one step so a distance can never be matched
  # with the wrong name. Only the phone itself is skipped, by index: a different
  # phone with identical features is a legitimate match at distance 0. Skipping by
  # feature equality would drop it and shift every later name by one position.
  combined_dist = [
    (tf.math.reduce_sum(tf.math.square(x - y)), phone_names[j])
    for j, y in enumerate(phone)
    if j != i
  ]
  combined_dist.sort()
  nearest = combined_dist[:10]
  top_distance.append([distance for distance, _ in nearest])
  top_distance_name.append([name for _, name in nearest])

Save related phones data into csv file.

In [126]:
# Write one row per phone: its name followed by its (up to) 10 most similar phones.
# newline='' stops the csv module's own line endings from being translated again,
# which otherwise produces blank rows between records on Windows.
with open('related_phone.csv', 'w', newline='') as file:
  csvwriter = csv.writer(file)
  csvwriter.writerow(['phone_name'] + [f'related{k}' for k in range(1, 11)])
  # Iterate over the data itself instead of a hard-coded phone count.
  for name, related_names in zip(phone_dict['phone_name'], top_distance_name):
    csvwriter.writerow([name] + list(related_names[:10]))