# Content-based Filtering Model

In [1]:
import csv
import random
import tensorflow as tf

Create the list of brand choices used to one-hot encode the user's favorite brand.

In [2]:
# Container for the parsed user data; its keys are added when the user CSV is read.
user_dict = dict()

# Brands a user can name as their favourite. A brand's position in this list is
# the index that gets one-hot encoded. The final entry is Indonesian for
# "other / none".
brand_choice_list = [
    'Oppo',
    'Asus',
    'Infinix',
    'Samsung',
    'Vivo',
    'Huawei',
    'Apple',
    'Realme',
    'Xiaomi',
    'Poco',
    'lainnya/tidak ada',
]

Access user dataset from csv file.

In [3]:
# Parse the user CSV into two parallel lists stored in `user_dict`:
#   user_dict['user']     -> the six integer values from columns 0-5 of each row
#   user_dict['user_fav'] -> position of column 7 within `brand_choice_list`
# `user_index` remembers the CSV row number of every row that was kept, so the
# ratings can later be matched to the surviving users.
# newline='' is how the csv module documents opening files passed to csv.reader.
with open('user_dataset_revised.csv', 'r', newline='') as file:
  user_dict['user'] = []
  user_dict['user_fav'] = []
  user_index = []
  csvFile = csv.reader(file)
  for i, line in enumerate(csvFile):
    # Rows containing a field that is exactly '0' are dropped.
    # NOTE(review): presumably '0' marks an unanswered question - confirm against the dataset.
    if '0' in line:
      continue
    user_index.append(i)
    user_dict['user'].append([int(line[column]) for column in range(6)])
    # Raises ValueError if column 7 holds a brand missing from `brand_choice_list`.
    user_dict['user_fav'].append(brand_choice_list.index(line[7]))

Convert the numeric user features to a tensor, one-hot encode the favorite brand, and concatenate the two.

In [4]:
# The six integer values per user, converted to a float32 matrix.
user_feature1_6 = tf.cast(
    tf.convert_to_tensor(user_dict['user']),
    dtype=tf.float32,
)

# Favourite brand as a one-hot row, one slot per entry of `brand_choice_list`.
user_feature7 = tf.one_hot(
    user_dict['user_fav'],
    depth=len(brand_choice_list),
)

# Final user matrix: numeric values followed by the one-hot brand.
user = tf.concat([user_feature1_6, user_feature7], axis=1)

User features are performance, camera, battery, software, RAM, storage, and favorite brand.

In [5]:
user[0]

<tf.Tensor: shape=(17,), dtype=float32, numpy=
array([1., 1., 5., 1., 3., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
      dtype=float32)>

Create the brand, OS, chipset, and price-range lists used for one-hot encoding.

In [6]:
# Placeholder; the phone CSV cell rebinds this to the populated dictionary.
phone_dict = dict()

# Category vocabularies. The position of a value inside its list is the index
# that gets one-hot encoded for the phone features.
brand_list = [
    'Oppo', 'Asus', 'Infinix', 'Samsung', 'Vivo',
    'Huawei', 'Apple', 'Realme', 'Xiaomi', 'Poco',
]
os_list = ['iOS', 'EMUI', 'Android']
chipset_list = [
    'unisoc', 'helio',
    'dimensity_6', 'dimensity_7', 'dimensity_8', 'dimensity_9',
    'snapdragon_6', 'snapdragon_7', 'snapdragon_8',
    'exynos_1', 'exynos_2',
    'kirin', 'apple',
]
# Price range labels, ordered from cheapest to most expensive.
price_list = ['_2', '2_3', '3_4', '4_6', '6_8', '8_10', '10_']

Load the phone dataset from the CSV file.<br>
Preprocess the data: one-hot encode the categorical features and normalize some of the numeric features.<br>
Then concatenate all the phone features.

In [7]:
def _os_index(os_text):
  """Return the `os_list` position for a raw OS string.

  Strings starting with 'iOS' or 'EMUI' map to those entries; every other
  string is treated as 'Android'.
  """
  for family in ('iOS', 'EMUI'):
    if os_text.startswith(family):
      return os_list.index(family)
  return os_list.index('Android')


def _chipset_index(chipset_text):
  """Return the `chipset_list` position of the first family found in `chipset_text`.

  Matching is case-insensitive. A family such as 'dimensity_8' is searched for
  as the substring 'dimensity 8'. Families are tried in `chipset_list` order.

  Raises:
    ValueError: if no known chipset family occurs in `chipset_text`.
  """
  lowered = chipset_text.lower()
  for position, family in enumerate(chipset_list):
    if family.replace('_', ' ') in lowered:
      return position
  raise ValueError(
      f'Unrecognised chipset {chipset_text!r}; expected one of {chipset_list}')


# (exclusive upper bound, `price_list` label) pairs, cheapest range first.
_PRICE_BUCKETS = [
    (2000000, '_2'),
    (3000000, '2_3'),
    (4000000, '3_4'),
    (6000000, '4_6'),
    (8000000, '6_8'),
    (10000000, '8_10'),
]


def _price_index(price_text):
  """Return the `price_list` position of the range containing the price.

  `price_text` is parsed with int(). A price at or above the last bound falls
  into the open-ended '10_' range.
  """
  price = int(price_text)
  for upper_bound, label in _PRICE_BUCKETS:
    if price < upper_bound:
      return price_list.index(label)
  return price_list.index('10_')


# Read the phone CSV column by column into `phone_dict`. Categorical columns are
# stored as list positions (for one-hot encoding later), numeric columns as
# floats, and the two flag columns as 1.0 / 0.0.
# newline='' is how the csv module documents opening files passed to csv.reader.
with open('phone_dataset_cleaned.csv', 'r', newline='') as file:
  phone_dict = {'phone_brand': [], 'phone_name': [], 'phone_weight': [], 'phone_os': [], 'phone_chipset': [], 'phone_memory': [], 'phone_ram': [], 'phone_camera': [], 'phone_audio_jack': [], 'phone_battery': [], 'phone_charging': [], 'phone_nfc': [], 'phone_price': []}
  csvFile = csv.reader(file)
  next(csvFile)  # the first row is skipped (presumably a header row)
  for line in csvFile:
    phone_dict['phone_brand'].append(brand_list.index(line[1]))
    phone_dict['phone_name'].append(line[2])
    phone_dict['phone_weight'].append(float(line[6]))
    phone_dict['phone_os'].append(_os_index(line[7]))
    phone_dict['phone_chipset'].append(_chipset_index(line[8]))
    phone_dict['phone_memory'].append(float(line[9]))
    phone_dict['phone_ram'].append(float(line[10]))
    # Four camera values taken from columns 11-14.
    phone_dict['phone_camera'].append([float(line[column]) for column in range(11, 15)])
    phone_dict['phone_audio_jack'].append(1.0 if line[15] == 'TRUE' else 0.0)
    phone_dict['phone_battery'].append(float(line[16]))
    phone_dict['phone_charging'].append(float(line[17]))
    phone_dict['phone_nfc'].append(1.0 if line[19] == 'TRUE' else 0.0)
    phone_dict['phone_price'].append(_price_index(line[20]))

In [8]:
def _column(values):
  """Turn a flat list of numbers into a tensor with one value per row."""
  return tf.expand_dims(tf.convert_to_tensor(values), 1)


def _max_scaled_column(values):
  """Same as `_column`, with every value divided by the largest value in the list."""
  return _column(values) / max(values)


# Categorical features -> one-hot blocks.
phone_brand = tf.one_hot(phone_dict['phone_brand'], depth=len(brand_list))
phone_os = tf.one_hot(phone_dict['phone_os'], depth=len(os_list))
phone_chipset = tf.one_hot(phone_dict['phone_chipset'], depth=len(chipset_list))
phone_price = tf.one_hot(phone_dict['phone_price'], depth=len(price_list))

# Numeric features scaled by their maximum.
phone_weight = _max_scaled_column(phone_dict['phone_weight'])
phone_memory = _max_scaled_column(phone_dict['phone_memory'])
phone_ram = _max_scaled_column(phone_dict['phone_ram'])
phone_battery = _max_scaled_column(phone_dict['phone_battery'])

# Features passed through unscaled: camera values, charging, and the two flags.
phone_camera = tf.convert_to_tensor(phone_dict['phone_camera'])
phone_charging = _column(phone_dict['phone_charging'])
phone_audio_jack = _column(phone_dict['phone_audio_jack'])
phone_nfc = _column(phone_dict['phone_nfc'])

In [9]:
# The order of this list fixes the column layout of the phone feature matrix.
phone_feature_blocks = [
    phone_brand,
    phone_weight,
    phone_os,
    phone_chipset,
    phone_memory,
    phone_ram,
    phone_camera,
    phone_audio_jack,
    phone_battery,
    phone_charging,
    phone_nfc,
    phone_price,
]
phone = tf.concat(phone_feature_blocks, axis=1)

Phone features are brand, weight, os, chipset, memory, RAM, camera 1, camera 2, camera 3, selfie camera, audio jack, battery, charging, nfc, and price.

In [10]:
phone[0]

<tf.Tensor: shape=(44,), dtype=float32, numpy=
array([ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.33643124,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.5       ,  0.5       , 50.        ,
       32.        ,  8.        , 32.        ,  1.        ,  0.575     ,
       80.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ], dtype=float32)>

Create a rating dictionary and a rating list.

In [11]:
# Empty dictionary; no other visible cell reads or writes it.
rating_dict = {}

In [12]:
# Zero-filled rating matrix with 96 rows of 1507 columns. Each row is built
# separately so that writing to one row never affects another.
rating = [[0.0] * 1507 for _ in range(96)]

In [13]:
rating[0][0]

0.0

Access rating dataset from csv file and only save the ratings from the users with information.

In [14]:
# Copy ratings from the CSV into `rating`, keeping only the columns that belong
# to users retained in `user_index`. CSV row i fills rating[i]; CSV column j is
# stored at the position that j occupies inside `user_index`.

# Column number -> position in `user_index`. A dict lookup replaces the two
# linear list scans (`in` and `.index`) that would otherwise run for every cell.
column_to_user = {column: position for position, column in enumerate(user_index)}

# newline='' is how the csv module documents opening files passed to csv.reader.
with open('rating_dataset_cleaned.csv', 'r', newline='') as file:
  csvFile = csv.reader(file)
  for i, line in enumerate(csvFile):
    for j, r in enumerate(line):
      position = column_to_user.get(j)
      if position is not None:
        rating[i][position] = float(r)

In [15]:
print(rating[0][0])
print(rating[4][1])
print(rating[9][0])

0.0
0.0
0.0


In [16]:
# Parallel lists: entry k of each list belongs to the same training sample.
phone_data, user_data, rating_data = [], [], []

Build the training samples: for every (phone, user) pair with a rating greater than 0, keep the phone features, the user features, and the rating.

In [17]:
# Collect one training sample for every (phone, user) pair with a rating above 0.
# Looping over indices avoids slicing a tensor row for every pair (most pairs
# are unrated and would be discarded immediately).
num_phones = int(phone.shape[0])
num_users = int(user.shape[0])
for i in range(num_phones):
  phone_row = phone[i]  # sliced once per phone, shared by all of its samples
  for j in range(num_users):
    score = rating[i][j]
    if score > 0:
      phone_data.append(phone_row)
      user_data.append(user[j])
      rating_data.append(score)

In [18]:
len(phone_data)

5439

Shuffle the phone, user, and rating data together so the samples stay aligned, then convert them to tensors.

In [19]:
# Shuffle the three parallel lists together so each sample keeps its own
# phone features, user features and rating.
# A dedicated, seeded generator makes the train/validation split reproducible
# across kernel restarts without touching the global `random` state.
SHUFFLE_SEED = 42
combined_dataset = list(zip(phone_data, user_data, rating_data))
random.Random(SHUFFLE_SEED).shuffle(combined_dataset)

# Un-zip back into three lists, now in shuffled order.
phone_dataset, user_dataset, rating_dataset = zip(*combined_dataset)
phone_dataset, user_dataset, rating_dataset = list(phone_dataset), list(user_dataset), list(rating_dataset)

In [20]:
# Stack each shuffled list of samples into a single tensor.
phone_dataset, user_dataset, rating_dataset = [
    tf.convert_to_tensor(samples)
    for samples in (phone_dataset, user_dataset, rating_dataset)
]

Split the data into training and validation sets; the first quarter of the shuffled samples is held out for validation.

In [21]:
# Training set: everything after the first quarter of the shuffled samples.
# The three datasets come from one zipped list, so they share a single length.
train_start = len(phone_dataset) // 4
train_phone = phone_dataset[train_start:]
train_user = user_dataset[train_start:]
train_rating = rating_dataset[train_start:]

In [22]:
# Validation set: the first quarter of the shuffled samples.
# The three datasets come from one zipped list, so they share a single length.
val_size = len(phone_dataset) // 4
val_phone = phone_dataset[:val_size]
val_user = user_dataset[:val_size]
val_rating = rating_dataset[:val_size]

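Create a model with two inputs, one for user data and one for phone data. `user_NN` and `phone_NN` each map their input to a 32-dimensional vector, and both vectors are L2-normalized. The output, which is the predicted rating, is the dot product of the two normalized vectors multiplied by 5.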

In [23]:
# Two-tower model: one dense network embeds the user features, another embeds
# the phone features. Both embeddings are L2-normalised, so their dot product
# is a cosine similarity in [-1, 1]; multiplying by 5 turns it into the
# predicted rating.
num_outputs = 32  # size of the embedding produced by each tower

user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs)
])

phone_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs)
])

# `shape` has to be a tuple. `(17)` without the trailing comma is just the
# integer 17, which only works where Keras happens to accept a bare int.
user_input = tf.keras.layers.Input(shape=(17,))
user_vector = user_NN(user_input)
normalized_user = tf.linalg.l2_normalize(user_vector, axis=1)

phone_input = tf.keras.layers.Input(shape=(44,))
phone_vector = phone_NN(phone_input)
normalized_phone = tf.linalg.l2_normalize(phone_vector, axis=1)

output = tf.keras.layers.Dot(axes=1)([normalized_user,normalized_phone])
# NOTE(review): a Lambda layer wrapping a Python lambda can complicate reloading
# the saved model in some Keras versions - confirm the saved files load as expected.
output = tf.keras.layers.Lambda(lambda x: x * 5)(output)

# `model` is trained end to end. `user_model` and `phone_model` share its
# weights and expose the normalised embeddings of each tower on their own.
model = tf.keras.models.Model(inputs=[user_input,phone_input], outputs=output)
user_model = tf.keras.models.Model(inputs=user_input, outputs=normalized_user)
phone_model = tf.keras.models.Model(inputs=phone_input, outputs=normalized_phone)

In [24]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 17)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 44)]                 0         []                            
                                                                                                  
 sequential (Sequential)     (None, 32)                   41632     ['input_1[0][0]']             
                                                                                                  
 sequential_1 (Sequential)   (None, 32)                   48544     ['input_2[0][0]']             
                                                                                              

In [25]:
user_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 17)]              0         
                                                                 
 sequential (Sequential)     (None, 32)                41632     
                                                                 
 tf.math.l2_normalize (TFOp  (None, 32)                0         
 Lambda)                                                         
                                                                 
Total params: 41632 (162.62 KB)
Trainable params: 41632 (162.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [26]:
phone_model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 44)]              0         
                                                                 
 sequential_1 (Sequential)   (None, 32)                48544     
                                                                 
 tf.math.l2_normalize_1 (TF  (None, 32)                0         
 OpLambda)                                                       
                                                                 
Total params: 48544 (189.62 KB)
Trainable params: 48544 (189.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
# Mean squared error between predicted and actual ratings, optimised with Adam.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='mse')

Train the model.

In [53]:
# Inputs are passed in the same order as the model's inputs: user first, then phone.
model.fit(
    x=[train_user, train_phone],
    y=train_rating,
    epochs=30,
    validation_data=([val_user, val_phone], val_rating),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x795fec527370>

In [54]:
model.evaluate([val_user, val_phone], val_rating)



0.2931220531463623

In [55]:
# Write the full rating model (both towers plus the scaled dot product) to disk.
model.save('model.keras')

In [56]:
# Write the user tower, whose output is the normalised user vector, to disk.
user_model.save('user_model.keras')

In [57]:
# Write the phone tower, whose output is the normalised phone vector, to disk.
phone_model.save('phone_model.keras')

Predict rating.

In [58]:
model.predict([tf.convert_to_tensor([user_data[5]]), tf.convert_to_tensor([phone_data[5]])])



array([[4.2162366]], dtype=float32)

In [59]:
rating_data[5]

3.0

In [60]:
user_data[5]

<tf.Tensor: shape=(17,), dtype=float32, numpy=
array([4., 5., 5., 4., 5., 5., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)>

Predict rating by dot product of user vector and phone vector.

In [61]:
# Add a leading batch axis so each single sample can be fed to its tower.
sample_user = tf.expand_dims(user_data[5], axis=0)
sample_phone = tf.expand_dims(phone_data[5], axis=0)

user_vector_pred = user_model.predict(sample_user)
phone_vector_pred = phone_model.predict(sample_phone)

# Dot product of the two embeddings, scaled by 5 as in the full model.
tf.tensordot(user_vector_pred, phone_vector_pred, axes=(1,1)) * 5



<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[4.2162366]], dtype=float32)>

Preprocess data into input compatible with the model.

In [62]:
def preprocess_user(x):
  """Turn raw user answers into a single-row input tensor for the user model.

  Args:
    x: sequence whose last item is a favourite-brand string from
      `brand_choice_list` and whose preceding items are numeric values.

  Returns:
    A float32 tensor with one row: the numeric values followed by the one-hot
    encoding of the favourite brand.

  Raises:
    ValueError: if the favourite brand is not in `brand_choice_list`.
  """
  favorite_brand = x[-1]
  if favorite_brand not in brand_choice_list:
    raise ValueError(
        f'Unknown favorite brand {favorite_brand!r}; expected one of {brand_choice_list}')
  brand_one_hot = tf.one_hot(brand_choice_list.index(favorite_brand), len(brand_choice_list))
  scores = tf.cast(tf.convert_to_tensor(x[:-1]), tf.float32)
  # Join the two 1-D pieces, then add the leading batch axis.
  return tf.expand_dims(tf.concat([scores, brand_one_hot], 0), 0)

In [63]:
# Example user: top score on every criterion, with Apple as the favourite brand.
a = [
    5,  # performance
    5,  # camera
    5,  # battery
    5,  # software
    5,  # RAM
    5,  # storage
    'Apple',  # favorite brand
]
preprocessed_output = preprocess_user(a)

Create a user vector.

In [64]:
# Normalised embedding of the example user, as produced by the user tower.
u_vector = user_model.predict(preprocessed_output)



In [65]:
u_vector.shape

(1, 32)

Create phone vectors.

In [66]:
# Normalised embeddings of every phone in the dataset, one row per phone.
p_vector = phone_model.predict(phone)



In [67]:
p_vector.shape

(96, 32)

Save phone vectors into csv file.

In [68]:
# Write one CSV row per phone embedding.
# newline='' stops the csv writer's own line endings from being translated
# again by the file object (which yields blank lines between rows on Windows).
with open('phone_vector.csv', 'w', newline='') as file:
  csvwriter = csv.writer(file)
  csvwriter.writerows(p_vector)

Predict phone ratings for a user.

In [69]:
# Predicted rating of every phone for the example user: dot product of the user
# embedding with each phone embedding, scaled by 5 as in the model.
rating_pred_list = [
    tf.tensordot(u_vector, tf.expand_dims(phone_embedding, 0), axes=(1,1)) * 5
    for phone_embedding in p_vector
]

Get phone name.

In [70]:
# Refers to the same list object as phone_dict['phone_name'] (no copy is made).
phone_pred_list = phone_dict['phone_name']

Sort ratings to get top 10 recommended phones for a user.

In [71]:
# Pair each predicted rating with its phone name and order the pairs from the
# highest rating down. Despite its name, `top_10_phone` holds every phone.
top_10_phone = sorted(zip(rating_pred_list, phone_pred_list), reverse=True)

# Split the ranked pairs back into two parallel lists.
rating_pred_list, phone_pred_list = map(list, zip(*top_10_phone))

Print out top 10 recommended phones.

In [72]:
# Walk the first ten ranked phones, collecting their names and printing
# rank, name and predicted rating.
top_10_list = []
for rank in range(1, 11):
  recommended_name = phone_pred_list[rank - 1]
  top_10_list.append(recommended_name)
  print(rank, recommended_name, rating_pred_list[rank - 1].numpy())

1 iPhone 15 Pro [[4.6715217]]
2 iPhone 14 Pro [[4.6715097]]
3 iPhone 15 Pro Max [[4.6714597]]
4 iPhone 14 Pro Max [[4.6714478]]
5 iPhone 13 [[4.671291]]
6 iPhone 14 Plus [[4.66598]]
7 Samsung Galaxy Z Fold5 [[4.6275544]]
8 Samsung Galaxy S24 [[4.627504]]
9 Samsung Galaxy S24 Ultra [[4.6262345]]
10 Samsung Galaxy S24+ [[4.6193595]]


In [73]:
top_10_list

['iPhone 15 Pro',
 'iPhone 14 Pro',
 'iPhone 15 Pro Max',
 'iPhone 14 Pro Max',
 'iPhone 13',
 'iPhone 14 Plus',
 'Samsung Galaxy Z Fold5',
 'Samsung Galaxy S24',
 'Samsung Galaxy S24 Ultra',
 'Samsung Galaxy S24+']

Calculate the squared Euclidean distance between phone vectors to find the phones most closely related to each phone.

In [74]:
# For every phone, find the ten other phones whose embeddings are closest to it
# (smallest squared Euclidean distance).
#   top_distance[i]      -> the ten smallest distances for phone i
#   top_distance_name[i] -> the matching phone names, closest first
top_distance = []
top_distance_name = []
all_phone_names = phone_dict['phone_name']
for i, x in enumerate(p_vector):
  # A phone is excluded from its own list by index, not by comparing vectors.
  # Comparing vectors would also drop any other phone with an identical
  # embedding and leave the distances out of step with the names.
  neighbours = [
      (tf.math.reduce_sum(tf.math.square(x - y)), all_phone_names[j])
      for j, y in enumerate(p_vector)
      if j != i
  ]
  neighbours.sort()  # ascending distance; equal distances fall back to the name
  nearest = neighbours[:10]
  top_distance.append([distance for distance, _ in nearest])
  top_distance_name.append([name for _, name in nearest])

In [75]:
top_distance[0]

[<tf.Tensor: shape=(), dtype=float32, numpy=0.00025809451>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0012107817>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.001780206>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0019191431>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.004211559>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.004305103>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.006314103>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0064044045>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.008890038>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.009145189>]

In [76]:
top_distance_name[0]

['Oppo Reno 11',
 'Oppo Reno 10',
 'Oppo Reno 10 Pro',
 'Vivo V30',
 'Vivo V30e',
 'Huawei Nova 11 Pro',
 'Vivo V29e',
 'Oppo Reno 11F',
 'Realme 12+ 5G',
 'Huawei Nova 11']

Save related phones data into csv file.

In [77]:
# Write one row per phone: its name followed by its ten related phones.
# newline='' stops the csv writer's own line endings from being translated
# again by the file object (which yields blank lines between rows on Windows).
RELATED_COUNT = 10
with open('related_phone.csv', 'w', newline='') as file:
  csvwriter = csv.writer(file)
  csvwriter.writerow(['phone_name'] + [f'related{k}' for k in range(1, RELATED_COUNT + 1)])
  # Iterating the data itself avoids hard-coding the number of phones.
  for name, related in zip(phone_dict['phone_name'], top_distance_name):
    csvwriter.writerow([name, *related[:RELATED_COUNT]])