# Recommendation System using Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report, recall_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from google.colab import files

# Ask for the Kaggle API token through Colab's browser upload dialog.
print("Please upload your kaggle.json file.")
uploaded = files.upload()

# Echo each received file with its size so the upload can be confirmed.
for fn, payload in uploaded.items():
  message = 'User uploaded file "{name}" with length {length} bytes'
  print(message.format(name=fn, length=len(payload)))

Please upload your kaggle.json file.


Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 71 bytes


In [None]:
! cp ./kaggle.json ~/.kaggle/
! kaggle datasets download -d aprabowo/indonesia-tourism-destination
! unzip indonesia-tourism-destination.zip

Downloading indonesia-tourism-destination.zip to /content
100% 158k/158k [00:00<00:00, 366kB/s]
100% 158k/158k [00:00<00:00, 366kB/s]
Archive:  indonesia-tourism-destination.zip
  inflating: package_tourism.csv     
  inflating: tourism_rating.csv      
  inflating: tourism_with_id.csv     
  inflating: user.csv                


In [None]:
# Read the extracted CSVs: one frame of user ratings, one of place metadata.
rating_df = pd.read_csv("./tourism_rating.csv")
tourism_df = pd.read_csv("./tourism_with_id.csv")

In [None]:
# Preview the first five rows of the place metadata.
tourism_df.head(n=5)

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Lat,Long,Unnamed: 11,Unnamed: 12
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,15.0,"{'lat': -6.1753924, 'lng': 106.8271528}",-6.175392,106.827153,,1
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta,0,4.6,90.0,"{'lat': -6.137644799999999, 'lng': 106.8171245}",-6.137645,106.817125,,2
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Taman Hiburan,Jakarta,270000,4.6,360.0,"{'lat': -6.125312399999999, 'lng': 106.8335377}",-6.125312,106.833538,,3
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Taman Hiburan,Jakarta,10000,4.5,,"{'lat': -6.302445899999999, 'lng': 106.8951559}",-6.302446,106.895156,,4
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Taman Hiburan,Jakarta,94000,4.5,60.0,"{'lat': -6.12419, 'lng': 106.839134}",-6.12419,106.839134,,5


In [None]:
# Preview the first five rows of the user ratings.
rating_df.head(n=5)

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,179,3
1,1,344,2
2,1,5,5
3,1,373,3
4,1,101,4


Developing the Naive Bayes Algorithm

In [None]:
# Train a Categorical Naive Bayes rating predictor on (User_Id, Place_Id) pairs
# and evaluate it on a held-out split.

# Use the ratings table as the modelling frame.
# NOTE: `data` is the same object as `rating_df` (no copy is made), so the
# column changes below are also visible through `rating_df`.
data = rating_df

# Convert user IDs and place IDs to strings
data['User_Id'] = data['User_Id'].astype(str)
data['Place_Id'] = data['Place_Id'].astype(str)

# Expose the ratings under the column name `rating`; they serve as class labels
data['rating'] = data['Place_Ratings']

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)

# Train the Categorical Naive Bayes model
nb = CategoricalNB(alpha = 1)
nb.fit(train_data[["User_Id", "Place_Id"]], train_data['rating'])

# Make predictions on the test set
predictions = nb.predict(test_data[["User_Id", "Place_Id"]])

# Compute and print the mean absolute error of the predictions
mae = mean_absolute_error(test_data['rating'], predictions)
print(f'MAE: {mae}')

# Compute and print the root mean squared error of the predictions
RMSE = np.sqrt(mean_squared_error(test_data['rating'], predictions))
print(f'RMSE: {RMSE}')

# Classification metrics, averaged over the rating classes weighted by support
# (the same aggregation as the "weighted avg" row of the report below).
# These names were previously printed without being defined in this cell.
# `zero_division=1` matches the setting passed to classification_report.
recall = recall_score(test_data['rating'], predictions, average='weighted', zero_division=1)
precision = precision_score(test_data['rating'], predictions, average='weighted', zero_division=1)
my_f1_score = f1_score(test_data['rating'], predictions, average='weighted', zero_division=1)

print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1-Score: {my_f1_score}")
print("Classification Report:")
print(classification_report(test_data['rating'], predictions, zero_division=1))

MAE: 1.465
RMSE: 1.8726985876002578
Recall: 0.234
Precision: 0.23513186028093994
F1-Score: 0.23386515102459499
Classification Report:
              precision    recall  f1-score   support

           1       0.23      0.24      0.24       156
           2       0.23      0.25      0.24       208
           3       0.25      0.22      0.23       223
           4       0.24      0.21      0.23       224
           5       0.22      0.25      0.23       189

    accuracy                           0.23      1000
   macro avg       0.23      0.24      0.23      1000
weighted avg       0.24      0.23      0.23      1000



In [None]:
import warnings

# Suppress all warnings for the rest of the session.
warnings.simplefilter("ignore")

# The valid scorer names can be listed with sklearn.metrics.get_scorer_names().
scoring = ['neg_mean_absolute_error', 'neg_root_mean_squared_error']

# 10-fold cross-validation of the fitted estimator's configuration on the
# training split, scored with negated MAE and negated RMSE.
scores = cross_validate(
    estimator=nb,
    X=train_data[["User_Id", "Place_Id"]],
    y=train_data['rating'],
    cv=10,
    scoring=scoring,
)

scores

{'fit_time': array([0.00973129, 0.0090301 , 0.00714731, 0.00703645, 0.00710511,
        0.00820422, 0.01199675, 0.00928116, 0.00701785, 0.00696015]),
 'score_time': array([0.00229478, 0.00238538, 0.00224352, 0.00240636, 0.00230598,
        0.00253582, 0.00377512, 0.00242114, 0.00223804, 0.00221539]),
 'test_neg_mean_absolute_error': array([-1.54777778, -1.62222222, -1.50777778, -1.54555556, -1.55444444,
        -1.53777778, -1.56111111, -1.53777778, -1.48555556, -1.49777778]),
 'test_neg_root_mean_squared_error': array([-1.92151561, -2.00277585, -1.91688405, -1.93649167, -1.93591781,
        -1.93103311, -1.96440548, -1.9350567 , -1.89765938, -1.91601438])}

In [None]:
# Keys of the per-fold score arrays, in the order they are reported.
cv_metric_keys = ('test_neg_mean_absolute_error', 'test_neg_root_mean_squared_error')

# Mean error across folds; the scorers are negated, so flip the sign back.
for metric_key in cv_metric_keys:
    print(-(scores[metric_key].mean()))

# Spread of each error across folds.
for metric_key in cv_metric_keys:
    print(scores[metric_key].std())

1.5397777777777777
1.935775404453059
0.03647018061477175
0.027813369349929035
