### Libraries Installation and Import

In [None]:
!pip install -q tensorflow-recommenders

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m61.4/96.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import ast
import json
from typing import Dict, Text, List, Optional, Any

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer

### Data Preparation

In [None]:
# Load the tourist-attraction records; later cells read 'name', 'lat' and 'lng' from each entry.
with open('/content/merged_ta_data.json') as ta_file:
    ta_data = json.load(ta_file)

In [None]:
# Raw user and hotel tables.
user_data = pd.read_csv('/content/user_with_preferences.csv')
hotel_data = pd.read_csv('/content/merged_hotels.csv')

# Standardize the hotel rating; the 'rating' column is overwritten with the
# scaled values, so every later use of 'rating' sees the standardized score.
scaler = StandardScaler()
scaler.fit(hotel_data[['rating']])
hotel_data[['rating']] = scaler.transform(hotel_data[['rating']])

In [None]:
mlb = MultiLabelBinarizer()

# The 'Preferences' column is read from CSV as text (presumably list literals
# such as "['a', 'b']" - confirm against the file). ast.literal_eval parses
# literals only, so a crafted cell value cannot execute code the way eval()
# would. Values that are no longer strings pass through untouched, which lets
# this cell be re-run without failing.
user_data['Preferences'] = user_data['Preferences'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# One 0/1 indicator column per distinct preference label.
preferences_encoded = mlb.fit_transform(user_data['Preferences'])
preferences_df = pd.DataFrame(
    preferences_encoded,
    columns=mlb.classes_,
    index=user_data.index,  # keep rows aligned with user_data during the concat
)

# Drop indicator columns left over from a previous run of this cell so the
# concat never produces duplicated column names.
user_data = pd.concat(
    [user_data.drop(columns=list(mlb.classes_), errors='ignore'), preferences_df],
    axis=1,
)

### Functions



In [None]:
def calculate_middle_point(tour_interests):
    """Return the arithmetic mean of the given coordinates.

    This is a plain average of the 'lat' and 'lng' values, not a geodesic
    midpoint.

    Args:
        tour_interests: Sequence of dicts with numeric 'lat' and 'lng' keys.
            ``None`` entries (e.g. a name lookup that found nothing) are
            skipped.

    Returns:
        A dict with keys 'user_lat' and 'user_lng'. Both values are ``None``
        when there is no usable point, so the result always has the same keys.
    """
    points = [point for point in (tour_interests or []) if point is not None]
    if not points:
        # Same keys as the normal case so callers can always read
        # 'user_lat' / 'user_lng'.
        return {'user_lat': None, 'user_lng': None}

    count = len(points)
    lat_sum = sum(point['lat'] for point in points)
    lng_sum = sum(point['lng'] for point in points)
    return {'user_lat': lat_sum / count, 'user_lng': lng_sum / count}

# name -> {'lat', 'lng'} lookup tables; when a name occurs more than once the
# last occurrence wins.
ta_dict = dict(
    (attraction['name'], {'lat': attraction['lat'], 'lng': attraction['lng']})
    for attraction in ta_data
)
hotel_dict = dict(
    (hotel_row['name'], {'lat': hotel_row['lat'], 'lng': hotel_row['lng']})
    for _, hotel_row in hotel_data.iterrows()
)

def dptin_kordinat(name: str, data_dict: Dict[str, Dict[str, float]]) -> Optional[Dict[str, float]]:
  """Look up the coordinate record stored under ``name``.

  Args:
    name: Key to look up.
    data_dict: Mapping from name to a coordinate dict.

  Returns:
    The stored coordinate dict, or ``None`` when ``name`` is not a key.
  """
  coordinates = data_dict.get(name, None)
  return coordinates

# Resolve the four selected attractions to coordinates
# (an entry is None when the name is not a key of ta_dict).
destinasi_1, destinasi_2, destinasi_3, destinasi_4 = [
    dptin_kordinat(attraction_name, ta_dict)
    for attraction_name in (
        "Air Panas Semurup",
        "Tebat Air Koto Majidin",
        "Air Terjun Pendung Mudik",
        "Kebun jeruk arumi&hanum",
    )
]

In [None]:
# Average the selected attraction coordinates into a single reference point.
tour_interests = [
    destinasi_1,
    destinasi_2,
    destinasi_3,
    destinasi_4,
]
middle_point = calculate_middle_point(tour_interests=tour_interests)
print(f"Middle Point: {middle_point}")

Middle Point: {'user_lat': -1.9904434000000002, 'user_lng': 101.37182865}


In [None]:
def haversine(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Great-circle distance between two points via the haversine formula.

    Args:
        lat1, lon1: Latitude / longitude of the first point, in degrees
            (converted with ``np.radians``).
        lat2, lon2: Latitude / longitude of the second point, in degrees.
            Scalars or NumPy-broadcastable arrays are accepted.
        radius_km: Radius of the sphere. Defaults to 6371, the value that was
            previously hard-coded (approximately Earth's mean radius in km).

    Returns:
        The distance in the same unit as ``radius_km``.
    """
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)

    a = np.sin(dlat / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius_km * c

# This must be re-run every time the reference lat/lng changes.
def distances(middle_point: Dict[str, float], data: pd.DataFrame) -> pd.DataFrame:
    """Add a 'distance' column holding the haversine distance to ``middle_point``.

    Args:
        middle_point: Dict with 'user_lat' and 'user_lng' (degrees).
        data: Frame with 'lat' and 'lng' columns. It is modified in place.

    Returns:
        The same ``data`` object, now carrying the 'distance' column.
    """
    # One vectorized call over whole columns instead of a Python-level call per
    # row; this also works on an empty frame, where a row-wise apply does not
    # yield a column of values.
    data['distance'] = haversine(
        middle_point['user_lat'],
        middle_point['user_lng'],
        data['lat'].to_numpy(),
        data['lng'].to_numpy(),
    )
    return data

def normalize_distances(data: pd.DataFrame) -> pd.DataFrame:
    """Standardize ``data['distance']`` into a 'distance_normalized' column.

    A fresh StandardScaler is fitted on the column on every call. ``data`` is
    modified in place and also returned.
    """
    distance_column = data['distance'].values.reshape(-1, 1)
    data['distance_normalized'] = StandardScaler().fit_transform(distance_column)
    return data

# Attach the raw and the standardized distance-to-midpoint columns to the hotel table.
hotel_data = normalize_distances(distances(middle_point, hotel_data))

# Model and Training

In [None]:
class RankingModel(tfrs.Model):
    """Pointwise ranking model: predicts one score per (hotel, preference profile).

    Expected feature dict (see ``call``):
        "hotel_name":          string tensor, shape (batch,)
        "distance_normalized": numeric tensor, shape (batch,)
        "rating":              numeric tensor, shape (batch,)
        "preferences":         multi-hot 0/1 tensor, shape (batch, n_preferences)

    The hotel vocabulary is read from the notebook-level ``hotel_data`` frame
    when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        embedding_dimension = 32

        # Hotel name -> integer id -> embedding vector.
        unique_hotel_names = hotel_data["name"].astype(str).unique()
        self.hotel_name_lookup = tf.keras.layers.StringLookup(vocabulary=unique_hotel_names, mask_token=None)

        self.hotel_embeddings = tf.keras.Sequential([
            self.hotel_name_lookup,
            # +1 for the out-of-vocabulary bucket added by StringLookup.
            tf.keras.layers.Embedding(len(unique_hotel_names) + 1, embedding_dimension)
        ])

        # NOTE(review): the original comment here said this part changes and
        # needs to be copied - presumably into whatever code rebuilds the model
        # to load the saved weights; keep such copies in sync.
        #
        # "preferences" is a multi-hot 0/1 vector, not a list of indices.
        # Passing it through an Embedding treats every 0/1 flag as a row index,
        # so only rows 0 and 1 are ever looked up and the summed result depends
        # solely on how many flags are set. A bias-free Dense layer is the
        # multi-hot equivalent of "sum one learned vector per active
        # preference". Weights saved from the Embedding-based version are not
        # loadable into this layer.
        self.user_preference_embeddings = tf.keras.layers.Dense(embedding_dimension, use_bias=False)

        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(1)
        ])

        self.task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )

    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        """Return predicted ranking scores with shape (batch, 1)."""
        hotel_embeddings = self.hotel_embeddings(features["hotel_name"])
        float_dtype = hotel_embeddings.dtype

        # Cast explicitly so tf.concat never sees mixed dtypes (the dataset
        # yields pandas/NumPy dtypes, the inference helpers pass float32).
        distance_normalized_expanded = tf.expand_dims(
            tf.cast(features["distance_normalized"], float_dtype), axis=1)
        rating_expanded = tf.expand_dims(tf.cast(features["rating"], float_dtype), axis=1)

        multi_hot_preferences = tf.cast(features["preferences"], float_dtype)
        user_preferences = self.user_preference_embeddings(multi_hot_preferences)

        concatenated = tf.concat(
            [hotel_embeddings, distance_normalized_expanded, rating_expanded, user_preferences],
            axis=1)
        return self.rating_model(concatenated)

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Compute the ranking-task loss for one ``(features, labels)`` batch."""
        features, labels = features
        predictions = self(features)
        return self.task(labels=labels, predictions=predictions)

In [None]:
# Cycle through the users' multi-hot preference rows until there is exactly
# one preference row per hotel row.
preference_matrix = user_data[mlb.classes_].values
tile_count = int(np.ceil(len(hotel_data) / len(user_data)))
repeated_preferences = np.tile(preference_matrix, (tile_count, 1))[:len(hotel_data)]

In [None]:
hotel_df = pd.DataFrame(hotel_data)

ratings = hotel_df['rating'].values.reshape(-1, 1)
# Deliberately NOT named `distances`: that name belongs to the distances()
# helper defined earlier, and overwriting it with an array makes re-running the
# distance cell fail with "'numpy.ndarray' object is not callable".
distance_values = hotel_df['distance_normalized'].values.reshape(-1, 1)
preferences = user_data[mlb.classes_].values

# (n_hotels, 1) columns broadcast against a (1, n_users) row give one score per
# hotel/user pair; flatten() then lays them out hotel by hotel.
# NOTE(review): the training label in `combined_data` is computed separately
# with different weights - confirm whether this array is still needed.
ranking_scores = ratings * 0.3 + -distance_values * 0.4 + preferences.sum(axis=1).reshape(1, -1) * 0.3
ranking_scores = ranking_scores.flatten()

In [None]:
# Number of active preferences in the profile paired with each hotel row.
preference_counts = repeated_preferences.sum(axis=1)

# example ranking score
example_ranking_score = (
    hotel_data['rating'] * 0.35 + hotel_data['distance_normalized'] * -0.45 + preference_counts * 0.2
)

# One training example per hotel row: model features plus the label above.
combined_data = pd.DataFrame(dict(
    hotel_name=hotel_data["name"],
    distance_normalized=hotel_data["distance_normalized"],
    rating=hotel_data["rating"],
    preferences=list(repeated_preferences),
    ranking_score=example_ranking_score,
))

# 80/20 train/validation split with a fixed seed.
train_data, val_data = train_test_split(combined_data, random_state=42, test_size=0.2)

def df_to_dataset(dataframe, batch_size=32):
    """Turn a frame into a batched ``tf.data.Dataset`` of ``(features, label)``.

    The 'ranking_score' column is used as the label; every remaining column is
    stacked into one array and becomes an entry of the feature dict. The input
    frame is copied first and therefore left untouched.
    """
    feature_frame = dataframe.copy()
    labels = feature_frame.pop('ranking_score')

    feature_arrays = {}
    for column_name, column in feature_frame.items():
        feature_arrays[column_name] = np.stack(column)

    return tf.data.Dataset.from_tensor_slices((feature_arrays, labels)).batch(batch_size)

# Batched datasets for training and validation (default batch size).
train_dataset, val_dataset = map(df_to_dataset, (train_data, val_data))

In [None]:
# Seed TensorFlow's global random generator so that weight initialisation is
# repeatable across kernel restarts, matching the fixed random_state=42 used
# for the train/validation split.
tf.random.set_seed(42)

model = RankingModel()
# run_eagerly=True runs the training step eagerly rather than as a compiled
# graph.
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1), run_eagerly=True)

In [None]:
# Fit for 20 epochs, evaluating on the validation dataset after each epoch.
model.fit(x=train_dataset, validation_data=val_dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20

KeyboardInterrupt: 

In [None]:
def rank_hotels(user_id, top_n):
    """Score every hotel for a stored user and return the ``top_n`` best.

    Args:
        user_id: Value matched against ``user_data['User_Id']``.
        top_n: Number of hotels to return.

    Returns:
        A DataFrame with columns name, formatted_address, distance, rating,
        predicted_ranking_score and photos, sorted by predicted score
        (highest first).

    Raises:
        ValueError: If no row of ``user_data`` has the given ``user_id``.

    Side effects:
        Overwrites ``hotel_data['predicted_ranking_score']`` on the
        notebook-level frame.
    """
    user_rows = user_data.loc[user_data['User_Id'] == user_id, mlb.classes_]
    if user_rows.empty:
        # Without this check an unknown id yields an empty preference vector
        # and the model is still asked to score every hotel.
        raise ValueError(f"No user with User_Id={user_id!r} in user_data")

    # Use the first matching row so a duplicated id cannot produce a vector of
    # the wrong width.
    user_preferences = user_rows.iloc[0].to_numpy(dtype=np.float32)
    # One copy of the user's multi-hot vector per hotel row.
    preference_matrix = np.tile(user_preferences, (len(hotel_data), 1))

    hotel_data_input = {
        "hotel_name": tf.constant(hotel_data["name"].astype(str).tolist()),
        "distance_normalized": tf.constant(hotel_data["distance_normalized"].tolist(), dtype=tf.float32),
        "rating": tf.constant(hotel_data["rating"].tolist(), dtype=tf.float32),
        "preferences": tf.constant(preference_matrix, dtype=tf.float32)
    }
    predictions = model(hotel_data_input)
    hotel_data["predicted_ranking_score"] = predictions.numpy().flatten()
    ranked_hotels = hotel_data.sort_values(by="predicted_ranking_score", ascending=False).head(top_n)
    return ranked_hotels[["name", "formatted_address", "distance", "rating", "predicted_ranking_score", "photos"]]


# Use this one to rank hotels for a hand-written preference vector of your own.
def another_rank_hotels(preference:List[int], top_n=5):
    """Score every hotel for an ad-hoc preference vector and return the best.

    Earlier revisions ignored ``preference`` and prompted with ``input()``
    instead, then failed with UnboundLocalError because a local variable named
    ``hotel_data`` shadowed the notebook-level frame. The argument is now used
    directly.

    Args:
        preference: Multi-hot 0/1 vector with one entry per class in
            ``mlb.classes_`` (same order).
        top_n: Number of hotels to return.

    Returns:
        A DataFrame with columns name, formatted_address, distance, rating,
        predicted_ranking_score and photos, sorted by predicted score
        (highest first).

    Raises:
        ValueError: If ``preference`` does not have ``len(mlb.classes_)`` entries.

    Side effects:
        Overwrites ``hotel_data['predicted_ranking_score']`` on the
        notebook-level frame, like ``rank_hotels`` does.
    """
    user_preferences = np.asarray(preference, dtype=np.float32).reshape(-1)
    expected_width = len(mlb.classes_)
    if user_preferences.shape[0] != expected_width:
        raise ValueError(
            f"preference must have {expected_width} entries, got {user_preferences.shape[0]}"
        )

    # One copy of the preference vector per hotel row.
    preference_matrix = np.tile(user_preferences, (len(hotel_data), 1))

    # Kept under a separate name: assigning to `hotel_data` here would make it
    # a local variable and hide the notebook-level DataFrame.
    hotel_data_input = {
        "hotel_name": tf.constant(hotel_data["name"].astype(str).tolist()),
        "distance_normalized": tf.constant(hotel_data["distance_normalized"].tolist(), dtype=tf.float32),
        "rating": tf.constant(hotel_data["rating"].tolist(), dtype=tf.float32),
        "preferences": tf.constant(preference_matrix, dtype=tf.float32)
    }
    predictions = model(hotel_data_input)
    hotel_data["predicted_ranking_score"] = predictions.numpy().flatten()
    ranked_hotels = hotel_data.sort_values(by="predicted_ranking_score", ascending=False).head(top_n)
    return ranked_hotels[["name", "formatted_address", "distance", "rating", "predicted_ranking_score", "photos"]]

In [None]:
# Rank hotels for one stored user around the current reference point.
user_id = 4
print(middle_point)

top_hotels = rank_hotels(user_id=user_id, top_n=15)
print(top_hotels)

{'user_lat': -1.9904434000000002, 'user_lng': 101.37182865}
                                                    name  \
24333                                        Rumah hitam   
26166                                       Villa Vasily   
24334                                  RUMAH SRY WAHYUNI   
24342  Homestay 007 Saribu Rumah Gadang/Penginapan So...   
26161  Homestay 007 Saribu Rumah Gadang/Penginapan So...   
24228                                      Rumah Barulih   
24335                                              Hotel   
24230                                   Penginapan Ni Ir   
24331                                        St Mangkuto   
24324                                Rumah Asri Mr Black   
24336                                        Mbah wasidi   
14332                Homestay permata sari(tempat ginap)   
24321                                 Rumah Mertua Isell   
24217                                  Penginapan Aliyah   
24278                             De Que

# Save and Export

In [None]:
# Persist the trained weights (HDF5 format, chosen by the .h5 suffix).
model.save_weights(filepath='model_weights_stdrecommdend.h5')

In [None]:
# Serialize before opening the file: if to_json() raises, no empty
# 'model_architecture.json' is left behind.
# NOTE(review): Keras may not be able to serialize a subclassed model such as
# RankingModel - confirm for the installed version.
architecture_json = model.to_json()
with open('model_architecture.json', 'w') as f:
    f.write(architecture_json)