In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
locations_df = pd.read_csv("C:/Users/Gharat/Downloads/recommendation-engine/offbeat-oasis/data/final/locations.csv")
trips_df = pd.read_csv("C:/Users/Gharat/Downloads/recommendation-engine/offbeat-oasis/data/final/trips.csv")
users_df = pd.read_csv("C:/Users/Gharat/Downloads/recommendation-engine/offbeat-oasis/data/final/users.csv")
reviews_df = pd.read_csv("C:/Users/Gharat/Downloads/recommendation-engine/offbeat-oasis/data/final/reviews.csv")

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

def prepare_location_features(locations_df):
    """
    Combine text features into one column and apply TF-IDF vectorization.
    """
    locations_df['combined_features'] = (
        locations_df['category'].fillna('') + ' ' +
        locations_df['state'].fillna('') + ' ' +
        locations_df['activities'].fillna('') + ' ' +
        locations_df['places'].fillna('')
    )

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(locations_df['combined_features'])

    return tfidf_matrix, tfidf

from sklearn.metrics.pairwise import cosine_similarity

def create_user_preference_vector(travel_category, preferred_state, tfidf):
    """
    Convert user preferences into the same TF-IDF space as the locations.
    """
    user_text = f"{travel_category} {preferred_state}"
    user_vector = tfidf.transform([user_text])
    return user_vector

def get_content_based_recommendations(user_vector, tfidf_matrix, locations_df, top_n=10):
    """
    Calculate cosine similarity and return top N location recommendations.
    """
    similarity_scores = cosine_similarity(user_vector, tfidf_matrix).flatten()
    locations_df['content_score'] = similarity_scores

    top_locations = locations_df.sort_values(by='content_score', ascending=False).head(top_n)
    return top_locations[['location_id', 'location_name', 'content_score']]


def create_user_location_matrix(reviews_df):
    """
    Pivot reviews into a user-location interaction matrix.
    """
    interaction_matrix = reviews_df.pivot_table(
        index='user_id',
        columns='location_id',
        values='rating'
    ).fillna(0)
    
    return interaction_matrix

from sklearn.metrics.pairwise import cosine_similarity

def get_top_k_similar_users(interaction_matrix, target_user_id, k=5):
    """
    Find top-K similar users to the given user_id.
    """
    if target_user_id not in interaction_matrix.index:
        return None  # Handle cold-start later

    user_vector = interaction_matrix.loc[[target_user_id]]
    similarity_matrix = cosine_similarity(user_vector, interaction_matrix)[0]
    
    similarity_series = pd.Series(similarity_matrix, index=interaction_matrix.index)
    similarity_series = similarity_series.drop(target_user_id).sort_values(ascending=False).head(k)

    return similarity_series

def predict_ratings_for_user(interaction_matrix, similar_users, target_user_id):
    """
    Predict ratings for all locations not rated by the target user using neighbors.
    """
    if similar_users is None:
        return pd.Series()  # Cold-start

    neighbors_matrix = interaction_matrix.loc[similar_users.index]
    user_ratings = interaction_matrix.loc[target_user_id]

    weighted_ratings = neighbors_matrix.T.dot(similar_users)
    normalization = similar_users.sum()

    prediction_scores = weighted_ratings / normalization
    unseen_locations = user_ratings[user_ratings == 0].index

    return prediction_scores[unseen_locations].sort_values(ascending=False)



In [6]:
# Data handling
import pandas as pd
import numpy as np

# Text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Similarity calculation
from sklearn.metrics.pairwise import cosine_similarity

# Optional: To suppress SettingWithCopyWarning
import warnings
warnings.filterwarnings('ignore')


In [7]:
def estimate_location_cost(reviews_df, trips_df):
    """
    Estimate the cost of each location by averaging the trip cost of users who visited it.
    """
    merged_df = reviews_df.merge(trips_df, on='user_id', how='left')
    location_costs = merged_df.groupby('location_id')['cost'].mean().reset_index()
    location_costs.columns = ['location_id', 'estimated_cost']
    return location_costs


In [8]:
def apply_budget_filter(location_df, location_costs, trip_budget):
    """
    Merge estimated costs with location recommendations and filter by budget.
    """
    merged = location_df.merge(location_costs, on='location_id', how='left')
    filtered = merged[merged['estimated_cost'] <= trip_budget]
    return filtered.sort_values(by='estimated_cost')


In [9]:
def normalize_scores(series):
    """
    Normalize a pandas Series between 0 and 1.
    """
    return (series - series.min()) / (series.max() - series.min() + 1e-9)


In [10]:
def combine_scores(content_df, collab_scores, weight_content=0.5, weight_collab=0.5):
    """
    Combine normalized content-based and collaborative filtering scores.
    """
    content_df = content_df.copy()
    content_df['normalized_content'] = normalize_scores(content_df['content_score'])

    collab_df = collab_scores.reset_index()
    collab_df.columns = ['location_id', 'collab_score']
    collab_df['normalized_collab'] = normalize_scores(collab_df['collab_score'])

    merged = content_df.merge(collab_df, on='location_id', how='left')
    merged['normalized_collab'] = merged['normalized_collab'].fillna(0)

    merged['hybrid_score'] = (
        weight_content * merged['normalized_content'] +
        weight_collab * merged['normalized_collab']
    )

    return merged.sort_values(by='hybrid_score', ascending=False)


In [11]:
# Inputs
travel_category = "Nature"
preferred_state = "Himachal Pradesh"
trip_budget = 40000
target_user_id = 101  # Assume this user exists in the dataset

# STEP 1: Content-Based Filtering
tfidf_matrix, tfidf = prepare_location_features(locations_df)
user_vector = create_user_preference_vector(travel_category, preferred_state, tfidf)
content_recs = get_content_based_recommendations(user_vector, tfidf_matrix, locations_df, top_n=50)

# STEP 2: Collaborative Filtering
interaction_matrix = create_user_location_matrix(reviews_df)
similar_users = get_top_k_similar_users(interaction_matrix, target_user_id, k=5)
collab_scores = predict_ratings_for_user(interaction_matrix, similar_users, target_user_id)

# STEP 3: Budget Filtering
location_costs = estimate_location_cost(reviews_df, trips_df)
filtered_content = apply_budget_filter(content_recs, location_costs, trip_budget)

# STEP 4: Combine Scores
final_recommendations = combine_scores(filtered_content, collab_scores, weight_content=0.6, weight_collab=0.4)

# Show Top N Results
print(final_recommendations[['location_name', 'hybrid_score', 'estimated_cost']].head(10))


                            location_name  hybrid_score  estimated_cost
32     Kanha National Park'S Buffer Zones      0.600000       17266.218
9                                  Orchha      0.570310       11500.000
21                                  Kalpa      0.432409       14500.000
16                         Tirthan Valley      0.400420       13405.845
24                                 Jawhar      0.363935       15150.500
18                                 Shojha      0.350902       14165.170
26                            Polo Forest      0.323216       15600.000
23  Athirappilly And Vazhachal Waterfalls      0.299470       15126.440
28                               Thenmala      0.248964       16050.750
30                           Araku Valley      0.211222       16200.000


In [12]:
from sklearn.model_selection import train_test_split

def train_test_split_reviews(reviews_df, test_size=0.2, random_state=42):
    return train_test_split(reviews_df, test_size=test_size, random_state=random_state)

def evaluate_rating_predictions(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return rmse, mae

def evaluate_rating_predictions(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return rmse, mae

def precision_recall_at_k(actual, predicted, k=10):
    actual_set = set(actual[:k])
    predicted_set = set(predicted[:k])
    intersection = actual_set & predicted_set

    precision = len(intersection) / float(k)
    recall = len(intersection) / float(len(actual)) if actual else 0.0
    return precision, recall

def dcg_at_k(relevance_scores, k):
    relevance_scores = np.asarray(relevance_scores)[:k]
    if relevance_scores.size:
        return np.sum(relevance_scores / np.log2(np.arange(2, relevance_scores.size + 2)))
    return 0.

def ndcg_at_k(actual, predicted, k=10):
    relevance = [1 if item in actual else 0 for item in predicted[:k]]
    ideal_relevance = sorted(relevance, reverse=True)
    dcg = dcg_at_k(relevance, k)
    idcg = dcg_at_k(ideal_relevance, k)
    return dcg / idcg if idcg > 0 else 0.

In [13]:
def evaluate_topk(ground_truth, predicted_top_k, k=10):
    precisions, recalls, ndcgs = [], [], []

    for user_id in ground_truth:
        actual = ground_truth[user_id]
        predicted = predicted_top_k.get(user_id, [])

        precision, recall = precision_recall_at_k(actual, predicted, k)
        ndcg = ndcg_at_k(actual, predicted, k)

        precisions.append(precision)
        recalls.append(recall)
        ndcgs.append(ndcg)

    return {
        "Precision@K": np.mean(precisions),
        "Recall@K": np.mean(recalls),
        "NDCG@K": np.mean(ndcgs)
    }


In [15]:
from sklearn.model_selection import train_test_split

train_reviews, test_reviews = train_test_split(reviews_df, test_size=0.2, random_state=42)

from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

def compute_rmse_mae(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return rmse, mae


In [20]:
def recommend_locations(travel_category, preferred_state, trip_budget, top_k=10, user_id=None): 
        """ Generate top-k hybrid travel location recommendations.
        Parameters:
        travel_category (str): e.g., 'Adventure', 'Nature'
        preferred_state (str): e.g., 'Himachal Pradesh'
        trip_budget (float): Maximum trip budget
        top_k (int): Number of locations to recommend
        user_id (int): ID of the target user (optional, used for collaborative filtering)
    
    Returns:
        pd.DataFrame: Top recommended locations with hybrid scores
    """
    
        # Step 1: Content-Based Filtering
        tfidf_matrix, tfidf = prepare_location_features(locations_df)
        user_vector = create_user_preference_vector(travel_category, preferred_state, tfidf)
        content_recs = get_content_based_recommendations(user_vector, tfidf_matrix, locations_df, top_n=50)
        
        # Step 2: Collaborative Filtering
        interaction_matrix = create_user_location_matrix(reviews_df)
        
        if user_id is not None and user_id in interaction_matrix.index:
            similar_users = get_top_k_similar_users(interaction_matrix, user_id, k=5)
            collab_scores = predict_ratings_for_user(interaction_matrix, similar_users, user_id)
        else:
            collab_scores = pd.DataFrame(columns=["location_id", "predicted_rating"])
        
        # Step 3: Budget Filtering
        location_costs = estimate_location_cost(reviews_df, trips_df)
        filtered_content = apply_budget_filter(content_recs, location_costs, trip_budget)
        
        # Step 4: Combine content + collaborative scores
        final_recommendations = combine_scores(filtered_content, collab_scores, weight_content=0.6, weight_collab=0.4)
        
        # Return sorted top-K recommendations
        return final_recommendations.sort_values("hybrid_score", ascending=False).head(top_k)

In [23]:
recommended_locations = recommend_locations(travel_category="Adventure", preferred_state="Maharashtra", trip_budget=15000, top_k=10, user_id=10) 

print(recommended_locations[["location_name", "hybrid_score", "estimated_cost"]])

                          location_name  hybrid_score  estimated_cost
2                            Meghamalai      0.600000       7987.6500
3                          Malshej Ghat      0.488734       9345.6700
22                               Amboli      0.460615      14769.3100
8        Pangong Tso West Bank Villages      0.400000      11423.0225
0                         Poovar Island      0.000000       5611.6900
4                            Mokokchung      0.000000       9345.6700
5                               Kachchh      0.000000       9861.9750
6   Coorg'S Offbeat Trails And Villages      0.000000       9994.2000
7                           Ziro Valley      0.000000      10500.5000
9          Chopta-Tungnath-Chandrashila      0.000000      11439.3300


In [24]:
from collections import defaultdict

test_user_ids = reviews_df["user_id"].unique()[:20] # Adjust the slice as needed

ground_truth = defaultdict(list) 
predicted_top_k = {}

for uid in test_user_ids: 
# STEP 1: Content-Based 
    user_vector = create_user_preference_vector(travel_category, preferred_state, tfidf) 
    content_recs = get_content_based_recommendations(user_vector, tfidf_matrix, locations_df, top_n=50)
    # STEP 2: Collaborative
    similar_users = get_top_k_similar_users(interaction_matrix, uid, k=5)
    collab_scores = predict_ratings_for_user(interaction_matrix, similar_users, uid)
    
    # STEP 3: Budget
    filtered_content = apply_budget_filter(content_recs, location_costs, trip_budget)
    
    # STEP 4: Combine
    final_recommendations = combine_scores(filtered_content, collab_scores, weight_content=0.6, weight_collab=0.4)
    
    top_recommended = final_recommendations.sort_values("hybrid_score", ascending=False).head(10)
    predicted_top_k[uid] = top_recommended["location_id"].tolist()
    
    # Get actual visited locations from reviews
    actual_locations = reviews_df[reviews_df["user_id"] == uid]["location_id"].unique().tolist()
    ground_truth[uid] = actual_locations

In [25]:
metrics = evaluate_topk(ground_truth, predicted_top_k, k=10)
print(metrics)

{'Precision@K': 0.045, 'Recall@K': 0.225, 'NDCG@K': 0.19652406521325605}
