In [0]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

import sklearn

from scipy.sparse import csr_matrix, hstack
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
import numpy as np


  import pandas.util.testing as tm


Collecting mixed-naive-bayes
  Downloading https://files.pythonhosted.org/packages/7a/1f/4af788fa4df56a0aa38cbe949f3c3021ece5200a2d777adb4eddf662468d/mixed_naive_bayes-0.0.1-py3-none-any.whl
Installing collected packages: mixed-naive-bayes
Successfully installed mixed-naive-bayes-0.0.1


In [0]:
# Load the 1% sample of the data set.
# NOTE(review): `base_dir` is not defined in the visible cells - it must be set
# earlier in the notebook for this cell to run on a fresh kernel.
subset_dir = base_dir + "/1percent/"
test = pd.read_csv(subset_dir + "1_test_set.csv")
train = pd.read_csv(subset_dir + "1_train_set.csv")

In [0]:
# Drop the latent destination features, i.e. every column from position 28
# onward; the Naive Bayes models are not trained on them.
train_latent_cols = train.columns[28:]
test_latent_cols = test.columns[28:]
train = train.drop(train_latent_cols, axis="columns")
test = test.drop(test_latent_cols, axis="columns")

In [0]:
# Separate the target column from the feature columns.
# Note: this cell is not re-runnable as-is - once "hotel_cluster" has been
# dropped, a second run fails on the lookups below.
y_train = train["hotel_cluster"]
y_test = test["hotel_cluster"]
train = train.drop("hotel_cluster", axis="columns")
test = test.drop("hotel_cluster", axis="columns")

### Train two different models and combine the probabilities
#### - Multinomial NB for the categorical features
#### - Gaussian NB for the numerical features

In [1]:
# Split the feature columns into two groups: categorical ones (one-hot encoded
# and fed to the Multinomial NB) and numerical ones (fed to the Gaussian NB).
cat_features = [
    "site_name",
    "posa_continent",
    "user_location_country",
    "user_location_region",
    "user_location_city",
    "user_id",
    "is_mobile",
    "is_package",
    "channel",
    "is_booking",
    "hotel_continent",
    "hotel_country",
    "hotel_market",
    "booking_month",
    "booking_day",
    "ci_month",
    "co_month",
    "ci_day",
    "co_day",
    "is_weekend",
]
num_features = [
    "orig_destination_distance",
    "srch_adults_cnt",
    "srch_children_cnt",
    "srch_rm_cnt",
    "cnt",
    "duration",
    "remaining_days",
]

# Sanity check: the two group sizes should add up to the number of columns in `train`.
n_cat = len(cat_features)
n_num = len(num_features)
print("NUM CATEGORICAL FEATURES => ", n_cat)
print("NUM NUMERICAL FEATURES => ", n_num)
print("TOTAL => ", len(train.columns), (n_cat + n_num))

#### TRAINING A MULTINOMIAL NB FOR CATEGORICAL DATA

In [0]:
# create list of categories for the encoding to have the same number of features for each set
categories = []
for i in cat_features:
    categories.append(np.unique(train[i]))

In [0]:
# datasets to be one hot encoded
X_train_cat = train[cat_features]
X_test_cat = test[cat_features]

In [0]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical features.
# The encoder is fitted on the training set only; the test set is merely
# transformed with the already-fitted encoder (it must never be re-fitted on
# test data). With handle_unknown='ignore', a test value that is not in
# `categories` is encoded as an all-zero group instead of raising.
# The inputs are read straight from `train` / `test`, so the cell can be
# re-run without feeding the already-encoded sparse matrices back in.
enc = OneHotEncoder(categories=categories, handle_unknown='ignore')
X_train_cat = enc.fit_transform(train[cat_features])
X_test_cat = enc.transform(test[cat_features])
print(X_train_cat.shape)
print(X_test_cat.shape)

In [0]:
# multinomial NB 1% on categorical variables (y_train and y_test are the same for numerical and categorical)
mnb1 = MultinomialNB()
mnb1.fit(X_train_cat, y_train) # training the model
pred1_mnb = mnb1.predict_proba(X_test_cat) # predicting probabilities

#### TRAINING A GAUSSIAN NB MODEL FOR NUMERICAL FEATURES

In [0]:
# gaussian NB for numerical features on 1% of data
X_train_num = train[num_features]
X_test_num = test[num_features]
gnb1 = GaussianNB()
gnb1.fit(X_train_num, y_train) # train
pred1_gnb = gnb1.predict_proba(X_test_num) # predicting probabilities

### COMBINE BOTH PREDICTORS

In [0]:
# Combine the two Naive Bayes models into one score per (sample, class).
# Each predict_proba output is a posterior that already contains the class
# prior P(y). Under the Naive Bayes independence assumption
#     P(y | cat, num)  is proportional to  P(y | cat) * P(y | num) / P(y),
# so a plain product would count the prior twice; dividing by it once fixes
# that. Both models were fitted on the same y_train, so they share the same
# class order and the same empirical prior.
# The rows are not re-normalised: only the per-row ranking is used downstream.
tot_prob = pred1_gnb * pred1_mnb / gnb1.class_prior_

### EVALUATING THE MODELS WITH MAP5

In [0]:
def map5eva(preds, actual):
    """Score ranked class predictions with mean average precision at 5 (MAP@5).

    Parameters
    ----------
    preds : array-like of shape (n_samples, n_classes)
        Per-class scores, higher meaning more likely. Column index ``j`` is
        compared directly against the labels in ``actual``, so the labels are
        expected to be the column positions (0 .. n_classes - 1).
    actual : array-like of shape (n_samples,)
        True label of every sample.

    Returns
    -------
    tuple
        ``("MAP@5", -score)``. The score is returned negated, as in the
        original interface (presumably for evaluators that minimise the
        metric - confirm with the callers).
    """
    preds = np.asarray(preds)
    actual = np.asarray(actual)
    # Never ask for more ranks than there are classes.
    top_k = min(5, preds.shape[1])
    # argsort is ascending, so the best classes sit at the end of each row.
    # -np.arange(1, top_k + 1) == [-1, ..., -top_k] picks them best-first.
    # (-np.arange(5) would start at -0 == 0, i.e. the *least* likely class.)
    predicted = preds.argsort(axis=1)[:, -np.arange(1, top_k + 1)]
    metric = 0
    for i in range(top_k):
        # A hit at rank i + 1 contributes 1 / (i + 1).
        metric += np.sum(actual == predicted[:, i]) / (i + 1)
    metric /= actual.shape[0]
    return "MAP@5", -metric

In [None]:
# Evaluate the combined predictor on the test set; the returned tuple is the
# cell's displayed output.
map5eva(preds=tot_prob, actual=y_test)