In [57]:
import typing

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split

# _features = {"hotel_star_rating": (0, 5),
#              "no_of_adults": (1, 19),
#              "no_of_children": (0, 8),
#              "no_of_extra_bed": (0, 4),
#              "no_of_room": (1, 9),
#              "waterfront": (0, 1),
#              "view": (0, 4),
#              "condition": (1, 5),
#              "grade": (1, 13),
#              "sqft_above": (250, 10000),
#              _sqft_basement_label: (0, 5000),
#              "yr_built": (1900, 2015),
#              _yr_renovated_label: (0, 2015),
#              "zipcode": (98000, 99000),
#              "lat": (47, 48),
#              "long": (-123, -121)}

_dates = ["booking_datetime", "checkin_date", "checkout_date", "hotel_live_date", "cancellation_datetime"]
_irrelevant_features = ["h_booking_id", "hotel_chain_code", "hotel_brand_code", "request_earlycheckin",
                        "request_airport", "request_twinbeds", "request_largebed", "request_highfloor",
                        "request_latecheckin", "request_nonesmoke"]
_categorial_features = ["hotel_country_code", "accommadation_type_name", "charge_option",
                        "customer_nationality", "guest_nationality_country_name", "origin_country_code",
                        "original_payment_method", "original_payment_type", "original_payment_currency",
                        "hotel_area_code", "is_first_booking", "is_user_logged_in"]  #"cancellation_policy_code",

In [56]:
def split_data(X: pd.DataFrame):
    # Splitting the DataFrame into three parts: train, validation, and test
    train_df, temp_df = train_test_split(X, test_size=0.4, random_state=42)
    validation_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

    # Printing the sizes of the resulting DataFrames
    print("Train set size:", len(train_df))
    print("Validation set size:", len(validation_df))
    print("Test set size:", len(test_df))
    return train_df, test_df, validation_df

In [None]:
def _fill_missings_values(X: pd.DataFrame):
    """
    fills missings values by prediction Parameters
    ----------
    X : DataFrame of shape (n_samples, n_features)
        Design matrix of regression problem
    """

    model = LinearRegression(include_intercept=True)
    for label in X.columns:
        label_X = X.dropna(subset=X.columns.difference([label]))
        data_to_pred = label_X[label_X[label].isnull()]
        if data_to_pred.empty:
            continue

        X_not_null = label_X[label_X[label].notna()]
        y_pred_train = X_not_null[label]
        X_pred_train = X_not_null.drop(label, axis=1)
        X_pred_test = data_to_pred.drop(label, axis=1)
        model.fit(X_pred_train, y_pred_train)

        predicted_vals = model.predict(X_pred_test).round(decimals=2)
        X[label].update(pd.Series(predicted_vals, index=X_pred_test.index))

In [15]:
def load_data(filename: str) -> pd.DataFrame:
    """
    Load city daily temperature dataset and preprocess data.
    Parameters
    ----------
    filename: str
        Path to house prices dataset

    Returns
    -------
    Design matrix and response vector (Temp)
    """

    df = pd.read_csv(filename, parse_dates=_dates)

    return df

In [42]:
def add_extra_features(X: pd.DataFrame):
    X['order_canceled'] = np.where(df['cancellation_datetime'] != np.nan, 1, 0)
    X['duration_days'] = (X['checkin_date'] - X['checkout_date']).dt.days
    X['booked_days_before'] = (X['booking_datetime'] - X['checkin_date']).dt.days
    X['cencel_code_day_one'] = df.apply(lambda row: parse_code_day_one(row['cancellation_policy_code']), axis=1)
    X['cencel_code_return_one'] = df.apply(lambda row: parse_code_return_one(row['cancellation_policy_code']), axis=1)
    X['cencel_code_day_two'] = df.apply(lambda row: parse_code_day_two(row['cancellation_policy_code']), axis=1)
    X['cencel_code_return_two'] = df.apply(lambda row: parse_code_return_two(row['cancellation_policy_code']), axis=1)
    X['parse_code_no_show'] = df.apply(lambda row: parse_code_no_show(row['cancellation_policy_code']), axis=1)



In [48]:
import re


def preprocess_remove_columns_add_dummy(X: pd.DataFrame):
    for feat in _irrelevant_features:
        X.drop(feat, axis=1, inplace=True)
        print(X[feat])
    X = pd.get_dummies(df, prefix=_categorial_features, columns=_categorial_features)
    return X


def parse_code_day_one(row):
    numeric_values = re.findall(r'\d+', row)
    alphabetic_substrings = re.findall(r'[a-zA-Z]+', row)
    try:
        if alphabetic_substrings[0] == 'D':
            return float(numeric_values[0])
    except:
        return 0
    return 0


def parse_code_return_one(row):
    numeric_values = re.findall(r'\d+', row)
    alphabetic_substrings = re.findall(r'[a-zA-Z]+', row)
    try:
        if alphabetic_substrings[1] == 'P':
            return float(numeric_values[1]) / 100
        elif alphabetic_substrings[1] == 'N':
            return -1 * float(numeric_values[1])
        else:
            return 0
    except:
        return 0


def parse_code_day_two(row):
    numeric_values = re.findall(r'\d+', row)
    alphabetic_substrings = re.findall(r'[a-zA-Z]+', row)
    try:
        if alphabetic_substrings[2] == 'D':
            return float(numeric_values[2])
    except:
        return 0
    return 0


def parse_code_return_two(row):
    numeric_values = re.findall(r'\d+', row)
    alphabetic_substrings = re.findall(r'[a-zA-Z]+', row)
    try:
        if alphabetic_substrings[3] == 'P':
            return float(numeric_values[1]) / 100
        elif alphabetic_substrings[3] == 'N':
            return -1 * float(numeric_values[1])
        else:
            return 0
    except:
        return 0


def parse_code_no_show(row):
    numeric_values = re.findall(r'\d+', row)
    alphabetic_substrings = re.findall(r'[a-zA-Z]+', row)
    try:
        if len(alphabetic_substrings) % 2 != 0:
            if alphabetic_substrings[-1] == 'P':
                return float(numeric_values[-1]) / 100
            if alphabetic_substrings[-1] == 'N':
                return -1 * float(numeric_values[1])
        return 0
    except:
        return 0







In [None]:
def proccess_dates(df: pd.DataFrame):
    for label in _dates:
        df[f"{label}_dayofyear"] = df[label].dt.dayofyear
        df[f"{label}_year"] = df[label].dt.year

In [None]:
def preprocess_data(X: pd.DataFrame, y: typing.Optional[pd.Series] = None):
    """
    preprocess data
    Parameters
    ----------
    X : DataFrame of shape (n_samples, n_features)
        Design matrix of regression problem

    y : array-like of shape (n_samples, )
        Response vector corresponding given samples

    Returns
    -------
    Post-processed design matrix and response vector (prices) - either as a single
    DataFrame or a Tuple[DataFrame, Series]
    """

    is_train = y is not None
    if is_train:
        X = X.assign(order_canceled=y)
        X = X.drop_duplicates()


    X = X.drop(_irrelevant_features, axis=1)  # Irrelevant features

    proccess_dates(X)
    X = X.drop(_dates, axis=1)  # Irrelevant features

    for label in X:  # Replaces invalid values with temporary nan value
        X[label] = X[label].mask(~X[label].between(X[label][0], X[label][1], inclusive="both"), np.nan)

    for category in _categorial_features:  # Handles categorial features
        X[category] = X[category].astype('category')
        X = pd.get_dummies(X, prefix=category, columns=[category])

    add_extra_features(X)

    _fill_missings_values(X)
    if not is_train:
        return X

    X = X.reset_index(drop=True)
    post_processed_y = X["y_train"]
    return X.drop("y_train", axis=1), post_processed_y



In [58]:
#read data
# Specify the file path
file_path = './data_files/agoda_cancellation_train.csv'
# Read the CSV file into a DataFrame
df = load_data(file_path)

In [59]:
# Print the DataFrame
print(df.head())
print(df.info)

          h_booking_id    booking_datetime checkin_date checkout_date  \
0 -9223194055642672935 2018-06-28 21:15:00   2018-07-09    2018-07-13   
1 -9222713784330706132 2018-08-10 22:31:00   2018-08-16    2018-08-17   
2 -9222411208325704942 2018-09-14 07:55:00   2018-09-14    2018-09-15   
3 -9222220845872895471 2018-06-25 07:33:00   2018-07-02    2018-07-03   
4 -9221127186162682116 2018-07-23 10:06:00   2018-08-09    2018-08-10   

   hotel_id hotel_country_code     hotel_live_date  hotel_star_rating  \
0      6452                 HK 2009-06-28 02:02:00                4.0   
1     47729                 CN 2011-06-07 11:52:00                4.0   
2    780431                 KR 2014-11-20 15:43:00                4.0   
3    291365                 JP 2011-11-21 12:27:00                3.0   
4    479046                 TH 2013-06-06 10:04:00                3.0   

  accommadation_type_name charge_option  ...  request_highfloor  \
0                   Hotel       Pay Now  ...           

In [61]:
add_extra_features(df)
df = preprocess_remove_columns_add_dummy(df)
df.nunique
#add columns
# print(df['order_canceled'].head(100))
# print(df['cencel_code_day_two'].head(100))

KeyError: "None of [Index(['hotel_country_code', 'accommadation_type_name', 'charge_option',\n       'customer_nationality', 'guest_nationality_country_name',\n       'origin_country_code', 'original_payment_method',\n       'original_payment_type', 'original_payment_currency', 'hotel_area_code',\n       'is_first_booking', 'is_user_logged_in'],\n      dtype='object')] are in the [columns]"

In [63]:
from Classification import Classification

train_df, test_df, validation_df = split_data(df)
Classification().run_all(train_df, train_df['order_canceled'], test_df, test_df['order_canceled'])

Train set size: 35195
Validation set size: 11732
Test set size: 11732


TypeError: float() argument must be a string or a number, not 'Timestamp'