In [1]:
import pandas as pd

def drop(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without the ``CompanyReservation`` column.

    If the column is not present the input frame is handed back unchanged
    (the very same object); otherwise a new frame lacking that column is
    returned.
    """
    unwanted = 'CompanyReservation'
    if unwanted not in df.columns:
        return df
    return df.drop(columns=[unwanted])

def normalize_column_names(X: pd.DataFrame, y: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Lowercase all column names in both X and y.

    The inputs are not modified: each frame is copied first, so re-running a
    notebook cell (or reusing the raw frames elsewhere) still sees the
    original column names.

    Args:
        X: Feature frame.
        y: Target frame.

    Returns:
        ``(X_lower, y_lower)`` — copies of the inputs whose column labels
        have been lowercased via ``Index.str.lower()``.
    """
    # Assigning to ``.columns`` mutates the frame it is set on, so work on
    # copies to keep the caller's objects untouched.
    X_lower = X.copy()
    y_lower = y.copy()
    X_lower.columns = X_lower.columns.str.lower()
    y_lower.columns = y_lower.columns.str.lower()
    return X_lower, y_lower


def rename_columns(X: pd.DataFrame) -> pd.DataFrame:
    """Give a few awkwardly named columns short snake_case names.

    The lookup keys are the lower-cased column labels; any column not listed
    is left as it is. A new frame is returned.
    """
    replacements = [
        ("%paidinadvance", "percent_paid_in_advance"),
        ("countryoforiginavgincomeeuros (year-2)", "country_income_euros_y2"),
        ("countryoforiginavgincomeeuros (year-1)", "country_income_euros_y1"),
        ("countryoforiginhdi (year-1)", "country_hdi_y1"),
    ]
    renamed = X.rename(columns=dict(replacements))
    return renamed


def create_arrivaltime_feature(X: pd.DataFrame) -> pd.DataFrame:
    """Create a datetime column from arrival date and time components.

    Reads ``arrivalyear``, ``arrivalmonth``, ``arrivaldayofmonth`` and the
    fractional ``arrivalhour`` (e.g. ``15.25`` means 15:15) and adds three
    columns, in this order: ``hour``, ``minute`` and ``arrivaltime``.

    The fractional hour is rounded to the nearest whole minute on the total
    number of minutes since midnight. Rounding the fractional part on its own
    could yield ``minute == 60`` (e.g. ``15.999`` -> "15:60"), which is not a
    valid time; here such a value becomes 16:00 instead.

    The input frame is not modified; a copy carrying the new columns is
    returned.

    Args:
        X: Frame containing the four arrival component columns.

    Returns:
        A copy of ``X`` with integer ``hour`` / ``minute`` columns and a
        ``datetime64[ns]`` ``arrivaltime`` column.
    """
    X = X.copy()  # do not add columns to the caller's frame

    # Round once, on total minutes, so the carry into the hour is handled.
    minutes_since_midnight = (X["arrivalhour"] * 60).round().astype(int)

    # pd.to_datetime assembles dates from columns named year / month / day.
    date_parts = X[["arrivalyear", "arrivalmonth", "arrivaldayofmonth"]].rename(
        columns={
            "arrivalyear": "year",
            "arrivalmonth": "month",
            "arrivaldayofmonth": "day",
        }
    )
    arrival_time = pd.to_datetime(date_parts) + pd.to_timedelta(
        minutes_since_midnight, unit="m"
    )

    # Derive hour/minute from the final timestamp so all three columns agree.
    X["hour"] = arrival_time.dt.hour.astype(int)
    X["minute"] = arrival_time.dt.minute.astype(int)
    X["arrivaltime"] = arrival_time

    print("First customer of the year:", X["arrivaltime"].min())
    print("Last customer of the year:", X["arrivaltime"].max())

    return X


def merge_target_with_datetime(y: pd.DataFrame, X: pd.DataFrame) -> pd.DataFrame:
    """Attach each booking's ``arrivaltime`` to the target frame.

    Performs an inner join between ``y`` and the ``bookingid`` /
    ``arrivaltime`` columns of ``X`` on ``bookingid``, so only bookings
    present in both frames survive.
    """
    arrival_lookup = X[["bookingid", "arrivaltime"]]
    merged = pd.merge(left=y, right=arrival_lookup, how="inner", on="bookingid")
    return merged

In [3]:
import pandas as pd

In [9]:
# Load the validated training features from the 02_intermediate data folder.
# NOTE(review): this is an absolute, machine-specific path, so the cell will
# only run on the author's machine — consider a path relative to the project root.
df = pd.read_csv("/Users/mariananeto/Documents/GitHub/mlops_project_v2/hotel-california/data/02_intermediate/X_train_validated.csv")

In [14]:
# Load the validated training target from the 03_primary data folder.
# NOTE(review): absolute, machine-specific path. The displayed frame also has
# an "Unnamed: 0" column, which suggests the CSV was written with its index —
# confirm whether index_col=0 is wanted here.
y = pd.read_csv("/Users/mariananeto/Documents/GitHub/mlops_project_v2/hotel-california/data/03_primary/y_train_validated.csv")

In [15]:
# Display the target frame for a quick visual check.
y

Unnamed: 0,Canceled
0,0
1,1
2,1
3,0
4,1
...,...
11145,1
11146,1
11147,0
11148,0


In [10]:
# Display the feature frame for a quick visual check.
# NOTE(review): this cell's execution count is lower than the cells around it,
# so the saved output may come from an earlier, out-of-order run.
df

Unnamed: 0,BookingID,ArrivalYear,ArrivalMonth,ArrivalWeekNumber,ArrivalDayOfMonth,ArrivalHour,WeekendStays,WeekdayStays,Adults,Children,...,CompanyReservation,OrderedMealsPerDay,FloorReserved,FloorAssigned,DailyRateEuros,DailyRateUSD,%PaidinAdvance,CountryofOriginAvgIncomeEuros (Year-2),CountryofOriginAvgIncomeEuros (Year-1),CountryofOriginHDI (Year-1)
0,10423,2016,3,13,21,18.00,1,2,2,0.0,...,1,1,6,3,85.00,84.1500,0.0,40141.59,40833.24,0.898
1,14038,2016,4,17,23,16.00,2,4,2,2.0,...,1,1,1,1,206.00,203.9400,0.0,7199.25,7096.60,0.572
2,37179,2016,12,53,28,16.00,0,1,2,0.0,...,1,1,6,6,138.00,136.6200,0.0,28742.44,29668.86,0.842
3,15019,2016,4,18,30,15.25,2,5,2,0.0,...,1,1,6,6,109.81,108.7119,0.0,28742.44,29668.86,0.842
4,30385,2016,5,21,17,15.50,0,2,2,0.0,...,1,1,6,5,140.00,138.6000,0.0,28742.44,29668.86,0.842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11145,12808,2016,10,44,29,17.50,2,1,2,0.0,...,1,1,6,6,90.90,89.9910,0.0,36194.87,36909.33,0.876
11146,34022,2016,10,44,26,17.25,0,2,1,0.0,...,1,1,6,6,130.00,128.7000,0.0,28742.44,29668.86,0.842
11147,30472,2016,4,18,28,17.25,0,2,2,0.0,...,1,1,6,6,111.60,110.4840,0.0,47011.55,47683.79,0.933
11148,20119,2016,7,30,20,21.50,0,2,2,1.0,...,1,1,6,3,126.00,124.7400,0.0,28742.44,29668.86,0.842


In [16]:
# `l` holds the (X, y) tuple returned by normalize_column_names.
# NOTE(review): a single-letter name for a tuple is easy to misread; unpacking
# into two descriptive names would be clearer.
l = normalize_column_names(df, y)

In [17]:
# Display the returned (X, y) tuple to verify the lowercased column names.
l

(       bookingid  arrivalyear  arrivalmonth  arrivalweeknumber  \
 0          10423         2016             3                 13   
 1          14038         2016             4                 17   
 2          37179         2016            12                 53   
 3          15019         2016             4                 18   
 4          30385         2016             5                 21   
 ...          ...          ...           ...                ...   
 11145      12808         2016            10                 44   
 11146      34022         2016            10                 44   
 11147      30472         2016             4                 18   
 11148      20119         2016             7                 30   
 11149      32885         2016             9                 40   
 
        arrivaldayofmonth  arrivalhour  weekendstays  weekdaystays  adults  \
 0                     21        18.00             1             2       2   
 1                     23        16.00

In [21]:
import pandas as pd
# Load the final validated training features from the 03_primary data folder.
# NOTE(review): this rebinds `df`, which earlier pointed at a different file,
# and uses an absolute, machine-specific path.
df = pd.read_csv("/Users/mariananeto/Documents/GitHub/mlops_project_v2/hotel-california/data/03_primary/X_train_validated_final.csv")

In [25]:
# Parse the arrivaltime column (read from CSV) back into datetimes.
# errors='coerce' turns any unparseable value into NaT instead of raising, so
# bad rows would pass silently — NOTE(review): consider checking for NaT afterwards.
df['arrivaltime'] = pd.to_datetime(df['arrivaltime'], errors='coerce')

In [26]:
# Inspect the parsed arrivaltime column.
df['arrivaltime']

0       2016-03-21 18:00:00
1       2016-04-23 16:00:00
2       2016-12-28 16:00:00
3       2016-04-30 15:15:00
4       2016-05-17 15:30:00
                ...        
11145   2016-10-29 17:30:00
11146   2016-10-26 17:15:00
11147   2016-04-28 17:15:00
11148   2016-07-20 21:30:00
11149   2016-09-25 16:00:00
Name: arrivaltime, Length: 11150, dtype: datetime64[ns]

In [27]:
# Confirm the column now has a datetime dtype (the output shows '<M8[ns]').
df['arrivaltime'].dtype

dtype('<M8[ns]')

In [None]:
def clean(feature_view_name, feature_view_version, feature_group_name, feature_group_version, SETTINGS):
    """
    Utility function used during development to delete one feature view and
    one feature group from the feature store.

    Deletion is best-effort: any exception raised while looking up or deleting
    an object is printed and execution continues.

    Args:
        feature_view_name: Name of the feature view to delete. An empty value
            skips the feature-view step entirely.
        feature_view_version: Version of that feature view.
        feature_group_name: Name of the feature group to delete. An empty
            value skips the feature-group step entirely.
        feature_group_version: Version of that feature group.
        SETTINGS: Mapping providing ``"FS_API_KEY"`` and ``"FS_PROJECT_NAME"``
            for the Hopsworks login.

    Note:
        Relies on ``hopsworks`` already being imported in the running kernel.
    """

    project = hopsworks.login(
        api_key_value=SETTINGS["FS_API_KEY"],
        project=SETTINGS["FS_PROJECT_NAME"],
    )
    fs = project.get_feature_store()

    print("Deleting feature views and training datasets...")
    if feature_view_name:
        try:
            found = fs.get_feature_view(name=feature_view_name, version=feature_view_version)
        except Exception as e:
            print(e)
        else:
            # get_feature_view(name, version) yields a single feature view, not
            # a collection; iterating over it directly raises TypeError and the
            # view would never be deleted. Normalise to a list so either
            # return shape is handled.
            if found is None:
                feature_views = []
            elif isinstance(found, (list, tuple)):
                feature_views = list(found)
            else:
                feature_views = [found]

            for feature_view in feature_views:
                try:
                    feature_view.delete()
                except Exception as e:
                    print(e)

    print("Deleting feature groups...")
    if feature_group_name:
        try:
            feature_group = fs.get_feature_group(name=feature_group_name, version=feature_group_version)
            feature_group.delete()
        except Exception as e:
            print(e)

In [None]:
# Development-only teardown: delete version 1 of every feature group below.
# The empty feature-view name / version 0 mean no feature view is targeted.
# WARNING: this is destructive — run it deliberately, not as part of "Run All".
FEATURE_GROUPS_TO_DELETE = [
    'arrival_features',
    'guest_features',
    'booking_features',
    'financial_features',
    'numerical_features',
    'categorical_features',
    'target',
]

for feature_group_name in FEATURE_GROUPS_TO_DELETE:
    clean('', 0, feature_group_name, 1, SETTINGS_STORE)