In [141]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

In [168]:
# Load the three reservation exports (one CSV per period) and stack them into
# a single frame.
jan_apr = pd.read_csv('dados_afonso/room_say_report/reservas_por_data_criada/jan_apr.csv')
may_june = pd.read_csv('dados_afonso/room_say_report/reservas_por_data_criada/may_june.csv')
jul_aug = pd.read_csv('dados_afonso/room_say_report/reservas_por_data_criada/jul_aug.csv')

# ignore_index=True: each file is read with its own default 0..n-1 index, so a
# plain concat would repeat row labels across files. Duplicate labels make
# label-based selection (.loc) and index-aligned assignment ambiguous, so give
# the combined frame one fresh, unique RangeIndex.
data = pd.concat([jan_apr, may_june, jul_aug], ignore_index=True)

# Parse dates and derive new feature columns

In [169]:
# Preview the first five rows of the combined frame.
data.head(n=5)

Unnamed: 0,Reservation ID,Channel,Affiliated Channel,Booked At,Modified At,Cancelled At,Room,Check-In,Check-Out,Guests,Adults,Children,Infants
0,2468606756,Booking.com,,2022-01-01 13:19:36,,,Apartamento - Piso Térreo - 24% - Oferta InÃ­c...,2022-01-02,2022-01-03,B****** C*******,2,,
1,3404160971,Booking.com,,2022-01-02 21:57:44,,,Apartamento com 1 Quarto (2 Adultos) - Rua dos...,2022-01-03,2022-01-05,M***** K******,2,,
2,3545920362,Booking.com,,2022-01-03 02:29:23,,,Apartamento com 1 Quarto - Rua de Santo Estêvã...,2022-01-03,2022-01-09,S****** M******,1,,
3,3807966157,Booking.com,,2022-01-03 17:48:12,,,Apartamento - Piso Térreo - High Season 3 Nights,2022-01-03,2022-01-06,W****** C****,2,,
4,2769935846,Booking.com,,2022-01-01 22:51:52,,,Apartamento com 2 Quartos (4 Adultos) - Rua do...,2022-01-03,2022-01-08,", S***** N*******",2,1.0,


In [170]:
# Parse the date/time columns into datetime64 values.
#
# The exported strings use '-' as the date separator (e.g. '2022-01-01 13:19:36',
# '2022-01-02'), so the format strings must use '-' as well. A '/' pattern does
# not describe these strings and is rejected when pandas matches the format
# strictly.

# 'Booked At' and 'Cancelled At' carry a time of day. normalize() resets it to
# midnight so that later date subtractions yield whole calendar-day differences.
# NOTE(review): no 'Cancelled At' value is visible in the preview; it is assumed
# to share the 'Booked At' layout. Confirm against the raw CSVs.
data['Booked At'] = pd.to_datetime(data['Booked At'], format='%Y-%m-%d %H:%M:%S').dt.normalize()
data['Cancelled At'] = pd.to_datetime(data['Cancelled At'], format='%Y-%m-%d %H:%M:%S').dt.normalize()

# Check-in / check-out are plain dates.
data['Check-In'] = pd.to_datetime(data['Check-In'], format='%Y-%m-%d')
data['Check-Out'] = pd.to_datetime(data['Check-Out'], format='%Y-%m-%d')

In [171]:
# Days from 'Booked At' to 'Check-In', taken as the integer day component of
# the timedelta.
data['days_ahead'] = data['Check-In'].sub(data['Booked At']).dt.days

In [172]:
# 1 when the room / rate description contains a literal '%' sign, 0 otherwise.
# regex=False: '%' is matched as a plain substring.
# na=False: a missing 'Room' counts as "not discounted". The previous
# `.str.find('%') != -1` test evaluated NaN != -1 as True and therefore
# flagged rows with no room description as discounted.
data['discounted'] = data['Room'].str.contains('%', regex=False, na=False).astype(int)

In [173]:
# Days from 'Check-In' to 'Check-Out', taken as the integer day component of
# the timedelta.
data['stay'] = data['Check-Out'].sub(data['Check-In']).dt.days

In [174]:
# Days from 'Booked At' to 'Cancelled At'. Rows without a cancellation
# timestamp (NaT) end up as NaN.
data['canceled_after_booking'] = data['Cancelled At'].sub(data['Booked At']).dt.days

In [175]:
# Boolean flag: True where a 'Modified At' value is present.
data['modified'] = data['Modified At'].notna()

In [176]:
# Boolean flag: True where an 'Affiliated Channel' value is present.
data['from_affiliation'] = data['Affiliated Channel'].notna()

# Remove columns that are no longer needed

In [177]:
# Drop the listed raw columns from the frame.
# Note: this cell is not re-runnable as-is; a second run raises KeyError
# because the columns are already gone.
data = data.drop(
    ['Booked At', 'Room', 'Guests', 'Modified At', 'Affiliated Channel'],
    axis='columns',
)

In [178]:
# Preview the first five rows after the drop.
data.head(n=5)

Unnamed: 0,Reservation ID,Channel,Cancelled At,Check-In,Check-Out,Adults,Children,Infants,days_ahead,discounted,stay,canceled_after_booking,modified,from_affiliation
0,2468606756,Booking.com,NaT,2022-01-02,2022-01-03,2,,,1,1,1,,False,False
1,3404160971,Booking.com,NaT,2022-01-03,2022-01-05,2,,,1,1,2,,False,False
2,3545920362,Booking.com,NaT,2022-01-03,2022-01-09,1,,,0,0,6,,False,False
3,3807966157,Booking.com,NaT,2022-01-03,2022-01-06,2,,,0,0,3,,False,False
4,2769935846,Booking.com,NaT,2022-01-03,2022-01-08,2,1.0,,2,0,5,,False,False


array([                          'NaT', '2022-01-06T00:00:00.000000000',
       '2022-01-02T00:00:00.000000000', '2022-01-22T00:00:00.000000000',
       '2022-01-18T00:00:00.000000000', '2022-01-25T00:00:00.000000000',
       '2022-01-31T00:00:00.000000000', '2022-01-20T00:00:00.000000000',
       '2022-01-24T00:00:00.000000000', '2022-03-05T00:00:00.000000000',
       '2022-01-11T00:00:00.000000000', '2022-02-15T00:00:00.000000000',
       '2022-04-08T00:00:00.000000000', '2022-01-10T00:00:00.000000000',
       '2022-04-21T00:00:00.000000000', '2022-03-06T00:00:00.000000000',
       '2022-04-25T00:00:00.000000000', '2022-02-04T00:00:00.000000000',
       '2022-01-27T00:00:00.000000000', '2022-01-21T00:00:00.000000000',
       '2022-02-08T00:00:00.000000000', '2022-01-23T00:00:00.000000000',
       '2022-02-22T00:00:00.000000000', '2022-02-14T00:00:00.000000000',
       '2022-02-05T00:00:00.000000000', '2022-02-20T00:00:00.000000000',
       '2022-01-28T00:00:00.000000000', '2022-01-26