In [1]:
import pandas as pd
import pickle

In [2]:
model = pickle.load(open('../models/random_forest.pkl', 'rb'))
pipeline = pickle.load(open('../models/pipeline.pkl', 'rb'))
data = pd.read_csv('../data/processed/test_processed.csv')

In [3]:
def prepare_data(data, pipeline):
    num_attribs = ['days_until_dep', 'fly_duration', 'day_of_month', 'log_price', 'hops', 'competition']
    cat_attribs = ['flyFrom', 'flyTo', 'day_of_week', 'session']
    
    data_prepared = pipeline.transform(data[num_attribs+cat_attribs])
    return data_prepared

In [4]:
import random
routes = data['orig-dest'].unique()   
random.choices(routes, k=5)

['BCN-PMI', 'MAD-BCN', 'BCN-FCO', 'MAD-JFK', 'MAD-JFK']

In [5]:
data['days_until_dep'].unique()

array([27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11,
       10,  8,  7,  6,  5,  4,  3,  2,  1, 28,  9, 29, 30])

In [6]:
X = prepare_data(data, pipeline)

In [7]:
model.predict(X[:10]).round()

array([0., 0., 4., 2., 0., 0., 4., 3., 3., 2.])

In [8]:
pd.DataFrame({'col1':[1,2,3], 'col2':[6,7,8]})

Unnamed: 0,col1,col2
0,1,6
1,2,7
2,3,8


In [81]:
class Simulator():
    def __init__(self, n, flights, model, pipeline):
        self.n = n
        self.flights = flights
        self.model = model
        self.pipeline = pipeline
        
    def generate_travellers(self):
        routes = self.flights['orig-dest'].unique()
        departures = self.flights['dDate'].unique()
        requests = self.flights['days_until_dep'].unique()
        
        travellers_routes = random.choices(routes, k=self.n)
        travellers_dep_date = random.choices(departures, k=self.n)
        travellers_req_date = random.choices(requests, k=self.n)
        
        self.travellers = pd.DataFrame({'route': travellers_routes,
                                        'departure': travellers_dep_date,
                                        'request':travellers_req_date})
        
    def get_cheapest_flights(self):
        # select cheapest flight for each traveler
        merged = pd.merge(self.travellers, self.flights, left_on=['departure', 'route'], right_on=['dDate', 'orig-dest'])
        group_cols = list(self.travellers.columns)
        cheapest_indexes = merged.groupby(group_cols)['price'].idxmin()
        self.cheapest_flights = merged.loc[cheapest_indexes].drop(group_cols, axis=1)

    def prepare_data(self, data):
        num_attribs = ['days_until_dep', 'fly_duration', 'day_of_month', 'log_price', 'hops', 'competition']
        cat_attribs = ['flyFrom', 'flyTo', 'day_of_week', 'session']
    
        data_prepared = self.pipeline.transform(data[num_attribs+cat_attribs])
        return data_prepared
    
    def predict(self):
        data_prepared = self.prepare_data(self.cheapest_flights)
        predicted = self.model.predict(data_prepared).round()
        self.predicted = predicted
    
    

In [82]:
sim = Simulator(10, data, model, pipeline)

In [83]:
sim.generate_travellers()

In [84]:
sim.travellers

Unnamed: 0,route,departure,request
0,MAD-TFN,2021-03-02,30
1,MAD-BCN,2021-03-02,18
2,MAD-MEX,2021-02-28,15
3,BCN-LGW,2021-02-28,3
4,MAD-LHR,2021-02-28,28
5,BCN-PMI,2021-03-02,20
6,BCN-FCO,2021-02-28,28
7,MAD-JFK,2021-03-01,23
8,BCN-LGW,2021-02-27,20
9,BCN-LGW,2021-03-01,25


In [85]:
sim.get_cheapest_flights()

In [86]:
sim.cheapest_flights

Unnamed: 0,flyFrom,flyTo,orig-dest,dDate,day_of_month,day_of_week,fly_duration,distance,days_until_dep,session,airline,hops,direct,competition,price,log_price,hist_prices,waiting_days,buy
4828,BCN,FCO,BCN-FCO,2021-02-28,28,Sunday,26.916667,847.82,27,night,FR,0,True,19,52,3.951244,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,0
7087,BCN,LGW,BCN-LGW,2021-02-27,27,Saturday,20.75,1110.28,27,evening,UX,0,True,18,59,4.077537,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,0
2524,BCN,LGW,BCN-LGW,2021-02-28,28,Sunday,5.583333,1110.28,28,morning,UX,0,True,19,51,3.931826,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,0
7542,BCN,LGW,BCN-LGW,2021-03-01,1,Monday,16.666667,1110.28,29,morning,FR,0,True,16,38,3.637586,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,0
4000,BCN,PMI,BCN-PMI,2021-03-02,2,Tuesday,0.833333,202.18,30,night,VY,0,True,10,22,3.091042,"[470.0, 470.0, 470.0, 470.0, 251.0, 161.0, 84....",0,0
444,MAD,BCN,MAD-BCN,2021-03-02,2,Tuesday,1.416667,483.25,30,morning,UX,0,True,13,34,3.526361,"[70.0, 69.0, 60.0, 59.0, 61.0, 53.0, 52.0, 53....",0,0
5747,MAD,JFK,MAD-JFK,2021-03-01,1,Monday,8.75,5767.2,29,evening,AY,0,True,19,151,5.01728,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,0
2930,MAD,LHR,MAD-LHR,2021-02-28,28,Sunday,2.583333,1245.76,28,night,IB,0,True,29,45,3.806662,"[112.0, 112.0, 95.0, 112.0, 110.0, 135.0, 135....",0,0
1722,MAD,MEX,MAD-MEX,2021-02-28,28,Sunday,21.416667,9075.77,22,night,DL,0,True,31,215,5.370638,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",0,0
0,MAD,TFN,MAD-TFN,2021-03-02,2,Tuesday,2.833333,1774.11,30,morning,IB,0,True,6,45,3.806662,"[nan, nan, nan, nan, nan, 145.0, nan, nan, 118...",0,0


In [41]:
sim.selected_flights.groupby(['route', 'departure', 'request'])['price'].agg(['count', 'min']).reset_index()

Unnamed: 0,route,departure,request,count,min
0,BCN-AMS,2021-03-01,29,994,29
1,BCN-AMS,2021-03-02,3,897,53
2,BCN-FCO,2021-02-27,27,764,49
3,BCN-LGW,2021-03-02,7,386,59
4,BCN-LGW,2021-03-02,27,386,59
5,BCN-PMI,2021-02-28,9,501,22
6,BCN-PMI,2021-03-01,17,722,22
7,MAD-BCN,2021-03-02,17,746,34
8,MAD-EZE,2021-03-01,1,611,267
9,MAD-EZE,2021-03-01,2,611,267


In [43]:
df = sim.selected_flights

In [45]:
df.loc[df.groupby(['route', 'departure', 'request'])['price'].idxmin()]

3217     29
4214     53
5108     49
1222     59
1608     59
2716     22
1994     22
5872     34
750     267
139     267
Name: price, dtype: int64