In [67]:
import pandas as pd
import numpy as np
import random
import pickle

In [2]:
# Load the fitted estimator, the fitted preprocessing pipeline and the processed
# test flights. Context managers make sure the pickle file handles are closed
# as soon as each object has been deserialised.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# artefacts that come from a trusted source.
with open('../models/random_forest.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('../models/pipeline.pkl', 'rb') as pipeline_file:
    pipeline = pickle.load(pipeline_file)
data = pd.read_csv('../data/processed/test_processed.csv')

In [80]:
def get_waiting_price(row):
    """Return the price paid after waiting ``row['waiting_days']`` days.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide the keys ``'waiting_days'``, ``'hist_prices'`` and
        ``'price'``. ``'hist_prices'`` is parsed as a string-encoded,
        comma-separated list such as ``"[10.0, 12.5, nan]"``.

    Returns
    -------
    The entry of ``hist_prices`` at position ``waiting_days - 1`` as a float.
    Falls back to ``row['price']`` (returned unchanged) when:

    * ``waiting_days`` is zero or negative,
    * the history has no entry for that day,
    * the entry is empty / not parseable as a number, or
    * the entry is NaN.
    """
    current_price = row['price']
    waiting_days = int(row['waiting_days'])

    # Zero means "buy now". A negative value would otherwise become a negative
    # list index and silently pick a price from the END of the history.
    if waiting_days <= 0:
        return current_price

    # str() keeps the parsing below working even if the cell is not a string
    # (e.g. a missing value read as float NaN becomes the token 'nan').
    tokens = str(row['hist_prices']).replace('[', '').replace(']', '').split(',')

    idx = waiting_days - 1
    if idx >= len(tokens):
        # The history does not reach that far: no known future price.
        return current_price

    try:
        waiting_price = float(tokens[idx])
    except ValueError:
        # Empty or malformed entry, e.g. hist_prices == "[]".
        return current_price

    if np.isnan(waiting_price):
        return current_price
    return waiting_price

In [150]:
class Simulator():
    """Simulate random travellers and compare buying now with following the model.

    The stages are: draw ``n`` random travellers, pick the cheapest flight for
    each, predict how many days to wait, look up the price after waiting and
    print a summary of current price, waiting price and savings.

    Parameters
    ----------
    n : int
        Number of travellers to draw.
    flights : pandas.DataFrame
        Flight records. The methods read the columns 'orig-dest', 'dDate',
        'days_until_dep', 'price', 'hist_prices' and the feature columns
        listed in ``NUM_ATTRIBS`` and ``CAT_ATTRIBS``.
    model : object
        Must expose ``predict(X)``; its rounded output is stored as
        'waiting_days'.
    pipeline : object
        Must expose ``transform(X)``; applied to the feature columns before
        prediction.
    seed : int or None, optional
        When given, travellers are drawn from a private ``random.Random(seed)``
        so the simulation is reproducible. When None (default) the global
        ``random`` module state is used, exactly as before.
    """

    # Feature columns handed to the preprocessing pipeline, in this order.
    NUM_ATTRIBS = ['days_until_dep', 'fly_duration', 'day_of_month', 'log_price', 'hops', 'competition']
    CAT_ATTRIBS = ['flyFrom', 'flyTo', 'day_of_week', 'session']

    def __init__(self, n, flights, model, pipeline, seed=None):
        self.n = n
        self.flights = flights
        self.model = model
        self.pipeline = pipeline
        self.seed = seed
        # The `random` module and `random.Random` share the `choices` API, so
        # either can act as the source of randomness.
        self._rng = random if seed is None else random.Random(seed)

    def generate_travellers(self):
        """Draw ``n`` travellers and store them in ``self.travellers``.

        Route, departure date and request value are sampled independently
        (with replacement) from the distinct values in ``self.flights``.
        """
        routes = self.flights['orig-dest'].unique()
        departures = self.flights['dDate'].unique()
        requests = self.flights['days_until_dep'].unique()

        self.travellers = pd.DataFrame({
            'route': self._rng.choices(routes, k=self.n),
            'departure': self._rng.choices(departures, k=self.n),
            'request': self._rng.choices(requests, k=self.n),
        })

    def get_cheapest_flights(self):
        """Store the cheapest matching flight per traveller in ``self.cheapest_flights``.

        Travellers are inner-joined to flights on departure date and route, so
        travellers without any matching flight are dropped.
        """
        # NOTE(review): 'request' is not a join key, so flights with any
        # 'days_until_dep' compete for the minimum price -- confirm intended.
        merged = pd.merge(self.travellers, self.flights,
                          left_on=['departure', 'route'],
                          right_on=['dDate', 'orig-dest'])
        group_cols = list(self.travellers.columns)
        # NOTE(review): travellers sharing the same (route, departure, request)
        # form a single group and therefore yield a single row.
        cheapest_indexes = merged.groupby(group_cols)['price'].idxmin()
        self.cheapest_flights = merged.loc[cheapest_indexes].drop(group_cols, axis=1)

    def prepare_data(self, data):
        """Return ``data``'s feature columns transformed by ``self.pipeline``."""
        feature_cols = self.NUM_ATTRIBS + self.CAT_ATTRIBS
        return self.pipeline.transform(data[feature_cols])

    def make_predictions(self):
        """Add an integer 'waiting_days' column predicted by ``self.model``."""
        data_prepared = self.prepare_data(self.cheapest_flights)
        predicted = self.model.predict(data_prepared).round()
        self.cheapest_flights['waiting_days'] = predicted.astype(int)

    def compute_savings(self):
        """Add 'waiting_price' and 'savings' (= price - waiting_price) columns."""
        flights = self.cheapest_flights
        flights['waiting_price'] = flights.apply(get_waiting_price, axis=1)
        flights['savings'] = flights['price'] - flights['waiting_price']

    def visualize_results(self):
        """Print totals and per-row means of price, waiting price and savings."""
        outcome = self.cheapest_flights[['price', 'waiting_days', 'waiting_price', 'savings']]

        results = {'Current': [outcome['price'].sum(), outcome['price'].mean()],
                   'Model': [outcome['waiting_price'].sum(), outcome['waiting_price'].mean()],
                   'Savings': [outcome['savings'].sum(), outcome['savings'].mean()]}

        summary = pd.DataFrame(results, index=['Total', 'By Traveler'])
        print(summary.to_string())

    def run(self):
        """Run every simulation stage in order and print the summary."""
        self.generate_travellers()
        self.get_cheapest_flights()
        self.make_predictions()
        self.compute_savings()
        self.visualize_results()

In [151]:
# Build a simulator for 20 random travellers using the flights, model and
# preprocessing pipeline loaded above.
sim = Simulator(20, data, model, pipeline)

In [152]:
# Run all simulation stages in order; the summary table is printed to stdout.
sim.run()

             Current    Model  Savings
Total        1973.00  2349.00   -376.0
By Traveler    98.65   117.45    -18.8
