## Load ipython-sql and connect to database

In [1]:
# Load the ipython-sql extension so the %sql / %%sql magics used below are available.
%load_ext sql

In [2]:
import os, re
from IPython.display import display_html

CONNECTION_STRING = os.getenv('AWSGPDBCONN')

cs = re.match('^postgresql:\/\/(\S+):(\S+)@(\S+):(\S+)\/(\S+)$', CONNECTION_STRING)

DB_USER   = cs.group(1)
DB_PWD    = cs.group(2)
DB_SERVER = cs.group(3)
DB_PORT   = cs.group(4)
DB_NAME   = cs.group(5)

%reload_ext sql
%sql $CONNECTION_STRING

'Connected: gpadmin@gpadmin'

In [3]:
# Smoke-test the connection by asking the server for its version string.
%sql SELECT VERSION();

 * postgresql://gpadmin:***@ec2-3-9-174-91.eu-west-2.compute.amazonaws.com:5432/gpadmin
1 rows affected.


version
"PostgreSQL 9.4.24 (Greenplum Database 6.3.0 build commit:77aa1b6e4486adbaede9f5f2864a04fc3a512e93) on x86_64-unknown-linux-gnu, compiled by gcc (GCC) 6.4.0, 64-bit compiled on Jan 9 2020 23:10:47"


In [4]:
# Resolve unqualified object names in the pricing schema first, then in public.
%sql SET search_path=pricing, public;

 * postgresql://gpadmin:***@ec2-3-9-174-91.eu-west-2.compute.amazonaws.com:5432/gpadmin
Done.


[]

# Pricing Optimization

### 1_Run_Linear_Regression

In [15]:
%%sql
-- Start from a clean slate: drop the output table named in the linregr_train
-- call below, together with its companion _summary table.
DROP TABLE IF EXISTS pricing.model_results CASCADE;
DROP TABLE IF EXISTS pricing.model_results_summary CASCADE;

-- Optional: run ANALYZE on pricing.flight_history before training if you want.

-- Fit a linear regression of Sales with MADlib, one model per
-- (routeid, origin, destination, class, days_to_flight) group.
-- The order of the features inside the ARRAY is significant: the scoring step
-- reads coef[2] as the own-price coefficient and takes the dot product of coef
-- with an array built in this same order.
-- NOTE(review): an intercept plus indicators for all seven weekday values is
-- linearly dependent if Flight_Weekday only takes the values 1..7 - confirm
-- that this is intended.
-- NOTE(review): the trend feature uses CURRENT_DATE, so its value depends on
-- the day this cell is run.
SELECT madlib.linregr_train(
    'pricing.flight_history',   -- source table
    'pricing.model_results',    -- output table
    'Sales',                    -- dependent variable
    'ARRAY[
    -- intercept
    1,
    --prices
    Price,
    Price_Comp1,
    Price_Comp2,
    Price_Comp3, 
    Price_Comp4, 
    --seasonality 
    Flight_Month, 
    CASE WHEN Flight_Weekday=1 THEN 1 ELSE 0 END, 
    CASE WHEN Flight_Weekday=2 THEN 1 ELSE 0 END, 
    CASE WHEN Flight_Weekday=3 THEN 1 ELSE 0 END, 
    CASE WHEN Flight_Weekday=4 THEN 1 ELSE 0 END, 
    CASE WHEN Flight_Weekday=5 THEN 1 ELSE 0 END, 
    CASE WHEN Flight_Weekday=6 THEN 1 ELSE 0 END, 
    CASE WHEN Flight_Weekday=7 THEN 1 ELSE 0 END, 
    Holiday_Indicator, 
    --trend
    CURRENT_DATE-flight_date]',
    'routeid, origin, destination, class, days_to_flight'   -- grouping columns
);


 * postgresql://gpadmin:***@ec2-3-9-174-91.eu-west-2.compute.amazonaws.com:5432/gpadmin
Done.
Done.
1 rows affected.


linregr_train


### 2_Score_Linear_Regression_Model

In [16]:
%%sql
-- Refresh statistics on both join inputs before the large join below.
ANALYZE pricing.to_be_priced_flights;
ANALYZE pricing.model_results;

DROP TABLE IF EXISTS pricing.scoring CASCADE;

-- Score every flight to be priced against the model fitted for its
-- route, class and days-to-flight value. Each output row carries
--   price_coef       the own-price coefficient (coef[2]), capped at -0.0001
--   predicted_sales  the model prediction with the own price set to 0
-- The feature array must list its entries in the same order as the array
-- used to train the model.
CREATE TABLE pricing.scoring AS (
    SELECT
        t.RouteID,
        t.Route_Origin,
        t.Route_Destination,
        t.Class,
        t.Flight_Date,
        t.Days_To_Flight,
        LEAST(model.coef[2], -0.0001) AS price_coef,
        madlib.array_dot(
            ARRAY[
                1,
                0, -- Own price is zeroed out
                t.Price_Comp1,
                t.Price_Comp2,
                t.Price_Comp3,
                t.Price_Comp4,
                t.Flight_Month,
                CASE WHEN t.Flight_Weekday = 1 THEN 1 ELSE 0 END,
                CASE WHEN t.Flight_Weekday = 2 THEN 1 ELSE 0 END,
                CASE WHEN t.Flight_Weekday = 3 THEN 1 ELSE 0 END,
                CASE WHEN t.Flight_Weekday = 4 THEN 1 ELSE 0 END,
                CASE WHEN t.Flight_Weekday = 5 THEN 1 ELSE 0 END,
                CASE WHEN t.Flight_Weekday = 6 THEN 1 ELSE 0 END,
                CASE WHEN t.Flight_Weekday = 7 THEN 1 ELSE 0 END,
                t.Holiday_Indicator,
                CURRENT_DATE - t.flight_date
            ]::FLOAT8[],
            model.coef
        ) AS predicted_sales
    FROM pricing.to_be_priced_flights AS t
    INNER JOIN pricing.model_results AS model
        ON  model.RouteID = t.RouteID
        AND model.Class = t.Class
        AND model.Days_To_Flight = t.Days_To_Flight
    WHERE model.coef IS NOT NULL
)
DISTRIBUTED BY (RouteID);

 * postgresql://gpadmin:***@ec2-3-9-174-91.eu-west-2.compute.amazonaws.com:5432/gpadmin
Done.
Done.
Done.
60469668 rows affected.


[]

### 3_Input_For_QP

In [17]:
%%sql
ANALYZE pricing.scoring;

DROP TABLE IF EXISTS pricing.input_for_QP CASCADE;

-- Collapse the scoring rows into one row per (route, origin, destination,
-- class, flight date). The per-row price coefficients and predicted sales are
-- gathered into two arrays, both ordered by days_to_flight so that matching
-- positions refer to the same row of pricing.scoring.
CREATE TABLE pricing.input_for_QP AS
    SELECT
        routeID,
        Route_Origin,
        Route_Destination,
        class,
        flight_Date,
        array_agg(price_coef ORDER BY days_to_flight) AS price_coef_array,
        array_agg(predicted_sales ORDER BY days_to_flight) AS predicted_sales_array
    FROM pricing.scoring
    GROUP BY
        routeID,
        Route_Origin,
        Route_Destination,
        class,
        flight_Date
DISTRIBUTED BY (routeID);

 * postgresql://gpadmin:***@ec2-3-9-174-91.eu-west-2.compute.amazonaws.com:5432/gpadmin
Done.
Done.
4319262 rows affected.


[]

### 4_Optimize_Daily_Prices

In [18]:
%%sql
-- PL/R helper returning the larger of its two integer arguments.
-- The SQL parameters are unnamed, so the R body refers to them as arg1 and arg2.
-- STRICT means the function returns NULL without running when either input is NULL.
CREATE OR REPLACE FUNCTION pricing.r_max (integer, integer) RETURNS integer AS $$
return(max(arg1, arg2))
$$ LANGUAGE plr STRICT;

 * postgresql://gpadmin:***@ec2-3-9-174-91.eu-west-2.compute.amazonaws.com:5432/gpadmin
Done.


[]

In [19]:
%%sql

-- Solve one quadratic program per flight with the R package quadprog.
--
-- Arguments
--   a  price coefficients, one entry per array slot
--   b  predicted sales at zero own price, same length and order as a
--   c  upper bound on total sales over all slots (presumably the seat
--      capacity of the cabin class - confirm against the caller)
--
-- With predicted sales per slot written as b_i + a_i * p_i, the function
-- maximises sum(p_i * (b_i + a_i * p_i)) subject to
--   sum(b_i + a_i * p_i) <= c      total sales stay within the bound
--   b_i + a_i * p_i      >= 0      no slot has negative sales
-- Every a_i must be negative, otherwise Dmat is not positive definite and
-- solve.QP raises an error.
--
-- Returns the vector p of optimal prices, same length as a.
--
-- The name is schema-qualified because callers invoke pricing.r_solve_QP, and
-- the problem size is taken from the input instead of being fixed at 14.
CREATE OR REPLACE FUNCTION pricing.r_solve_QP(a float8[], b float8[], c integer)
RETURNS float8[] AS
$$
    library(quadprog)

    # Problem size comes from the input arrays
    n <- length(a)
    if (n == 0 || length(b) != n) stop("r_solve_QP needs non-empty arrays a and b of equal length")

    # Alias for the third argument, whose name shadows the base function c
    # R still resolves the call c(...) below to the base function
    capacity <- c

    # solve.QP minimises -t(dvec) p + 1/2 t(p) Dmat p
    Dmat <- diag(-2 * a, nrow = n)
    dvec <- b

    # Constraints have the form t(Amat) p >= bvec
    #   column 1          total sales bound
    #   columns 2 to n+1  per-slot non-negative sales
    Amat <- cbind(-a, diag(a, nrow = n))
    bvec <- c(-capacity + sum(b), -b)

    qp <- solve.QP(Dmat, dvec, Amat, bvec = bvec)

    return(qp$solution)
$$
LANGUAGE plr;

 * postgresql://gpadmin:***@ec2-3-9-174-91.eu-west-2.compute.amazonaws.com:5432/gpadmin
Done.


[]

In [20]:
%%sql
ANALYZE pricing.input_for_qp;

DROP TABLE IF EXISTS pricing.optimal_prices CASCADE;

-- Run the quadratic program once per row of pricing.input_for_qp.
-- The third argument depends on the cabin class: 200 for Economy,
-- 15 for Business and 10 for every other class.
CREATE TABLE pricing.optimal_prices AS
SELECT
    routeid,
    Route_Origin,
    Route_Destination,
    "class",
    flight_date,
    pricing.r_solve_QP(
        price_coef_array,
        predicted_sales_array,
        CASE
            WHEN class = 'Economy'  THEN 200
            WHEN class = 'Business' THEN 15
            ELSE 10
        END
    ) AS optimal_prices
FROM pricing.input_for_qp
DISTRIBUTED BY (routeid, "class", flight_date);

 * postgresql://gpadmin:***@ec2-3-9-174-91.eu-west-2.compute.amazonaws.com:5432/gpadmin
Done.
Done.
4319262 rows affected.


[]

### 5_Query_Optimized_Prices

In [21]:
%%sql
-- Random sample of ten optimal prices for MIA to JFK flights with a flight
-- date in November 2011, one output row per element of optimal_prices.
SELECT
    Route_Origin,
    Route_Destination,
    class,
    flight_date,
    -- Take the index from the array itself rather than from a hard-coded
    -- 14-element list, so index and price always advance in lockstep.
    -- NOTE(review): position i is reported as days_to_flight = i, which assumes
    -- the days_to_flight values behind the array are 1..n without gaps.
    generate_subscripts(optimal_prices, 1) AS days_to_flight,
    unnest(optimal_prices) AS optimal_pricing
FROM
    pricing.optimal_prices
WHERE
    route_origin = 'MIA'
    AND route_destination = 'JFK'
    AND flight_date BETWEEN '2011-11-01' AND '2011-11-30'
ORDER BY random()
LIMIT 10;

 * postgresql://gpadmin:***@ec2-3-9-174-91.eu-west-2.compute.amazonaws.com:5432/gpadmin
10 rows affected.


route_origin,route_destination,class,flight_date,days_to_flight,optimal_pricing
MIA,JFK,Business,2011-11-21,10,2022.37972092919
MIA,JFK,First-Class,2011-11-01,10,2497.68119706834
MIA,JFK,First-Class,2011-11-21,14,4264.26838910404
MIA,JFK,Economy,2011-11-11,12,870.028995177174
MIA,JFK,First-Class,2011-11-04,13,3223.02987724705
MIA,JFK,Economy,2011-11-22,4,1069.25507710269
MIA,JFK,Economy,2011-11-05,12,869.535459127087
MIA,JFK,First-Class,2011-11-29,6,3271.26679863205
MIA,JFK,First-Class,2011-11-04,2,5439.98201382688
MIA,JFK,First-Class,2011-11-30,7,5812.33670345398


In [22]:
%%sql
-- Random sample of ten optimal Business-class prices for MIA to JFK flights
-- with a flight date in November 2011, one output row per element of
-- optimal_prices.
SELECT
    Route_Origin,
    Route_Destination,
    class,
    flight_date,
    -- Take the index from the array itself rather than from a hard-coded
    -- 14-element list, so index and price always advance in lockstep.
    -- NOTE(review): position i is reported as days_to_flight = i, which assumes
    -- the days_to_flight values behind the array are 1..n without gaps.
    generate_subscripts(optimal_prices, 1) AS days_to_flight,
    unnest(optimal_prices) AS optimal_pricing
FROM
    pricing.optimal_prices
WHERE
    route_origin = 'MIA'
    AND route_destination = 'JFK'
    AND flight_date BETWEEN '2011-11-01' AND '2011-11-30'
    AND class = 'Business'
ORDER BY random()
LIMIT 10;

 * postgresql://gpadmin:***@ec2-3-9-174-91.eu-west-2.compute.amazonaws.com:5432/gpadmin
10 rows affected.


route_origin,route_destination,class,flight_date,days_to_flight,optimal_pricing
MIA,JFK,Business,2011-11-01,14,1462.57407417378
MIA,JFK,Business,2011-11-03,8,2183.54275509869
MIA,JFK,Business,2011-11-20,13,1357.77358046094
MIA,JFK,Business,2011-11-05,2,1698.48492665979
MIA,JFK,Business,2011-11-02,8,1580.90243872161
MIA,JFK,Business,2011-11-18,14,1979.78032788524
MIA,JFK,Business,2011-11-12,7,1479.4620434693
MIA,JFK,Business,2011-11-27,3,1492.48106117764
MIA,JFK,Business,2011-11-24,10,1662.26617314433
MIA,JFK,Business,2011-11-08,3,2016.36000142424
