In [13]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer
from IPython.core.display import HTML

In [14]:
# Load the working dataset that all models in this notebook are estimated on.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
data = pd.read_pickle("data/data_work.pkl")

In [15]:
# Store weekday as a categorical so the formula interface expands it into
# per-level dummy terms (weekday[T.1], ...) instead of one numeric regressor.
data["weekday"] = data["weekday"].astype('category')

In [16]:
# Nested logit specifications for price_increase, ordered from the baseline
# to the richest model; each one adds a term to the previous specification.
# In the formula syntax `days_since * loading_factor` contributes both main
# effects plus the `days_since:loading_factor` interaction.
formulas = [
    # (1) baseline: load factor, days_since, route
    "price_increase ~ loading_factor + days_since + route",
    # (2) + interaction between days_since and load factor
    "price_increase ~ loading_factor + days_since * loading_factor + route",
    # (3) + weekday
    "price_increase ~ loading_factor + days_since * loading_factor + route + weekday",
    # (4) + peak_hour
    "price_increase ~ loading_factor + days_since * loading_factor + route + weekday + peak_hour",
    # (5) + sales_prev_day
    "price_increase ~ loading_factor + days_since * loading_factor + route + weekday + peak_hour + sales_prev_day",
]

# Keep the individual names available for cells that refer to one specification.
formula_1, formula_2, formula_3, formula_4, formula_5 = formulas

In [24]:
def logit_estimate(formulas, data, days=(0, 30)):
    """Fit one logit model per formula on a ``days_till_dep`` window.

    Standard errors are clustered on ``train_id``.

    Parameters
    ----------
    formulas : iterable of str
        Model formulas handed to ``smf.logit``. Each formula string is also
        used as the key of its fitted result in the returned dict.
    data : pandas.DataFrame
        Must contain ``days_till_dep`` and ``train_id`` in addition to the
        variables named in the formulas.
    days : sequence of two ints, default (0, 30)
        Half-open window: a row is kept when ``days_till_dep`` is in
        ``range(days[0], days[1])``, so ``days[1]`` itself is excluded.

    Returns
    -------
    dict
        Maps each formula string to the object returned by ``model.fit``.
    """
    start, stop = days[0], days[1]

    # The estimation sample is the same for every formula, so build it once
    # instead of re-filtering the frame on each iteration.
    subset = data.loc[data["days_till_dep"].isin(range(start, stop))]

    output_dict = {}
    for formula in formulas:
        model = smf.logit(formula, data=subset)
        # row_labels are the index labels of the rows the model actually kept,
        # so the cluster groups line up with the estimation rows. Looking them
        # up in `subset` (not the full frame) keeps rows outside the window
        # from leaking in.
        groups = subset.loc[model.data.row_labels, "train_id"]
        output_dict[formula] = model.fit(
            cov_type="cluster",
            cov_kwds={"groups": groups},
        )

    return output_dict

## 30 days

In [26]:
# Fit all specifications on rows with days_till_dep in range(0, 30), i.e. 0-29.
out = logit_estimate(formulas, data, days=[0,30])

Optimization terminated successfully.
         Current function value: 0.336835
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.336703
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.336211
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.334603
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.332756
         Iterations 7


In [27]:
# Put the fitted models held in `out` side by side in one regression table.
stargazer = Stargazer(list(out.values()))
stargazer.rename_covariates({"Intercept": "Constant"})

# Render the table once and reuse it for both the saved file and the cell
# output (the table was previously rendered twice).
output_1 = HTML(stargazer.render_html())
html = output_1.data

with open("regression_outputs/logit_output_30days.html", "w") as f:
    f.write(html)

output_1

0,1,2,3,4,5
,,,,,
,Dependent variable:price_increase,Dependent variable:price_increase,Dependent variable:price_increase,Dependent variable:price_increase,Dependent variable:price_increase
,,,,,
,(1),(2),(3),(4),(5)
,,,,,
Constant,-4.167***,-3.811***,-3.718***,-3.620***,-3.889***
,(0.098),(0.180),(0.189),(0.188),(0.190)
days_since,0.037***,0.020**,0.019**,0.018**,0.012
,(0.004),(0.008),(0.008),(0.008),(0.008)
days_since:loading_factor,,0.044**,0.047**,0.049**,0.048**


## last 5 days

In [29]:
# Distribution of sales_prev_day among rows with days_till_dep below 5.
data.loc[data["days_till_dep"] < 5, "sales_prev_day"].value_counts()

True     4754
False     448
Name: sales_prev_day, dtype: int64

In [30]:
# Re-fit all specifications on rows with days_till_dep in range(0, 5), i.e. 0-4.
# This rebinds `out`, so the table cell below reports these fits.
out = logit_estimate(formulas, data, days=[0,5])

Optimization terminated successfully.
         Current function value: 0.545135
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.544982
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.541076
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.539536
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.538468
         Iterations 6


In [31]:
# Put the fitted models held in `out` side by side in one regression table.
stargazer = Stargazer(list(out.values()))
stargazer.rename_covariates({"Intercept": "Constant"})

# Render the table once and reuse it for both the saved file and the cell output.
output_1 = HTML(stargazer.render_html())
html = output_1.data

# Write to a file of its own: this cell used to write to
# logit_output_30days.html and so overwrote the 30-day table.
with open("regression_outputs/logit_output_last5days.html", "w") as f:
    f.write(html)

output_1

0,1,2,3,4,5
,,,,,
,Dependent variable:price_increase,Dependent variable:price_increase,Dependent variable:price_increase,Dependent variable:price_increase,Dependent variable:price_increase
,,,,,
,(1),(2),(3),(4),(5)
,,,,,
Constant,-4.910***,-8.271***,-8.430***,-8.478***,-8.736***
,(0.802),(3.041),(3.049),(3.058),(3.065)
days_since,0.101***,0.221**,0.223**,0.226**,0.220**
,(0.030),(0.108),(0.109),(0.109),(0.109)
days_since:loading_factor,,-0.199,-0.202,-0.211,-0.204


## 5-29 days

In [34]:
# Re-fit all specifications on rows with days_till_dep in range(5, 30), i.e. 5-29.
# This rebinds `out`, so the table cell below reports these fits.
out = logit_estimate(formulas, data, days=[5,30])

Optimization terminated successfully.
         Current function value: 0.289634
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.289633
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.289342
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.287715
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.285759
         Iterations 7


In [35]:
# Put the fitted models held in `out` side by side in one regression table.
stargazer = Stargazer(list(out.values()))
stargazer.rename_covariates({"Intercept": "Constant"})

# Render the table once and reuse it for both the saved file and the cell output.
output_1 = HTML(stargazer.render_html())
html = output_1.data

# Write to a file of its own: this cell used to write to
# logit_output_30days.html and so overwrote the 30-day table.
with open("regression_outputs/logit_output_5to30days.html", "w") as f:
    f.write(html)

output_1

0,1,2,3,4,5
,,,,,
,Dependent variable:price_increase,Dependent variable:price_increase,Dependent variable:price_increase,Dependent variable:price_increase,Dependent variable:price_increase
,,,,,
,(1),(2),(3),(4),(5)
,,,,,
Constant,-4.065***,-4.036***,-3.905***,-3.798***,-4.124***
,(0.119),(0.209),(0.219),(0.218),(0.220)
days_since,0.018***,0.016,0.016,0.016,0.015
,(0.005),(0.012),(0.012),(0.012),(0.012)
days_since:loading_factor,,0.005,0.006,0.005,-0.003


## Fixed effect

In [39]:
from linearmodels import PanelOLS

In [38]:
# Build the panel index for PanelOLS: level 0 (train_id) is the entity and
# level 1 (days_till_dep) is the time dimension.
panel_data = data.set_index(["train_id","days_till_dep"])

In [41]:
# Inspection only: list the columns available in the flat frame.
data.columns

Index(['train_id', 'days_till_dep', 'mean_econ_price', 'mean_first_price',
       'max_econ_seats', 'min_econ_seats', 'mean_econ_seats',
       'max_first_seats', 'min_first_seats', 'mean_first_seats', 'departure',
       'duration', 'econ_seats_sold', 'first_seats_sold', 'weekday',
       'depart_month', 'depart_hour', 'day_part', 'price_change',
       'price_change_percentage', 'price_change_direction', 'price_increase',
       'seats_sold_prev_day', 'sales_prev_day', 'max_seat_capacity',
       'capacity', 'loading_factor', 'route', 'peak_hour', 'days_since',
       'loading_factor_median', 'loading_factor_cat'],
      dtype='object')

In [96]:
# Two-way fixed-effects regression of the percentage price change on the load
# factor over the whole panel, with standard errors clustered by entity.
# No observation weights are passed.
fe_lm = (
    PanelOLS
    .from_formula(
        "price_change_percentage ~ 1 + loading_factor + TimeEffects + EntityEffects",
        panel_data,
    )
    .fit(cov_type="clustered", cluster_entity=True)
)

fe_lm.summary

0,1,2,3
Dep. Variable:,price_change_percentage,R-squared:,0.0103
Estimator:,PanelOLS,R-squared (Between):,-0.5153
No. Observations:,30336,R-squared (Within):,0.0139
Date:,"Mon, Dec 14 2020",R-squared (Overall):,-0.0029
Time:,15:40:01,Log-likelihood,5.463e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,302.82
Entities:,1209,P-value,0.0000
Avg Obs:,25.092,Distribution:,"F(1,29096)"
Min Obs:,13.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Intercept,-0.0205,0.0018,-11.635,0.0000,-0.0240,-0.0171
loading_factor,0.0629,0.0045,13.868,0.0000,0.0540,0.0718


In [97]:
# Same two-way fixed-effects specification, restricted to observations with
# days_till_dep > 5; standard errors clustered by entity, no weights.
# NOTE(review): the logit section splits the sample at days_till_dep < 5 / >= 5,
# whereas this cell uses > 5 — confirm which cut-off is intended.
fe_lm = (
    PanelOLS
    .from_formula(
        "price_change_percentage ~ 1 + loading_factor + TimeEffects + EntityEffects",
        panel_data.query("days_till_dep >5"),
    )
    .fit(cov_type="clustered", cluster_entity=True)
)

fe_lm.summary

  if is_categorical(s):


0,1,2,3
Dep. Variable:,price_change_percentage,R-squared:,0.0060
Estimator:,PanelOLS,R-squared (Between):,-0.6022
No. Observations:,23976,R-squared (Within):,-0.0140
Date:,"Mon, Dec 14 2020",R-squared (Overall):,-0.0430
Time:,15:40:12,Log-likelihood,4.49e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,137.36
Entities:,1209,P-value,0.0000
Avg Obs:,19.831,Distribution:,"F(1,22742)"
Min Obs:,8.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Intercept,-0.0263,0.0036,-7.2416,0.0000,-0.0334,-0.0192
loading_factor,0.0817,0.0105,7.7448,0.0000,0.0610,0.1024


In [60]:
# Inspection only: train_id and days_till_dep now live in the index, so they
# are no longer listed among the columns.
panel_data.columns

Index(['mean_econ_price', 'mean_first_price', 'max_econ_seats',
       'min_econ_seats', 'mean_econ_seats', 'max_first_seats',
       'min_first_seats', 'mean_first_seats', 'departure', 'duration',
       'econ_seats_sold', 'first_seats_sold', 'weekday', 'depart_month',
       'depart_hour', 'day_part', 'price_change', 'price_change_percentage',
       'price_change_direction', 'price_increase', 'seats_sold_prev_day',
       'sales_prev_day', 'max_seat_capacity', 'capacity', 'loading_factor',
       'route', 'peak_hour', 'days_since', 'loading_factor_median',
       'loading_factor_cat'],
      dtype='object')

In [108]:
# Two-way fixed-effects specification on observations with days_till_dep <= 5;
# standard errors clustered by entity.
# NOTE(review): the logit section's "last 5 days" sample uses days_till_dep < 5,
# whereas this cell uses <= 5 — confirm which cut-off is intended.
fe_lm = (
    PanelOLS
    .from_formula(
        "price_change_percentage ~ 1 + loading_factor + EntityEffects + TimeEffects",
        panel_data.query("days_till_dep <=5"),
    )
    .fit(cov_type="clustered", cluster_entity=True)
)

fe_lm.summary

# Recorded from this fit:
#   F-test for Poolability: 0.9034
#   P-value: 0.9867
# Open question: should the entity effect be dropped?

  if is_categorical(s):


0,1,2,3
Dep. Variable:,price_change_percentage,R-squared:,0.0128
Estimator:,PanelOLS,R-squared (Between):,-0.2876
No. Observations:,6360,R-squared (Within):,0.0210
Date:,"Mon, Dec 14 2020",R-squared (Overall):,-0.0288
Time:,15:47:02,Log-likelihood,1.074e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,66.875
Entities:,1209,P-value,0.0000
Avg Obs:,5.2605,Distribution:,"F(1,5145)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Intercept,-0.0452,0.0107,-4.2412,0.0000,-0.0661,-0.0243
loading_factor,0.1022,0.0191,5.3596,0.0000,0.0648,0.1396


In [106]:
# Variant for days_till_dep <= 5 without EntityEffects: keeps TimeEffects and
# adds sales_prev_day, peak_hour, weekday and route as regressors.
# Standard errors are still clustered by entity.
fe_lm = (
    PanelOLS
    .from_formula(
        "price_change_percentage ~ 1 + loading_factor + sales_prev_day + peak_hour + weekday + route + TimeEffects",
        panel_data.query("days_till_dep <=5"),
    )
    .fit(cov_type="clustered", cluster_entity=True)
)

fe_lm.summary

  if is_categorical(s):


0,1,2,3
Dep. Variable:,price_change_percentage,R-squared:,0.0208
Estimator:,PanelOLS,R-squared (Between):,0.0842
No. Observations:,6360,R-squared (Within):,0.0123
Date:,"Mon, Dec 14 2020",R-squared (Overall):,0.0246
Time:,15:44:33,Log-likelihood,1.018e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,8.4208
Entities:,1209,P-value,0.0000
Avg Obs:,5.2605,Distribution:,"F(16,6338)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Intercept,-0.0147,0.0041,-3.5780,0.0003,-0.0227,-0.0066
sales_prev_day[T.True],0.0094,0.0025,3.7350,0.0002,0.0045,0.0144
weekday[T.1],0.0035,0.0018,1.9298,0.0537,-5.504e-05,0.0070
weekday[T.2],-0.0010,0.0017,-0.5816,0.5609,-0.0043,0.0024
weekday[T.3],-0.0013,0.0020,-0.6677,0.5044,-0.0053,0.0026
weekday[T.4],0.0030,0.0022,1.3795,0.1678,-0.0013,0.0072
weekday[T.5],0.0035,0.0023,1.4823,0.1383,-0.0011,0.0080
weekday[T.6],-0.0060,0.0023,-2.6824,0.0073,-0.0104,-0.0016
route[T.CORDOBAMADRID],0.0024,0.0021,1.1735,0.2406,-0.0016,0.0064
