In [None]:
import pandas as pd
import numpy as np

In [None]:
# Execute the two companion notebooks before anything else in this notebook runs.
# NOTE(review): later cells use names that are never defined here (`dataset`, `nyc`, `sao`);
# presumably these notebooks create them — confirm.
%run data_cleaning.ipynb
%run data_exploration.ipynb

# (Data cleaning)

- Create a column for the booking date and convert to datetime (day_scrap, hour_scrap)

In [None]:
# Month abbreviation and four-digit year that are combined with day_scrap / hour_scrap
# to build the booking timestamp.
# pd.Timestamp is used instead of the datetime module because `datetime` is only
# imported in a later cell; on a fresh kernel this cell would otherwise depend on
# hidden state. pd.Timestamp is a datetime subclass, so strftime behaves the same.
# NOTE(review): both values are taken from the moment the notebook runs, so re-running
# in a later month than the scrape mislabels the dates — consider pinning them.
cur_time = pd.Timestamp.now()
month = cur_time.strftime('%b')
year = cur_time.strftime('%Y')

In [None]:
# Assemble "<day> <Mon> <YYYY> <hour>" strings from the scrape day/hour columns plus the
# month and year computed above, then parse them into a datetime column.
dataset['date'] = pd.to_datetime(
    dataset['day_scrap'].astype(str)
    + ' ' + month
    + ' ' + year
    + ' ' + dataset['hour_scrap'].astype(str),
    format='%d %b %Y %H',
)

- Bar chart
- Box plot
- Line plot
- Scatter plot
- Heatmap

# Data visualization

## Setting-up

In [None]:
from plotnine import *
import matplotlib.pyplot as plt 
# import matplotlib.dates as mdates
# from mizani.formatters import percent_format
import seaborn as sns 
import datetime
import warnings

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning) that may point at real problems.
warnings.filterwarnings("ignore")

- Outliers in `ticket_price` (values of 6000 or more) are removed from the dataset before visualization, as they may significantly distort the statistics.

In [None]:
# Keep only rows whose ticket price is strictly below 6000, then renumber the rows from zero.
dataset = dataset[dataset['ticket_price'] < 6000].reset_index(drop=True)

- Calculate total duration of round trip

In [None]:
# Total duration of the round trip: out_duration plus in_duration, row by row.
# The sum is assigned as-is so pandas pairs it with `dataset` by row label. Resetting the
# index of the right-hand side first would attach values to the wrong rows (or produce
# NaN) whenever `dataset` does not carry a default 0..n-1 index.
dataset['tot_duration'] = dataset["out_duration"] + dataset["in_duration"]

## a. Bar chart

#### a1. Number of flights by destination and airline company

In [None]:
# Count flights per (destination, outbound airline) pair.
barplot1 = (
    dataset
    .groupby(["arr_city", "out_airline_company"])
    .size()
    .reset_index(name="N")
)
# Total flights per destination, repeated on each of its rows, and each airline's share of it.
barplot1["group_count"] = barplot1.groupby("arr_city")["N"].transform("sum")
barplot1["percent"] = barplot1["N"] / barplot1["group_count"]
barplot1

In [None]:
# Grouped bar chart: one cluster per destination, one bar per airline, height = number of flights.
f1 = ggplot(barplot1, aes(x="arr_city", y="N", fill="out_airline_company"))
f1 = f1 + geom_bar(stat="identity", position="dodge", width=0.6, size=0.5)
f1 = f1 + labs(x="Destination", y="Number of flights", fill="Airline company")
# Tick labels are assigned by position.
# NOTE(review): assumes the arr_city values sort as GRU then JFK — confirm.
f1 = f1 + scale_x_discrete(labels=["São Paulo", "New York City"])
f1 = f1 + theme_bw()
f1

#### a2. Minimum ticket prices based on the day of the week (NYC)

In [None]:
# Flights arriving at JFK, reduced to the columns needed for the weekday price chart.
barplot2 = dataset[dataset['arr_city'] == 'JFK'].filter(
    items=['date', 'ticket_price', 'out_airline_company']
)

In [None]:
# Weekday name of each booking timestamp, read from barplot2's own `date` column rather
# than relying on index alignment with the unfiltered `dataset`.
# day_name() returns English names by default regardless of the system locale, whereas
# strftime('%A') is locale-dependent and would not match the English weekday_order
# used to build the categorical afterwards.
barplot2['day_of_week'] = barplot2['date'].dt.day_name()

In [None]:
# Calendar order for the weekday labels; names outside this list become NaN.
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
# Ordered categorical so grouping and plotting follow Monday..Sunday instead of the alphabet.
barplot2['day_of_week'] = pd.Categorical(
    barplot2['day_of_week'],
    categories=weekday_order,
    ordered=True,
)

In [None]:
# Display the JFK subset, now including the ordered day_of_week column, for a visual check.
barplot2

In [None]:
# Cheapest ticket per (weekday, airline) pair, flattened back into regular columns.
min_prices = (
    barplot2
    .groupby(['day_of_week', 'out_airline_company'])['ticket_price']
    .min()
    .reset_index()
)

In [None]:
# Bar chart of the minimum ticket price per weekday, split by airline.
# Bars are drawn side by side ("dodge"): stacking would pile the two airlines' minimum
# prices on top of each other, so the bar height would show a sum that is not a price.
f1 = (
    ggplot(min_prices, aes(x='day_of_week', y="ticket_price", fill='out_airline_company'))
    + geom_bar(stat="identity", position="dodge", width=0.8, size=0.9)
    + labs(x="Day of the week", y="Ticket price", fill="Airline")
    # Legend labels are assigned by position.
    # NOTE(review): assumes the airline values sort as Lufthansa then Swiss — confirm.
    + scale_fill_discrete(labels=["Lufthansa", "Swiss"])
    + theme_bw()
)
f1

#### a3. Minimum ticket prices based on the day of the week (São Paulo)

In [None]:
# Flights arriving at GRU, reduced to the columns needed for the weekday price chart.
barplot3 = dataset[dataset['arr_city'] == 'GRU'].filter(
    items=['date', 'ticket_price', 'out_airline_company']
)

In [None]:
# Weekday name of each booking timestamp, read from barplot3's own `date` column rather
# than relying on index alignment with the unfiltered `dataset`.
# day_name() returns English names by default regardless of the system locale, whereas
# strftime('%A') is locale-dependent and would not match the English weekday_order
# used to build the categorical afterwards.
barplot3['day_of_week'] = barplot3['date'].dt.day_name()

In [None]:
# Calendar order for the weekday labels; names outside this list become NaN.
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
# Ordered categorical so grouping and plotting follow Monday..Sunday instead of the alphabet.
barplot3['day_of_week'] = pd.Categorical(
    barplot3['day_of_week'],
    categories=weekday_order,
    ordered=True,
)

In [None]:
# Display the GRU subset, now including the ordered day_of_week column, for a visual check.
barplot3

In [None]:
# Cheapest ticket per (weekday, airline) pair, flattened back into regular columns.
# This rebinds min_prices, replacing whatever an earlier cell stored under that name.
min_prices = (
    barplot3
    .groupby(['day_of_week', 'out_airline_company'])['ticket_price']
    .min()
    .reset_index()
)

In [None]:
# Bar chart of the minimum ticket price per weekday, split by airline.
# Bars are drawn side by side ("dodge"): stacking would pile the two airlines' minimum
# prices on top of each other, so the bar height would show a sum that is not a price.
f1 = (
    ggplot(min_prices, aes(x='day_of_week', y="ticket_price", fill='out_airline_company'))
    + geom_bar(stat="identity", position="dodge", width=0.8, size=0.9)
    + labs(x="Day of the week", y="Ticket price", fill="Airline")
    # Legend labels are assigned by position.
    # NOTE(review): assumes the airline values sort as Lufthansa then Swiss — confirm.
    + scale_fill_discrete(labels=["Lufthansa", "Swiss"])
    + theme_bw()
)
f1

## b. Box plot

In [None]:
# Split the `nyc` and `sao` frames by outbound airline; each subset feeds one box plot.
# NOTE(review): `nyc` and `sao` are not created in this notebook (presumably by one of the
# %run notebooks), so the ticket_price < 6000 filter applied to `dataset` may not have
# been applied to them — confirm.
lufthansa_nyc = nyc[nyc['out_airline_company'] == 'Lufthansa']
swiss_nyc = nyc[nyc['out_airline_company'] == 'Swiss']
lufthansa_sao = sao[sao['out_airline_company'] == 'Lufthansa']
swiss_sao = sao[sao['out_airline_company'] == 'Swiss']

##### b1. boxplots for New York City

In [None]:
# Lufthansa: box plot of ticket prices built from lufthansa_nyc.
f2 = ggplot(lufthansa_nyc, aes(y='ticket_price', x='arr_city'))
f2 = f2 + geom_boxplot(color="blue", size=0.5, width=0.1, alpha=0.5)
f2 = f2 + labs(x="Cities", y="Prices")
f2 = f2 + theme_bw()

# Error-bar layer computed from the box-plot statistics.
f2 = f2 + stat_boxplot(geom="errorbar", width=0.05, size=0.5, color='blue')

# Only this final expression is rendered; the red mean marker is not stored back into f2.
f2 + stat_summary(fun_data="mean_se", geom="point", size=4, color="red", fill="red")

In [None]:
# Swiss: box plot of ticket prices built from swiss_nyc.
f3 = ggplot(swiss_nyc, aes(y='ticket_price', x='arr_city'))
f3 = f3 + geom_boxplot(color="blue", size=0.5, width=0.1, alpha=0.5)
f3 = f3 + labs(x="Cities", y="Prices")
f3 = f3 + theme_bw()

# Error-bar layer computed from the box-plot statistics.
f3 = f3 + stat_boxplot(geom="errorbar", width=0.05, size=0.5, color='blue')

# Only this final expression is rendered; the red mean marker is not stored back into f3.
f3 + stat_summary(fun_data="mean_se", geom="point", size=4, color="red", fill="red")

##### b2. boxplots for São Paulo destination

In [None]:
# Lufthansa: box plot of ticket prices built from lufthansa_sao.
f4 = ggplot(lufthansa_sao, aes(y='ticket_price', x='arr_city'))
f4 = f4 + geom_boxplot(color="blue", size=0.5, width=0.1, alpha=0.5)
f4 = f4 + labs(x="Cities", y="Prices")
f4 = f4 + theme_bw()

# Error-bar layer computed from the box-plot statistics.
f4 = f4 + stat_boxplot(geom="errorbar", width=0.05, size=0.5, color='blue')

# Only this final expression is rendered; the red mean marker is not stored back into f4.
f4 + stat_summary(fun_data="mean_se", geom="point", size=4, color="red", fill="red")

In [None]:
# Swiss: box plot of ticket prices built from swiss_sao.
f5 = ggplot(swiss_sao, aes(y='ticket_price', x='arr_city'))
f5 = f5 + geom_boxplot(color="blue", size=0.5, width=0.1, alpha=0.5)
f5 = f5 + labs(x="Cities", y="Prices")
f5 = f5 + theme_bw()

# Error-bar layer computed from the box-plot statistics.
f5 = f5 + stat_boxplot(geom="errorbar", width=0.05, size=0.5, color='blue')

# Only this final expression is rendered; the red mean marker is not stored back into f5.
f5 + stat_summary(fun_data="mean_se", geom="point", size=4, color="red", fill="red")

## c. Lineplot

In [None]:
# Columns needed for the price-over-time plots.
lineplot = dataset.filter(
    items=['date', 'arr_city', 'out_airline_company', 'ticket_price']
)
# Booking day as a zero-padded "day month" label, e.g. "05 Mar".
lineplot['day_booking'] = lineplot['date'].dt.strftime('%d %b')


In [None]:
# Display the line-plot frame to check the new day_booking column.
lineplot

#### c1. Lineplot for New York City based on booking day

In [None]:
# Cheapest JFK ticket per (booking day, airline) pair, as a flat frame.
lineplot_nyc = (
    lineplot[lineplot['arr_city'] == 'JFK']
    .groupby(['day_booking', 'out_airline_company'])['ticket_price']
    .min()
    .reset_index()
)

In [None]:
# One row per booking day and one column per airline, so both price lists are guaranteed
# to line up with date_nyc. Taking the dates from the Lufthansa rows only would pair Swiss
# prices with the wrong days — or make the plot call fail on a length mismatch — whenever
# the two airlines do not cover exactly the same booking days.
# A day missing for one airline becomes NaN, which matplotlib draws as a gap.
prices_by_day_nyc = (
    lineplot_nyc
    .pivot(index='day_booking', columns='out_airline_company', values='ticket_price')
    # Guarantee both columns exist even if one airline has no rows at all.
    .reindex(columns=['Lufthansa', 'Swiss'])
)
date_nyc = prices_by_day_nyc.index.tolist()
price_lufthansa_nyc = prices_by_day_nyc['Lufthansa'].tolist()
price_swiss_nyc = prices_by_day_nyc['Swiss'].tolist()

In [None]:
# Cheapest fare per booking day for New York City: Lufthansa on the left y-axis,
# Swiss on the right y-axis.
fig, ax1 = plt.subplots(figsize=(20, 15))

ax1.plot(date_nyc, price_lufthansa_nyc, "-s", color='k')
ax1.set_ylabel("Lufthansa price [€]", color='k')
# Rotate the date labels once, on the axes that actually draws the x-axis.
ax1.tick_params(axis='x', labelrotation=90)

# Second y-axis sharing the same x-axis, so each airline keeps its own price scale.
ax2 = ax1.twinx()
ax2.plot(date_nyc, price_swiss_nyc, "-s", color="royalblue")
ax2.set_ylabel("Swiss price [€]", color="royalblue")

# The plotted series are per-day minimum prices, so the title says "minimum", not "average".
ax1.set_title('Flight (minimum) prices based on booking date');

#### (draft) Lineplot for New York City based on time and day of booking

In [None]:
# Draft: min / mean / max JFK ticket price per (airline, booking timestamp).
lineplot_nyc = (
    dataset[dataset['arr_city'] == 'JFK']
    .groupby(['out_airline_company', 'date'])['ticket_price']
    .agg(['min', 'mean', 'max'])
    .reset_index()
)

# Replace the timestamps with "day month hour AM/PM" text labels for the x-axis.
lineplot_nyc['date'] = lineplot_nyc['date'].dt.strftime('%d %b %I %p')

In [None]:
# Preview the first rows of the aggregated JFK frame.
lineplot_nyc.head()

In [None]:
# Check the column dtypes; `date` holds formatted strings after the strftime conversion.
lineplot_nyc.dtypes

In [None]:
# Draft: x labels are taken from the Lufthansa rows; the price lists hold each airline's mean fare.
# NOTE(review): the Swiss list is later plotted against the Lufthansa labels — confirm
# both airlines cover exactly the same timestamps.
date_nyc = lineplot_nyc.loc[lineplot_nyc['out_airline_company'] == 'Lufthansa', 'date'].tolist()
price_lufthansa_nyc = lineplot_nyc.loc[lineplot_nyc['out_airline_company'] == 'Lufthansa', 'mean'].tolist()
price_swiss_nyc = lineplot_nyc.loc[lineplot_nyc['out_airline_company'] == 'Swiss', 'mean'].tolist()

In [None]:
# Draft: mean JFK fare per booking timestamp — Lufthansa on the left y-axis, Swiss on the right.
fig, ax1 = plt.subplots(figsize=(20, 15))

# Left axis (black): Lufthansa.
ax1.plot(date_nyc, price_lufthansa_nyc, color='k')
plt.xticks(rotation=90)
ax1.set_ylabel("Lufthansa price [€]", color='k')

# Right axis (blue): Swiss, on its own price scale but the same x-axis.
ax2 = ax1.twinx()
ax2.plot(date_nyc, price_swiss_nyc, color="royalblue")
ax2.set_ylabel("Swiss price [€]", color="royalblue")

plt.xticks(rotation=90)
plt.title('Flight (average) prices based on booking date');

#### c2. Lineplot for São Paulo based on booking day

In [None]:
# Cheapest GRU ticket per (booking day, airline) pair, as a flat frame.
lineplot_sao = (
    lineplot[lineplot['arr_city'] == 'GRU']
    .groupby(['day_booking', 'out_airline_company'])['ticket_price']
    .min()
    .reset_index()
)

In [None]:
# One row per booking day and one column per airline, so both price lists are guaranteed
# to line up with date_sao. Taking the dates from the Lufthansa rows only would pair Swiss
# prices with the wrong days — or make the plot call fail on a length mismatch — whenever
# the two airlines do not cover exactly the same booking days.
# A day missing for one airline becomes NaN, which matplotlib draws as a gap.
prices_by_day_sao = (
    lineplot_sao
    .pivot(index='day_booking', columns='out_airline_company', values='ticket_price')
    # Guarantee both columns exist even if one airline has no rows at all.
    .reindex(columns=['Lufthansa', 'Swiss'])
)
date_sao = prices_by_day_sao.index.tolist()
price_lufthansa_sao = prices_by_day_sao['Lufthansa'].tolist()
price_swiss_sao = prices_by_day_sao['Swiss'].tolist()

In [None]:
# Cheapest fare per booking day for São Paulo: Lufthansa on the left y-axis,
# Swiss on the right y-axis.
fig, ax1 = plt.subplots(figsize=(20, 15))

ax1.plot(date_sao, price_lufthansa_sao, "-s", color='k')
ax1.set_ylabel("Lufthansa price [€]", color='k')
# Rotate the date labels once, on the axes that actually draws the x-axis.
ax1.tick_params(axis='x', labelrotation=90)

# Second y-axis sharing the same x-axis, so each airline keeps its own price scale.
ax2 = ax1.twinx()
ax2.plot(date_sao, price_swiss_sao, "-s", color="royalblue")
ax2.set_ylabel("Swiss price [€]", color="royalblue")

# The plotted series are per-day minimum prices, so the title says "minimum", not "average".
ax1.set_title('Flight (minimum) prices based on booking date');

#### (draft) Lineplot for São Paulo based on time and day of booking


In [None]:
# Draft: mean GRU ticket price per (airline, booking timestamp); the result column is named "mean".
lineplot_sao = (
    dataset[dataset['arr_city'] == 'GRU']
    .groupby(['out_airline_company', 'date'])['ticket_price']
    .agg(['mean'])
    .reset_index()
)
# Replace the timestamps with "day month hour AM/PM" text labels for the x-axis.
lineplot_sao['date'] = lineplot_sao['date'].dt.strftime('%d %b %I %p')

In [None]:
# Preview the first rows of the aggregated GRU frame.
lineplot_sao.head()

In [None]:
# Draft: x labels are taken from the Lufthansa rows; the price lists hold each airline's mean fare.
# NOTE(review): the Swiss list is later plotted against the Lufthansa labels — confirm
# both airlines cover exactly the same timestamps.
date_sao = lineplot_sao.loc[lineplot_sao['out_airline_company'] == 'Lufthansa', 'date'].tolist()
price_lufthansa_sao = lineplot_sao.loc[lineplot_sao['out_airline_company'] == 'Lufthansa', 'mean'].tolist()
price_swiss_sao = lineplot_sao.loc[lineplot_sao['out_airline_company'] == 'Swiss', 'mean'].tolist()

In [None]:
# Draft: mean GRU fare per booking timestamp — Lufthansa on the left y-axis, Swiss on the right.
fig, ax1 = plt.subplots(figsize=(20, 15))

# Left axis (black): Lufthansa.
ax1.plot(date_sao, price_lufthansa_sao, color='k')
plt.xticks(rotation=90)
ax1.set_ylabel("Lufthansa price [€]", color='k')

# Right axis (blue): Swiss, on its own price scale but the same x-axis.
ax2 = ax1.twinx()
ax2.plot(date_sao, price_swiss_sao, color="royalblue")
ax2.set_ylabel("Swiss price [€]", color="royalblue")

plt.xticks(rotation=90)
plt.title('Flight (average) prices based on booking date');

## d. Scatter plot

##### d1. scatter plot for New York City

In [None]:
# Mean JFK ticket price for every (airline, total round-trip duration) combination.
scatterplot_nyc = (
    dataset[dataset['arr_city'] == 'JFK']
    .groupby(['out_airline_company', 'tot_duration'])['ticket_price']
    .mean()
    .reset_index(name='price')
)
scatterplot_nyc

In [None]:
# Scatter plot of mean price against total duration for JFK, coloured by airline.
f6 = ggplot(scatterplot_nyc, aes(x='tot_duration', y='price', color='out_airline_company'))
f6 = f6 + geom_point()
f6 = f6 + labs(
    title='Cost vs duration of the flight',
    x='Flight duration',
    y='Flight Price',
    color='Airline company',
)
f6 = f6 + theme_bw()
f6

##### d2. scatter plot for São Paulo

In [None]:
# Mean GRU ticket price for every (airline, total round-trip duration) combination.
scatterplot_sao = (
    dataset[dataset['arr_city'] == 'GRU']
    .groupby(['out_airline_company', 'tot_duration'])['ticket_price']
    .mean()
    .reset_index(name='price')
)
scatterplot_sao.head()

In [None]:
# Scatter plot of mean price against total duration for GRU, coloured by airline.
f7 = ggplot(scatterplot_sao, aes(x='tot_duration', y='price', color='out_airline_company'))
f7 = f7 + geom_point()
f7 = f7 + labs(
    title='Cost vs duration of the flight',
    x='Flight duration',
    y='Flight Price',
    color='Airline company',
)
f7 = f7 + theme_bw()
f7

## e. Heatmap

In [None]:
# Columns that go into the correlation heat map.
# .copy() makes this an independent frame, so the column assignments below do not write
# into a slice of `dataset`.
heatmap = dataset[['ticket_price', 'tot_duration','out_dep_time','out_arr_time','in_dep_time','in_arr_time','date','out_stop_num','in_stop_num']].copy()

# Parse each "HH:MM" time column from ITS OWN values. Previously in_dep_time and
# in_arr_time were both overwritten with the parsed out_arr_time, which discarded the
# return-leg times and left three identical columns.
for time_col in ('out_dep_time', 'out_arr_time', 'in_dep_time', 'in_arr_time'):
    heatmap[time_col] = pd.to_datetime(heatmap[time_col], format='%H:%M')

heatmap

In [None]:
# Pairwise correlations of the heatmap columns, rounded to two decimals and drawn as an
# annotated seaborn heat map on a 16x6 figure.
# NOTE(review): whether the datetime columns take part in corr() depends on the pandas
# version — confirm they appear in the matrix.
sns.set(rc={'figure.figsize': (16, 6)})
correlation_matrix = heatmap.corr().round(decimals=2)
sns.heatmap(data=correlation_matrix, annot=True)
plt.show()