# Data Visualization

### Setting-up

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import matplotlib.dates as mdates
import datetime

%run data_cleaning.ipynb

In [None]:
# City-based names for the two frames. These are aliases, not copies: columns
# assigned on nyc/sao below are also visible through df1/df2.
# NOTE(review): df1/df2 are presumably defined by the data_cleaning.ipynb run above — confirm.
nyc, sao = df1, df2

In [None]:
# tot_duration is the row-wise sum of the out_duration and in_duration columns.
nyc['tot_duration'] = nyc["out_duration"].add(nyc["in_duration"])
sao['tot_duration'] = sao["out_duration"].add(sao["in_duration"])

In [None]:
# Month name joined with day_scrap / hour_scrap when building timestamps below;
# it is parsed with the '%b' directive.
month = "May"

In [None]:
# Build a "day month hour" timestamp from the scrape columns, then store it as
# a display string such as "05 May 14 h".
nyc['date'] = pd.to_datetime(
    nyc['day_scrap'].astype(str) + ' ' + month + ' ' + nyc['hour_scrap'].astype(str),
    format='%d %b %H',
).dt.strftime('%d %b %H h')

In [None]:
# List the column labels of the NYC frame.
nyc.columns

In [None]:
# Show the NYC frame, which by now also carries the tot_duration and date columns.
nyc

In [None]:
# Build a "day month hour" timestamp from the scrape columns, then store it as
# a display string such as "05 May 14 h".
sao['date'] = pd.to_datetime(
    sao['day_scrap'].astype(str) + ' ' + month + ' ' + sao['hour_scrap'].astype(str),
    format='%d %b %H',
).dt.strftime('%d %b %H h')

In [None]:
# Split the NYC rows by airline: everything that is not Lufthansa vs. Lufthansa.
swiss_air_nyc = nyc[nyc["in_airline_company"].ne("Lufthansa")]
lufthansa_air_nyc = nyc[nyc["in_airline_company"].eq("Lufthansa")]

In [None]:
# Split the São Paulo rows by airline: everything that is not Lufthansa vs. Lufthansa.
swiss_air_sao = sao[sao["in_airline_company"].ne("Lufthansa")]
lufthansa_air_sao = sao[sao["in_airline_company"].eq("Lufthansa")]

`in_airline_company` and `out_airline_company` have the same value for any given row,
so describing either `in_airline_company` or `out_airline_company` gives the same result. The summary shows:
* the count (the number of rows in the dataframe),
* the number of unique values (2: Swiss or Lufthansa),
* the top (the most frequent value in the dataframe), and
* the frequency (the number of occurrences of the top value).

In [None]:
# Summary of the airline column over all NYC rows.
nyc["in_airline_company"].describe()

In [None]:
# Summary of the airline column over all São Paulo rows.
sao["in_airline_company"].describe()

In [None]:
# Summary statistics of ticket_price over all NYC rows.
nyc["ticket_price"].describe()

In [None]:
# Summary statistics of ticket_price for the non-Lufthansa NYC rows.
swiss_air_nyc["ticket_price"].describe()

In [None]:
# Summary statistics of ticket_price for the Lufthansa NYC rows.
lufthansa_air_nyc["ticket_price"].describe()

In [None]:
# Summary statistics of ticket_price for the non-Lufthansa São Paulo rows.
swiss_air_sao["ticket_price"].describe()

In [None]:
# Summary statistics of ticket_price for the Lufthansa São Paulo rows.
lufthansa_air_sao["ticket_price"].describe()

## Line Plot

In [None]:
def plot_df(df, month=month, ylim=(500, 2500), title=None):
    """Plot the min-max band and the mean of ticket prices per scrape time.

    Rows are grouped by ``day_scrap`` and ``hour_scrap``. For every group the
    minimum, mean and maximum ``ticket_price`` are computed; the min-max range
    is drawn as a shaded band and the mean as a line with square markers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``day_scrap``, ``hour_scrap`` and ``ticket_price``
        columns.
    month : str, optional
        Month name joined with day and hour to build the x-axis timestamps
        (parsed with ``%b``). The default is the notebook-level ``month`` as
        it was when this cell was executed.
    ylim : tuple of (float, float) or None, optional
        Lower and upper y-axis limits. Defaults to the formerly hard-coded
        ``(500, 2500)``; pass ``None`` to let matplotlib autoscale so prices
        outside that range are not clipped.
    title : str or None, optional
        Title of the plot; no title is set when ``None``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    stats = (
        df.groupby(['day_scrap', 'hour_scrap'])['ticket_price']
        .agg(['min', 'mean', 'max'])
        .reset_index()
    )
    # The format carries no year: only day, month and hour are parsed, and the
    # axis formatter below shows exactly those fields.
    stats['date'] = pd.to_datetime(
        stats['day_scrap'].astype(str) + ' ' + month + ' ' + stats['hour_scrap'].astype(str),
        format='%d %b %H',
    )

    # Draw on the current Axes (as the pyplot calls did) but through the
    # explicit Axes interface.
    ax = plt.gca()
    ax.fill_between(stats['date'], stats['min'], stats['max'], alpha=0.3)
    ax.plot(stats['date'], stats['mean'], "-s")
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Date")
    ax.set_ylabel("Price [€]")
    if title is not None:
        ax.set_title(title)

    ax.xaxis.set_major_formatter(mdates.DateFormatter('%d %b %H h'))
    ax.tick_params(axis="x", labelrotation=90)

    plt.show()

In [None]:
def line_plot(df=nyc):
    """Draw mean ``ticket_price`` against ``tot_duration``, one line per airline.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Must provide ``ticket_price``, ``tot_duration`` and
        ``out_airline_company``. The default is the ``nyc`` frame as bound
        when this cell was executed.
    """
    wanted = ["ticket_price", "tot_duration", "out_airline_company"]
    sns.lineplot(
        data=df[wanted],
        x='tot_duration',
        y='ticket_price',
        hue='out_airline_company',
        estimator='mean',
    )

1. Line plots for NYC destination

In [None]:
# Price band and mean over scrape time for the non-Lufthansa NYC rows.
plot_df(df = swiss_air_nyc)

In [None]:
# Price band and mean over scrape time for the Lufthansa NYC rows.
plot_df(df = lufthansa_air_nyc)

In [None]:
# No argument: line_plot falls back to its default frame, nyc.
line_plot()

2. Line plots for São Paulo destination

In [None]:
# Price band and mean over scrape time for the non-Lufthansa São Paulo rows.
plot_df(df = swiss_air_sao)

In [None]:
# Price band and mean over scrape time for the Lufthansa São Paulo rows.
plot_df(df = lufthansa_air_sao)

In [None]:
# Mean ticket price against total duration per airline, for the São Paulo frame.
line_plot(sao)

## Bar Chart

In [None]:
def plot_data(df=nyc):
    """Return the plotting columns of ``df``, sorted by ``date`` descending.

    Two derived columns are added: ``destination`` (``arr_city`` with the
    codes ``JFK`` / ``GRU`` replaced by city names) and ``airline_company``
    (a copy of ``out_airline_company``). They are built on a copy, so the
    frame passed in — including the default ``nyc`` — is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Must provide ``arr_city``, ``out_airline_company``, ``date``,
        ``ticket_price`` and ``tot_duration``. The default is the ``nyc``
        frame as bound when this cell was executed.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``ticket_price``, ``airline_company``,
        ``tot_duration`` and ``destination``.
    """
    # assign() returns a new frame instead of writing columns into the caller's.
    enriched = df.assign(
        destination=df['arr_city'].replace({'JFK': 'New York City', 'GRU': 'São Paulo'}),
        airline_company=df["out_airline_company"],
    )
    data = enriched[["date", "ticket_price", "airline_company", "tot_duration", "destination"]]
    # NOTE(review): in this notebook `date` is a '%d %b %H h' string, so this
    # sort is lexicographic — confirm that is still intended if the data ever
    # spans more than one month.
    return data.sort_values(by='date', ascending=False)

In [None]:
def bar_chart(plot_data=plot_data()):
    """Horizontal bar chart of mean ``ticket_price`` per ``date`` and airline.

    Parameters
    ----------
    plot_data : pandas.DataFrame, optional
        Must provide ``ticket_price``, ``date`` and ``airline_company``.
        The default is computed once, when this cell is executed, by calling
        the notebook-level ``plot_data()`` function; inside this function the
        name ``plot_data`` refers to the frame argument.
    """
    bar_style = dict(capsize=.4, errcolor=".5", linewidth=3, edgecolor=".5")
    sns.barplot(
        data=plot_data,
        x='ticket_price',
        y='date',
        hue='airline_company',
        estimator='mean',
        **bar_style,
    )

In [None]:
# Stack the plotting frames of both destinations (NYC first, then São Paulo).
df_merge = pd.concat([plot_data(frame) for frame in (nyc, sao)])

1. NYC

In [None]:
# Mean ticket price per scrape date and airline for the NYC frame.
bar_chart(plot_data(nyc))

2. São Paulo

In [None]:
# Mean ticket price per scrape date and airline for the São Paulo frame.
bar_chart(plot_data(sao))

## Scatter Plot

In [None]:
def scatter_plot(data=plot_data()):
    """Scatter ``ticket_price`` (x) against ``airline_company`` (y).

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Must provide ``ticket_price`` and ``airline_company``. The default
        is computed once, when this cell is executed, via ``plot_data()``.
    """
    prices = data["ticket_price"]
    airlines = data["airline_company"]
    sns.scatterplot(x=prices, y=airlines)

1. NYC

In [None]:
# Ticket prices per airline for the NYC frame.
scatter_plot(plot_data(nyc))

2. São Paulo

In [None]:
# Ticket prices per airline for the São Paulo frame.
scatter_plot(plot_data(sao))

In [None]:
# Ticket prices per airline over the merged frame, coloured by destination.
sns.scatterplot(data=df_merge, x='ticket_price', y='airline_company', hue='destination')

## Heatmap

In [None]:
def heatmap(data=plot_data(), y_axis: str = "date", x_axis: str = 'airline_company', val: str = 'ticket_price'):
    """Heatmap of the mean of ``val`` for every (``y_axis``, ``x_axis``) pair.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to pivot. The default is computed once, when this cell is
        executed, via ``plot_data()``.
    y_axis : str, optional
        Column whose values become the heatmap rows (shown in descending
        index order).
    x_axis : str, optional
        Column whose values become the heatmap columns.
    val : str, optional
        Column that is averaged within each cell.
    """
    pivoted = data.pivot_table(index=y_axis, columns=x_axis, values=val, aggfunc='mean')
    sns.heatmap(pivoted.sort_index(ascending=False))

1. NYC

In [None]:
# Mean ticket price per scrape date (rows) and airline (columns) for the NYC frame.
heatmap(plot_data(nyc))

2. São Paulo

In [None]:
# Mean ticket price per scrape date (rows) and airline (columns) for the São Paulo frame.
heatmap(plot_data(sao))

3. Merge dataframes nyc and sao

In [None]:
# Mean ticket price per airline (rows) and destination (columns) over the merged frame.
heatmap(data= df_merge, y_axis="airline_company", x_axis="destination")