In [8]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np 
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt


In [9]:
# Read the CSV into `data`, the DataFrame every later cell works on.
# The path is relative, so it is resolved against the kernel's current working directory.
data = pd.read_csv("../data/preprocessed_data.csv")

In [10]:
# Show the column labels of the loaded frame; the plots below refer to columns by these exact names.
data.columns

Index(['Airline', 'Source', 'Destination', 'Duration', 'stops', 'class',
       'depature time', 'arrival time', 'Price', 'Date', 'Season'],
      dtype='object')

# Distribution des vols

In [11]:

# Number of rows per airline (value_counts orders them from most to least frequent).
airline_counts = data['Airline'].value_counts()

# Bar chart of those counts; the x tick labels are rotated to vertical in the same chain.
fig = px.bar(
    airline_counts,
    x=airline_counts.index,
    y=airline_counts.values,
    title='Nombre des vols par compagnie aérienne',
    labels={'Airline': 'Compagnies aériennes', 'y': 'Nombre de vols'},
    color_discrete_sequence=['lightblue'],
    template='plotly_dark',
).update_layout(xaxis_tickangle=-90)

fig.show()


In [12]:
# Number of rows per airline.
airline_counts = data['Airline'].value_counts()

# Minimum number of rows an airline needs to get its own slice.
# This is an absolute count (it is compared against value_counts()), not a percentage.
min_flights_per_airline = 100

# Split airlines into those that reach the minimum and the remainder, which is summed into one bucket.
is_major = airline_counts >= min_flights_per_airline
major_airlines = airline_counts[is_major]
other_airlines_count = airline_counts[~is_major].sum()

# Name the aggregated slice directly: px `labels` renames columns, not category values,
# so a labels={'Other': ...} mapping would never change the slice name.
grouped_airlines = pd.concat([major_airlines, pd.Series({'Other Airlines': other_airlines_count})])

# Donut chart (hole=0.3) of each airline's share of the rows.
fig = px.pie(
    values=grouped_airlines,
    names=grouped_airlines.index,
    title='Distribution des compagnies aériennes dans le jeux de données ',
    hole=0.3,
    template='plotly_dark',
)

fig.show()

# Distribution des escales

In [15]:
# Number of rows for each distinct value of the 'stops' column.
stop_counts = data['stops'].value_counts()

# Bar chart of those counts, with a linear x axis that places a tick at every step of 1.
fig = px.bar(
    stop_counts,
    x=stop_counts.index,
    y=stop_counts.values,
    title='Nombres des escales par vol',
    labels={'stops': 'Les escales', 'y': 'Distribution'},
    color_discrete_sequence=['lightblue'],
    template='plotly_dark',
).update_xaxes(dtick=1, tickmode='linear')

fig.show()


# Dashboard des prix 

In [None]:
# Mean 'Price' for every (Airline, class) pair, flattened back into ordinary columns.
mean_price_by_airline_class = data.groupby(['Airline', 'class'])['Price'].mean()
bar_chart = mean_price_by_airline_class.reset_index()

# One bar segment per class, coloured by class, for each airline.
fig = px.bar(
    bar_chart,
    x='Airline',
    y='Price',
    color='class',
    title="Tarifs moyens des billets pour différentes compagnies aériennes selon la classe",
    labels={'Airline': 'Compagnie aériennes', 'Price': 'Prix moyen', 'class': 'Classe'},
    template='plotly_dark',
)

fig.show()

In [None]:
import plotly.express as px
# Mean 'Price' for every (class, Season) pair, flattened back into ordinary columns.
mean_price_by_class_season = data.groupby(['class', 'Season'])['Price'].mean()
bar_chart = mean_price_by_class_season.reset_index()

# Grouped bars: the seasons sit side by side within each class.
fig = px.bar(
    bar_chart,
    x='class',
    y='Price',
    color='Season',
    barmode='group',
    title='Prix moyen par classe de vol et saison',
    labels={'class': 'Classe de vol', 'Price': 'Prix moyen', 'Season': 'Saison'},
    template='plotly_dark',
)

fig.show()


In [None]:
# Parse the 'Date' column into datetimes. This overwrites the column in `data` itself.
data['Date'] = pd.to_datetime(data['Date'])

# Mean 'Price' per calendar month, then back to a two-column frame (Date, Price).
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of 'ME' — confirm against the installed version.
monthly_mean_price = data.resample('M', on='Date')['Price'].mean()
line_chart = monthly_mean_price.reset_index()

# Line chart of the monthly means, with one x tick per month formatted as "<abbreviated month>\n<year>".
line_chart_fig = px.line(
    line_chart,
    x='Date',
    y='Price',
    title="Prix moyen tout au long de l'année",
    labels={'Date': 'Date', 'Price': 'Prix moyen'},
    template="plotly_dark",
)
line_chart_fig.update_xaxes(tickformat="%b\n%Y", dtick="M1")
line_chart_fig.show()

In [None]:
# Box plots of 'Price' per departure slot, coloured by season, with one facet column per arrival slot.
#
# Plotly Express keys both `labels` and `category_orders` by DataFrame column name.
# Keying them by the French display text leaves the axis titles and category order untouched,
# so the real column names are used here ("depature time" is the column's actual spelling).
ordered_columns = ("depature time", "arrival time", "Season")

fig = px.box(
    data,
    x="depature time",
    y="Price",
    color="Season",
    facet_col="arrival time",
    title="Répartition des prix par heure de départ, heure d’arrivée et saison",
    labels={
        "Price": "Prix",
        "depature time": "Heure de départ",
        "arrival time": "Heure d'arrivée",
        "Season": "Saison",
    },
    # Sort each categorical column's distinct values; missing values are dropped first
    # because they cannot be ordered against strings.
    category_orders={
        column: sorted(data[column].dropna().unique()) for column in ordered_columns
    },
    template="plotly_dark",
)

# Pin the legend to the top-left corner of the plotting area and fix the figure width.
fig.update_layout(
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01
    ),
    width=1045
)

fig.show()

In [None]:
# Mean 'Price' for every (Source, Destination) pair.
# The aggregation is computed here rather than reusing `bar_chart` from an earlier cell:
# that frame is grouped by class/Season and has no 'Source' or 'Destination' column.
route_prices = data.groupby(['Source', 'Destination'])['Price'].mean().reset_index()

# One bubble per route: x = source, y and bubble size = mean price, colour = destination.
fig = px.scatter(route_prices, x='Source', y='Price', size='Price', color='Destination',
                 labels={'Source': 'Source', 'Price': 'Prix Moyen', 'Destination': 'Destination'},
                 title='Comparer les prix des vols pour différentes routes',
                 template='plotly_dark')

fig.show()


In [33]:
# Two-level sunburst: inner ring = 'Source', outer ring = 'Destination';
# each sector is sized by the 'Duration' values of its rows.
sunburst_options = {
    'path': ['Source', 'Destination'],
    'values': 'Duration',
    'title': 'Répartition des durées de vol en fonction de la source et de la destination',
    'labels': {'Source': 'Source', 'Duration': 'Flight Duration', 'Destination': 'Destination'},
    'template': 'plotly_dark',
}
fig = px.sunburst(data, **sunburst_options)

fig.show()

# Stops Analysis Dashboard

In [None]:
# Scatter of 'Price' against 'Duration' for every row of `data`, coloured by the 'stops' column.
fig_avg_price_duration_stops = px.scatter(
    data,
    x='Duration',
    y='Price',
    color='stops',
    title='Average Price Based on Duration and Stops',
    labels={'Duration': 'Durée', 'Price': 'Prix moyen', 'stops': 'Nombre des escales'},
    template="plotly_dark",
)

# Render the figure.
fig_avg_price_duration_stops.show()

# Route Mapping 

In [16]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

# Geocode a sequence of locations, asking the geocoder only for entries missing from the cache.
def batch_geocode_with_cache(locations, geolocator, cache):
    """Return the geocoding result for every entry of ``locations``, in order.

    Args:
        locations: Iterable of hashable values handed to ``geolocator.geocode``.
        geolocator: Object exposing a ``geocode(location)`` method.
        cache: Mutable mapping from location to result. It is read before each
            lookup and updated in place with whatever ``geocode`` returns
            (including ``None``).

    Returns:
        A list with one element per input location. A lookup that raises
        ``GeocoderTimedOut`` contributes ``None`` and is not stored in the
        cache, so the same location is tried again the next time it appears.
    """
    def resolve(place):
        # Serve repeated locations from the cache without touching the geocoder.
        if place in cache:
            return cache[place]
        try:
            found = geolocator.geocode(place)
        except GeocoderTimedOut:
            return None
        cache[place] = found
        return found

    return [resolve(place) for place in locations]

# Geocode the 'Source' and 'Destination' values of `data` and store the coordinates in four new
# columns: Source_lat, Source_lon, Destination_lat, Destination_lon.
#
# The coordinate columns are written only after both lookups have finished. No empty-string
# placeholder columns are created up front, so a failure during geocoding leaves `data` untouched
# instead of leaving it with blank coordinate columns.

# Geocoder client and the cache shared by both lookups, so a location that appears as both
# a source and a destination is requested only once.
geolocator = Nominatim(user_agent="myApp")
cache = {}

# One entry per row; repeated values are served from the cache by batch_geocode_with_cache.
source_locations = data['Source'].tolist()
destination_locations = data['Destination'].tolist()

source_results = batch_geocode_with_cache(source_locations, geolocator, cache)
destination_results = batch_geocode_with_cache(destination_locations, geolocator, cache)

# A falsy result (e.g. None for a location that was not found or timed out) becomes a missing coordinate.
data['Source_lat'] = [result.latitude if result else None for result in source_results]
data['Source_lon'] = [result.longitude if result else None for result in source_results]
data['Destination_lat'] = [result.latitude if result else None for result in destination_results]
data['Destination_lon'] = [result.longitude if result else None for result in destination_results]


In [28]:
import plotly.graph_objects as go

# Source whose outgoing routes are drawn on the map; change this value to inspect another source.
source_selected = 'PAR'

# Keep only the rows departing from the selected source.
selected_data = data[data['Source'] == source_selected]
if selected_data.empty:
    # Fail with a clear message instead of an opaque error further down.
    raise ValueError(f"No rows found in data for source {source_selected!r}")

# Hover text shown on both marker traces: one "Prix: <price>" label per row, computed once.
price_labels = selected_data['Price'].map("Prix: {}".format)

fig = go.Figure()

# One marker trace for the source coordinates (blue) and one for the destination coordinates (red).
for endpoint, marker_color in (('Source', 'blue'), ('Destination', 'red')):
    fig.add_trace(
        go.Scattergeo(
            lon=selected_data[f'{endpoint}_lon'],
            lat=selected_data[f'{endpoint}_lat'],
            mode='markers',
            marker=dict(
                size=8,
                opacity=0.8,
                color=marker_color
            ),
            text=price_labels,
            name=endpoint
        )
    )

# Build one source -> destination segment per distinct route. Each segment ends with None so the
# line is broken there; without the breaks the trace would chain destination to destination.
# Routes with a missing coordinate are skipped because they cannot be drawn.
coordinate_columns = ['Source_lon', 'Source_lat', 'Destination_lon', 'Destination_lat']
routes = selected_data[coordinate_columns].drop_duplicates().dropna()

route_lon, route_lat = [], []
for src_lon, src_lat, dst_lon, dst_lat in routes.itertuples(index=False, name=None):
    route_lon += [src_lon, dst_lon, None]
    route_lat += [src_lat, dst_lat, None]

fig.add_trace(
    go.Scattergeo(
        lon=route_lon,
        lat=route_lat,
        mode='lines',
        line=dict(width=2, color='black'),  # route line styling
        hoverinfo='none',
        showlegend=False
    )
)

# Figure layout: title naming the selected source, land/lake colours, and the lat/lon window.
fig.update_layout(
    title_text=f'Prix des trajets depuis {source_selected} vers toutes les destinations',
    showlegend=True,
    geo=dict(
        resolution=50,
        showland=True,
        showlakes=True,
        landcolor='rgb(204, 204, 204)',
        countrycolor='rgb(204, 204, 204)',
        lakecolor='rgb(255, 255, 255)',
        projection_type='equirectangular',
        coastlinewidth=2,
        lataxis=dict(
            range=[20, 60],
            showgrid=True,
            dtick=10
        ),
        lonaxis=dict(
            range=[-100, 20],
            showgrid=True,
            dtick=20
        ),
    )
)

# Render the figure.
fig.show()
