In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
## Load the raw Fizzy transactions (one JSON document per line, no dtype inference)
df = pd.read_json('../data/fizzy.json', lines=True, dtype=False)


def _hexToInt(value):
    """Parse a hexadecimal string into an int; missing values (NaN/None) become None."""
    return None if pd.isnull(value) else int(str(value), 16)


def _eventHexToInt(value):
    """Parse a hexadecimal event parameter into an int; the literal string "None" becomes None."""
    return None if value == "None" else int(str(value), 16)


# --- Cleaning ---------------------------------------------------------------
# Every regex below is a raw string (same pattern text as before, without
# invalid-escape warnings) and every str.replace states `regex=` explicitly,
# because the pandas default for that argument changed from True to False.

# Costs: '0.' is prepended to the first space-separated token of actualCost;
# the USD amount is characters 2..5 of the third token.
# NOTE(review): assumes a fixed textual layout of actualCost — confirm against the data.
df['actualCostEth'] = pd.to_numeric('0.' + df.actualCost.str.split(' ').str[0])
df['actualCostUsd'] = pd.to_numeric(df.actualCost.str.split(' ').str[2].str.slice(2,6,1))
df['blockHeight'] = pd.to_numeric(df.blockHeight)
df['gasLimit'] = pd.to_numeric(df.gasLimit)
df['gasPrice'] = pd.to_numeric('0.' + df.gasPrice.str.split(' ').str[0])
df['gasUsed'] = pd.to_numeric(df.gasUsed.str.split(' ').str[0])

# Timestamp: keep the text between the outermost parentheses, then parse it.
df['timeStamp'] = df.timeStamp.str.extract(r'\((.*)\)', expand=False)
df['timeStamp'] = pd.to_datetime(df.timeStamp, format='%b-%d-%Y %I:%M:%S %p +%Z')

# Flatten inputData to one line so the `.*` in the patterns below can span it.
# NOTE(review): the character class also strips '|' characters — confirm intended.
df['inputData'] = df.inputData.str.replace(r'[\r|\n]', '', regex=True)
df['functionCalled'] = df.inputData.str.extract(r'Function: (.*)\(', expand=False)

# NOTE(review): the leading [...] is a character class, not an alternation of
# the three function names, so it matches any single one of those letters.
# Left unchanged to keep the extracted values identical — confirm intent.
df['flightId'] = df.inputData.str.extract(
    r'[addNewInsurance|updateFlightStatus|manualInsuranceResolution].*\[0\]:\s\s(.{64})',
    expand=False)

# flightId is hex-encoded text; decode it once and split on the last '.'.
decodedFlightId = df.flightId.str.decode('hex').str.decode('utf-8')
flightIdParts = decodedFlightId.str.rsplit('.', n=1)
# Part before the last '.', with the remaining dots removed (literal match).
df['flightNo'] = flightIdParts.str.get(0).str.replace('.', '', regex=False)
# First 10 characters after the last '.', read as a Unix timestamp in seconds.
# Converted to numbers first: to_datetime(unit=...) on strings is deprecated.
df['departureDate'] = pd.to_datetime(
    pd.to_numeric(flightIdParts.str.get(1).str.slice(0,10,1)), unit='s')
del decodedFlightId, flightIdParts

# Arguments of addNewInsurance: 64 hex characters each.
df['limitArrivalTime'] = df.inputData.str.extract(r'addNewInsurance.*\[1\]:\s\s(.{64})', expand=False)
# Only the last 12 hex digits are parsed, and the result is read as milliseconds.
df['limitArrivalTime'] = pd.to_datetime(df.limitArrivalTime.str[-12:].apply(_hexToInt), unit='ms')
df['premium'] = df.inputData.str.extract(r'addNewInsurance.*\[2\]:\s\s(.{64})', expand=False)
df['premium'] = df.premium.apply(_hexToInt)
df['indemnity'] = df.inputData.str.extract(r'addNewInsurance.*\[3\]:\s\s(.{64})', expand=False)
df['indemnity'] = df.indemnity.apply(_hexToInt)
# productId is argument [4] of addNewInsurance or argument [2] of manualInsuranceResolution.
df['productId'] = df.inputData.str.extract(
    r'(?:addNewInsurance.*\[4\]:\s\s|manualInsuranceResolution.*\[2\]:\s\s)(.{64})',
    expand=False)

# Argument of updateFlightStatus.
df['actualArrivalTime'] = df.inputData.str.extract(r'updateFlightStatus.*\[1\]:\s\s(.{64})', expand=False)
df['actualArrivalTime'] = pd.to_datetime(df.actualArrivalTime.str[-12:].apply(_hexToInt), unit='ms')

# Argument of manualInsuranceResolution.
df['newStatusId'] = df.inputData.str.extract(r'manualInsuranceResolution.*\[1\]:\s\s(.{64})', expand=False)

# Only the first character of `value` is kept before the numeric conversion.
df['value'] = pd.to_numeric(df.value.str.slice(0,1,1))
# Drop the last two characters of the event name.
df['eventName'] = df.eventName.str[0:-2]
df['eventStatus'] = df.eventParam5.apply(_eventHexToInt)
# eventIndemnity is only filled for InsuranceUpdate rows; other rows get NaN
# through index alignment on assignment.
df['eventIndemnity'] = df[df['eventName'] == "InsuranceUpdate"]['eventParam4'].apply(_eventHexToInt)

# Remove the raw columns that have been parsed above or are not used afterwards.
df = df.drop(columns=['txReceiptStatus', 'inputData', 'actualCost',
                      'eventParam1', 'eventParam2', 'eventParam3',
                      'eventParam4', 'eventParam5'])

## Load the airline reference data (every column is read as text)
routes = pd.read_csv(
    '../data/routes.tsv',
    sep='\t',
    header=0,
    dtype=str,
)
# Disabled in the original notebook:
# routes = routes.drop_duplicates(subset='CallSign', keep='first')
airport = pd.read_csv(
    '../data/airports.csv',
    header=None,
    names=[
        'Airport ID', 'Name', 'City', 'Country', 'IATA', 'ICAO',
        'Latitude', 'Longitude', 'Altitude', 'Timezone', 'DST', 'Tz',
        'Type', 'Source',
    ],
    dtype=str,
)

## Match route and airport data to the flights present in the Fizzy data
# Attach the departure and arrival airport codes to each transaction's flight
# number, keeping only the flights that have a route entry.
a = (
    df[['flightNo']]
    .merge(
        routes[['CallSign', 'FromAirport_ICAO', 'ToAirport_ICAO']],
        how='left',
        left_on='flightNo',
        right_on='CallSign',
    )
    .dropna(axis=0, subset=['CallSign'])
)

# Departure airports: the departure code is matched against the IATA column
# and against the ICAO column, and both sets of hits are stacked.
b = pd.concat(
    [a.merge(airport, left_on='FromAirport_ICAO', right_on=codeColumn)
     for codeColumn in ('IATA', 'ICAO')],
    sort=False,
)
# Only the columns that describe the departure side get a new name.
b = (
    b.rename(columns={
        'City': 'fromCity',
        'Country': 'fromCountry',
        'Latitude': 'fromLat',
        'Longitude': 'fromLong',
        'Altitude': 'fromAlt',
    })
    .dropna(axis=0, subset=['fromCity'])
    .drop_duplicates(keep='first')
)

# Arrival airports: the arrival code is matched against the IATA column
# and against the ICAO column, and both sets of hits are stacked.
c = pd.concat(
    [a.merge(airport, left_on='ToAirport_ICAO', right_on=codeColumn)
     for codeColumn in ('IATA', 'ICAO')],
    sort=False,
)
# Only the columns that describe the arrival side get a new name.
c = (
    c.rename(columns={
        'City': 'toCity',
        'Country': 'toCountry',
        'Latitude': 'toLat',
        'Longitude': 'toLong',
        'Altitude': 'toAlt',
    })
    .dropna(axis=0, subset=['toCity'])
    .drop_duplicates(keep='first')
)

# Combine both sides into one frame holding departure and arrival per CallSign.
flights = (
    b[['CallSign', 'fromCity', 'fromLat', 'fromLong']]
    .merge(c[['CallSign', 'toCity', 'toLat', 'toLong']], on='CallSign', how='left')
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)
# The intermediate frames are no longer needed.
del a, b, c

# Geodesic distance of every route
from geopy import distance

def flightDist(row):
    """Return the geodesic distance in km, rounded to 2 decimals, for one route.

    Parameters
    ----------
    row : mapping with the keys 'fromLat', 'fromLong', 'toLat' and 'toLong'.

    Returns
    -------
    float
        The distance in kilometres, or NaN when any coordinate is missing.
        `flights` is built with a left merge, so a route without a matched
        arrival airport carries NaN arrival coordinates; those are reported
        as NaN here instead of being handed to geopy.
    """
    departure = (row['fromLat'], row['fromLong'])
    arrival = (row['toLat'], row['toLong'])
    if any(pd.isnull(value) for value in departure + arrival):
        return float('nan')
    return round(distance.geodesic(departure, arrival).km, 2)

flights['distance'] = flights.apply(flightDist, axis=1)

# Attach the route details to every Fizzy transaction.
# NOTE(review): `flights` is only de-duplicated on whole rows; if a CallSign
# appears on several rows there, this merge repeats the matching transactions
# — confirm that CallSign is unique in `flights`.
df = df.merge(
    flights[['CallSign', 'fromCity', 'toCity', 'distance',
             'fromLat', 'toLat', 'fromLong', 'toLong']],
    how='left',
    left_on='flightNo',
    right_on='CallSign',
)
# The coordinates were read as text; convert them to numbers.
for coordinateColumn in ('fromLat', 'fromLong', 'toLat', 'toLong'):
    df[coordinateColumn] = pd.to_numeric(df[coordinateColumn])


# B2B subset: every transaction whose productId appears in a transaction
# time-stamped on 26 July 2018 (the original comment states these are the
# B2B products).
# NOTE(review): isin() also matches NaN to NaN, so if any row of that day has
# no productId, every row without a productId ends up in dfB2B — confirm.
rangeB2B = df['timeStamp'].ge(pd.Timestamp('2018-07-26')) & df['timeStamp'].lt(pd.Timestamp('2018-07-27'))
isB2BProduct = df['productId'].isin(df.loc[rangeB2B, 'productId'])
dfB2B = df.loc[isB2BProduct]

# B2C subset: all remaining transactions.
dfB2C = df.loc[~isB2BProduct]


# Show the column names of each table
print(f'Fizzy : {df.columns.tolist()}\n'
      f'Routes : {routes.columns.tolist()}\n'
      f'Airport : {airport.columns.tolist()}\n'
      f'Flights : {flights.columns.tolist()}')

# Informations de base

In [None]:
# Compute the portfolio indicators (global, B2B and B2C) and print them.
# eventStatus codes, as used by the labels printed below:
# 1 on time, 2 late (claim), 3 cancelled by the customer, 4 cancelled by the
# company, 5 redirected, 6 diverted.


def _creationRows(frame):
    """Return the rows of `frame` whose eventName is 'InsuranceCreation'."""
    return frame[frame['eventName'] == 'InsuranceCreation']


def _statusIndemnities(frame, status):
    """Return the eventIndemnity values of the rows having the given eventStatus."""
    return frame[frame['eventStatus'] == status]['eventIndemnity']


def _summary(values):
    """Return (median, mean, population std with ddof=0, min, max) of a Series."""
    return values.median(), values.mean(), values.std(ddof=0), values.min(), values.max()


# Global
creations = _creationRows(df)
loss = _statusIndemnities(df, 2).sum()
premium = creations['premium'].sum()
nbContrat = df['productId'].nunique()
nbOnTime = _statusIndemnities(df, 1).count()
nbClaim = _statusIndemnities(df, 2).count()
nbUserCancelation = _statusIndemnities(df, 3).count()
nbCompanyCancelation = _statusIndemnities(df, 4).count()
nbRedirected = _statusIndemnities(df, 5).count()
nbDiverted = _statusIndemnities(df, 6).count()

medianPremium, meanPremium, stdPremium, minPremium, maxPremium = _summary(creations['premium'])
medianIndemnity, meanIndemnity, stdIndemnity, minIndemnity, maxIndemnity = _summary(creations['indemnity'])

# B2B
creationsB2B = _creationRows(dfB2B)
lossB2B = _statusIndemnities(dfB2B, 2).sum()
premiumB2B = creationsB2B['premium'].sum()
nbContratB2B = dfB2B['productId'].nunique()
nbClaimB2B = _statusIndemnities(dfB2B, 2).count()

medianPremiumB2B, meanPremiumB2B, stdPremiumB2B, minPremiumB2B, maxPremiumB2B = _summary(creationsB2B['premium'])
medianIndemnityB2B, meanIndemnityB2B, stdIndemnityB2B, minIndemnityB2B, maxIndemnityB2B = _summary(creationsB2B['indemnity'])

# B2C
creationsB2C = _creationRows(dfB2C)
lossB2C = _statusIndemnities(dfB2C, 2).sum()
premiumB2C = creationsB2C['premium'].sum()
nbContratB2C = dfB2C['productId'].nunique()
nbClaimB2C = _statusIndemnities(dfB2C, 2).count()

medianPremiumB2C, meanPremiumB2C, stdPremiumB2C, minPremiumB2C, maxPremiumB2C = _summary(creationsB2C['premium'])
medianIndemnityB2C, meanIndemnityB2C, stdIndemnityB2C, minIndemnityB2C, maxIndemnityB2C = _summary(creationsB2C['indemnity'])

# Sum of the USD cost of all transactions.
blockchainCost = df['actualCostUsd'].sum()

# Print the report. The B2B mean premium is now rounded to 2 decimals like
# the other means (the precision argument was missing).
message = (
    f"Cout des sinistres : \t Global : {loss!s}€ \t B2B : {lossB2B!s}€ \t B2C : {lossB2C!s}€\n"
    f"Primes perçues :\t Global : {premium!s}€ \t B2B : {premiumB2B!s}€ \t B2C : {premiumB2C!s}€\n"
    f"S/P : \t\t\t Global : {round(loss/premium*100,2)!s}% \t B2B : {round(lossB2B/premiumB2B*100,2)!s}% \t\t B2C : {round(lossB2C/premiumB2C*100,2)!s}%\n"
    f"Nombre de contrats :\t Global : {nbContrat!s} \t B2B : {nbContratB2B!s} \t\t B2C : {nbContratB2C!s}\n"
    f"Nombre de sinistre :\t Global : {nbClaim!s} \t\t B2B : {nbClaimB2B!s} \t\t B2C : {nbClaimB2C!s}\n"
    f"===========\n"
    f"Prime médiane :\t\t Global : {medianPremium!s}€ \t\t B2B : {medianPremiumB2B!s}€ \t\t B2C : {medianPremiumB2C!s}€\n"
    f"Prime moyenne :\t\t Global : {round(meanPremium,2)!s}€ \t B2B : {round(meanPremiumB2B,2)!s}€ \t\t B2C : {round(meanPremiumB2C,2)!s}€\n"
    f"Ecart type :\t\t Global : {round(stdPremium,2)!s}€ \t B2B : {round(stdPremiumB2B,2)!s}€ \t\t B2C : {round(stdPremiumB2C,2)!s}€\n"
    f"Prime minimum :\t\t Global : {minPremium!s}€ \t\t B2B : {minPremiumB2B!s}€ \t\t B2C : {minPremiumB2C!s}€\n"
    f"Prime maximum :\t\t Global : {maxPremium!s}€ \t B2B : {maxPremiumB2B!s}€ \t\t B2C : {maxPremiumB2C!s}€\n"
    f"===========\n"
    f"Garantie médiane :\t Global : {medianIndemnity!s}€ \t B2B : {medianIndemnityB2B!s}€ \t\t B2C : {medianIndemnityB2C!s}€\n"
    f"Garantie moyenne :\t Global : {round(meanIndemnity,2)!s}€ \t B2B : {round(meanIndemnityB2B,2)!s}€ \t\t B2C : {round(meanIndemnityB2C,2)!s}€\n"
    f"Ecart type :\t\t Global : {round(stdIndemnity,2)!s}€ \t B2B : {round(stdIndemnityB2B,2)!s}€ \t\t B2C : {round(stdIndemnityB2C,2)!s}€\n"
    f"Garantie minimum :\t Global : {minIndemnity!s}€ \t B2B : {minIndemnityB2B!s}€ \t\t B2C : {minIndemnityB2C!s}€\n"
    f"Garantie maximum :\t Global : {maxIndemnity!s}€ \t B2B : {maxIndemnityB2B!s}€ \t\t B2C : {maxIndemnityB2C!s}€\n"
    f"===========\n"
    f"Nombre de vols à l'heure : {nbOnTime}\n"
    f"Nombre de vols en retard : {nbClaim}\n"
    f"Nombre d'annulation client : {nbUserCancelation}\n"
    f"Nombre d'annulation companie : {nbCompanyCancelation}\n"
    f"Nombre de vol redirigé : {nbRedirected}\n"
    f"Nombre de vol détourné : {nbDiverted}\n"
    f"===========\n"
    f"Cout blockchain : {round(blockchainCost,2)!s}$"
)
print(message)

# Souscriptions au cours du temps

In [None]:
from datetime import datetime, timedelta

plt.figure(num=None, figsize=(18, 8), dpi=150, facecolor='w', edgecolor='k')

# Number of InsuranceCreation rows per calendar day of their timestamp.
s = df[df['eventName'] == 'InsuranceCreation']['timeStamp']
ts = s.groupby(s.dt.floor('D')).size().reset_index(name='count')

plt.plot(ts['timeStamp'], ts['count'])
plt.title("Souscription à Fizzy au court du temps (B2B + B2C)")
plt.xlabel("date", fontsize=14)
plt.ylabel("nombre de souscription/jour (log)", fontsize=14)
plt.yscale("log")
# One tick per week from the first day (the last day itself is excluded by arange).
t = np.arange(ts['timeStamp'].min(), ts['timeStamp'].max(), timedelta(weeks=1)).astype(datetime)
# The rotation must be a number: the string '-45' is rejected by current
# matplotlib, which only accepts numbers, 'vertical' or 'horizontal'.
plt.xticks(t, t, rotation=-45, ha='left', rotation_mode="anchor")
plt.grid(True, which='both')
plt.tight_layout()

plt.savefig('insuranceCreationTimestamp.png')

# Vols les plus fréquents

In [None]:
# Count the addNewInsurance calls per flight number (most frequent first)
# and attach the route details of each flight.
frequentFlight = (
    df.loc[df['functionCalled'] == 'addNewInsurance', 'flightNo']
    .value_counts()
    .rename_axis('flightNo')
    .reset_index(name='count')
    .merge(flights, how='left', left_on='flightNo', right_on='CallSign')
)
# Label-based slice: rows 0 through 10 inclusive.
frequentFlight.loc[:10, ['flightNo', 'count', 'fromCity', 'toCity', 'distance']]

# Heatmap des dates assurées

In [None]:
# Calendar heatmap of the number of rows of df per departure day.
# NOTE(review): the original comment mentions the planned arrival date, but
# the column used is departureDate.
import calmap

s = df['departureDate']
ts = s.groupby(s.dt.floor('D')).size().reset_index(name='count')
# Daily counts as a Series indexed by date.
souscriptions = pd.Series(ts['count'].tolist(), index=ts['departureDate'].tolist())
fig, ax = calmap.calendarplot(
    souscriptions,
    cmap='jet',
    fillcolor='grey',
    linewidth=3,
    monthticks=1,
    daylabels='LMMJVSD',
    fig_kws={
        'num': None,
        'figsize': (18,8),
        'dpi': 150,
        'facecolor': "w",
        'edgecolor': "red",
    },
)
fig.savefig('flightDepartureDateHeatmap.png')
print(f"min : {ts['count'].min()!s} | max : {ts['count'].max()!s}")

In [None]:
# Calendar heatmap of the rows with eventStatus == 2, counted per day of
# their transaction timestamp.
# NOTE(review): the original comment describes these as flights delayed by
# more than 2 hours, per planned arrival date — confirm against the data.
claim = df.loc[df['eventStatus'] == 2, 'timeStamp']
claimCount = claim.groupby(claim.dt.floor('D')).size().reset_index(name='count')

# Daily counts as a Series indexed by date.
volRetard = pd.Series(claimCount['count'].tolist(), index=claimCount['timeStamp'].tolist())
fig, ax = calmap.calendarplot(
    volRetard,
    cmap='jet',
    fillcolor='grey',
    linewidth=3,
    monthticks=1,
    daylabels='LMMJVSD',
    fig_kws={
        'num': None,
        'figsize': (18,8),
        'dpi': 150,
        'facecolor': "w",
        'edgecolor': "red",
    },
)
fig.savefig('claimHeatmap.png')
print(f"min : {claimCount['count'].min()!s} | max : {claimCount['count'].max()!s}")

In [None]:
# Calendar heatmap of the ratio between the two previous maps.
# Days are paired by matching a departure day with a claim-timestamp day;
# count_x is the number of rows per departure day, count_y the number of claims.
ratioRetard = ts.merge(claimCount, left_on='departureDate', right_on='timeStamp')
ratioRetard['ratio'] = ratioRetard['count_y'].div(ratioRetard['count_x']).mul(100)

# Daily percentages as a Series indexed by date.
ratioSeries = pd.Series(ratioRetard['ratio'].tolist(), index=ratioRetard['timeStamp'].tolist())

fig, ax = calmap.calendarplot(
    ratioSeries,
    cmap='jet',
    fillcolor='grey',
    linewidth=3,
    monthticks=1,
    daylabels='LMMJVSD',
    fig_kws={
        'num': None,
        'figsize': (18,8),
        'dpi': 150,
        'facecolor': "w",
        'edgecolor': "red",
    },
)
fig.savefig('claimRatioHeatmap.png')

# Vol avec le plus de places assurées par fizzy

In [None]:
# Number of InsuranceCreation rows per (flight number, departure date),
# largest first, with the route details attached.
volUnique = (
    df.loc[df['eventName'] == 'InsuranceCreation', ['flightNo', 'departureDate']]
    .groupby(['flightNo', 'departureDate'])
    .size()
    .reset_index(name='sièges')
    .sort_values(by='sièges', ascending=False)
    .merge(flights, how='left', left_on='flightNo', right_on='CallSign')
)
# Label-based slice: rows 0 through 10 inclusive.
volUnique.loc[:10, ['flightNo', 'departureDate', 'sièges', 'fromCity', 'toCity']]

# Répartition des niveaux de prime

In [None]:
def _plotPremiumCounts(frame, title, filename):
    """Draw and save a bar chart of the number of InsuranceCreation rows per premium.

    Parameters
    ----------
    frame : DataFrame with the columns 'eventName' and 'premium'.
    title : chart title.
    filename : path of the image, saved at 150 dpi.

    Returns
    -------
    (DataFrame, Axes)
        The count table (columns 'premium' and 'size', sorted by premium)
        and the axes of the chart.
    """
    creationPremiums = frame[frame['eventName'] == 'InsuranceCreation']['premium']
    counts = (
        creationPremiums.groupby(frame['premium'])
        .size()
        .reset_index(name='size')
        .sort_values(by='premium', ascending=True)
    )
    axes = counts.plot.bar(x='premium', y='size', figsize=(18,8), grid=True, rot=0, title=title)
    plt.xlabel("prime en €", fontsize=14)
    plt.ylabel("quantité", fontsize=14)
    axes.legend().remove()
    plt.tight_layout()
    plt.savefig(filename, dpi=150)
    return counts, axes


# Global
uniquePremium, ax = _plotPremiumCounts(df, "primes : B2B + B2C", 'primesB2BandB2C.png')

# B2B
uniquePremiumb2b, ax = _plotPremiumCounts(dfB2B, "primes : B2B", 'primesB2B.png')

# B2C
uniquePremiumb2c, ax = _plotPremiumCounts(dfB2C, "primes : B2C", 'primesB2C.png')


# Répartition des garanties

In [None]:
# Band labels: "1€ - 50€", "51€ - 100€", ... up to "501€ - 550€".
labels = ["{0}€ - {1}€".format(i, i + 49) for i in range(1, 551, 50)]


def _plotIndemnityBands(rows, amountColumn, title, xlabel, filename, **savefigOptions):
    """Draw and save a bar chart of the number of rows per 50 € amount band.

    Parameters
    ----------
    rows : DataFrame holding the rows to count.
    amountColumn : name of the column holding the amount to band.
    title, xlabel : chart title and x-axis label.
    filename : path of the saved image.
    **savefigOptions : extra keyword arguments forwarded to plt.savefig.

    Returns
    -------
    (DataFrame, Axes)
        The count table (columns 'groupe' and 'size') and the axes of the chart.

    Amounts outside [1, 551) fall in no band and are therefore not counted.
    """
    # Work on a copy: the input is a filtered slice of a larger frame, and
    # adding a column to such a slice is a chained assignment.
    banded = rows.copy()
    banded['groupe'] = pd.cut(banded[amountColumn], range(1,552,50), right=False, labels=labels)
    # observed=False keeps the empty bands on the chart whatever the pandas
    # default for categorical groupers is.
    counts = (
        banded.groupby('groupe', observed=False)
        .size()
        .reset_index(name='size')
        .sort_values(by='groupe', ascending=True)
    )
    axes = counts.plot.bar(x='groupe', y='size', figsize=(18,8), grid=True, rot=0, title=title)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel("quantité", fontsize=14)
    axes.legend().remove()
    plt.tight_layout()
    plt.savefig(filename, **savefigOptions)
    return counts, axes


# Global
uniqueIndemnity, ax = _plotIndemnityBands(
    df[df['eventName'] == 'InsuranceCreation'], 'indemnity',
    "Garanties : B2B + B2C", "garantie en €", 'payoutB2BandB2C.png', dpi=150)

# B2B
uniqueIndemnity, ax = _plotIndemnityBands(
    dfB2B[dfB2B['eventName'] == 'InsuranceCreation'], 'indemnity',
    "Garanties : B2B", "garantie en €", 'payoutB2B.png', dpi=150)

# B2C
uniqueIndemnity, ax = _plotIndemnityBands(
    dfB2C[dfB2C['eventName'] == 'InsuranceCreation'], 'indemnity',
    "Garanties : B2C", "garantie en €", 'payoutB2C.png', dpi=150)

# Global, paid indemnities only (eventStatus == 2); saved without an explicit
# dpi, as in the original.
uniqueIndemnity, ax = _plotIndemnityBands(
    df[df['eventStatus'] == 2], 'eventIndemnity',
    "Indemnités payées : B2B + B2C", "Indemnité en €", 'payoutPaid.png')



In [None]:
# Number of paid indemnities (eventStatus == 2) per amount, in ascending amount order
paidRows = df[df['eventStatus'] == 2]
uniqueIndemnity = (
    paidRows.groupby('eventIndemnity')
    .size()
    .reset_index(name='size')
    .sort_values(by='eventIndemnity', ascending=True)
)
print(uniqueIndemnity)

# Carte du monde

In [None]:
# Draw the airports from the longitude/latitude columns on top of a world-map
# image stretched over [-180, 180] x [-90, 90].
# NOTE(review): the original comment states the background uses a cylindrical
# projection so that the points line up with it — confirm for worldMap.jpg.

import matplotlib.image as mpimg

# Background image
world = mpimg.imread('../data/worldMap.jpg')

# Convert the coordinates and the distance to numbers.
for numericColumn in ('fromLat', 'fromLong', 'toLat', 'toLong', 'distance'):
    frequentFlight[numericColumn] = pd.to_numeric(frequentFlight[numericColumn])

# Keep only the rows with eventStatus == 2 (late arrivals, per the legend label).
retardedFlight = df[df.loc[:, 'eventStatus'] == 2]

# One explicit figure shared by every layer. Previously plt.figure() opened a
# figure that stayed empty, because pandas created its own for the first plot.
fig, ax = plt.subplots(figsize=(18, 8))

# Departure airports: marker size follows the count, colour follows the distance.
frequentFlight.plot(kind="scatter", x="fromLong", y="fromLat",
                    s=frequentFlight['count']*4,
                    c=frequentFlight['distance'],
                    cmap=plt.get_cmap("jet"),
                    colorbar=False, alpha=0.5,
                    ax=ax)

# Arrival airports
frequentFlight.plot(kind="scatter", x='toLong', y='toLat',
                    s=frequentFlight['count']*4,
                    c=frequentFlight['distance'],
                    cmap=plt.get_cmap("jet"),
                    colorbar=False,
                    alpha=0.5,
                    ax=ax)

# Arrival airports of the late flights
retardedFlight.plot(kind="scatter", x='toLong', y='toLat',
                    marker='x',
                    color='r',
                    alpha=1,
                    label="arrivé en retard",
                    ax=ax)

ax.imshow(world, extent=[-180.0,180.0,-90.0,90.0], alpha=1)

ax.set_xlabel("Longitude", fontsize=14)
ax.set_ylabel("Latitude", fontsize=14)

# Colour bar for the distance scale. It is built from an explicit mappable:
# a bare plt.colorbar() attaches to the current image, i.e. the background
# picture, not to the jet-coloured markers. The local name is `distances` so
# that the geopy `distance` module used by flightDist is not overwritten.
distances = frequentFlight["distance"]
tick_values = np.linspace(distances.min(), distances.max(), num=6)
distanceScale = mpl.cm.ScalarMappable(
    norm=mpl.colors.Normalize(vmin=distances.min(), vmax=distances.max()),
    cmap=plt.get_cmap("jet"))
distanceScale.set_array([])  # required by older matplotlib releases
cbar = fig.colorbar(distanceScale, ax=ax, ticks=tick_values)
cbar.ax.set_yticklabels([f'{int(v)!s} km' for v in tick_values], fontsize=14)
cbar.set_label('Distance', fontsize=16)

ax.legend(fontsize=14)
fig.tight_layout()
ax.set_title("Aéroports de départ et d'arrivée", fontsize=14)

fig.savefig('airportMap.png', dpi=200)
fig.savefig('airportMap.svg')

# Same figure restricted to longitudes -15..30 and latitudes 35..70.
ax.set_xlim(-15,30, auto=True)
ax.set_ylim(35,70, auto=True)
fig.tight_layout()

fig.savefig('EU-airportMap.png', dpi=200)
fig.savefig('EU-airportMap.svg')


In [None]:
# Background image
world = mpimg.imread('../data/worldMap.jpg')

fig = plt.figure(num=None, figsize=(18, 8), dpi=150, facecolor='w', edgecolor='k')
mapAxes = fig.add_subplot(111)

# Colour scale: one colour per flight count.
# plt.get_cmap replaces mpl.cm.get_cmap, which recent matplotlib no longer provides.
cmap = plt.get_cmap('jet')
minCount = float(frequentFlight['count'].min())
maxCount = float(frequentFlight['count'].max())
nColor = mpl.colors.Normalize(vmin=minCount, vmax=maxCount)

# One line per row of frequentFlight; width and colour follow the count.
for index, row in frequentFlight.iterrows():
    mapAxes.plot([row['fromLong'],row['toLong']], [row['fromLat'],row['toLat']],
                 marker='o',markersize=3,
                 color=cmap(nColor(row['count'])),
                 linewidth=(row['count']/20),
                 alpha=0.5)


# Background image stretched over the whole longitude/latitude range
mapAxes.imshow(world, extent=[-180.0,180.0,-90.0,90.0], alpha=1)

mapAxes.set_xlabel("Longitude", fontsize=14)
mapAxes.set_ylabel("Latitude", fontsize=14)

# Colour bar for the count scale. It is built from the same norm and colormap
# as the lines: a bare plt.colorbar() attaches to the current image, i.e. the
# background picture, and would not show the colours used for the routes.
nbVol = frequentFlight["count"]
tick_values = np.linspace(nbVol.min(), nbVol.max(), num=6)
countScale = mpl.cm.ScalarMappable(norm=nColor, cmap=cmap)
countScale.set_array([])  # required by older matplotlib releases
cbar = fig.colorbar(countScale, ax=mapAxes, ticks=tick_values)
cbar.ax.set_yticklabels([f'{int(v)!s} vol(s)' for v in tick_values], fontsize=14)
cbar.set_label('nombre de vols', fontsize=16)

fig.tight_layout()
mapAxes.set_title("Trajet", fontsize=14)

fig.savefig('travelMap.png', dpi=200)
fig.savefig('travelMap.svg')

In [None]:
# Background image
world = mpimg.imread('../data/worldMap.jpg')

fig = plt.figure(num=None, figsize=(18, 8), dpi=150, facecolor='w', edgecolor='k')
mapAxes = fig.add_subplot(111)

# Colour scale: one colour per flight count, normalised over ALL flights.
# plt.get_cmap replaces mpl.cm.get_cmap, which recent matplotlib no longer provides.
cmap = plt.get_cmap('jet')
minCount = float(frequentFlight['count'].min())
maxCount = float(frequentFlight['count'].max())
nColor = mpl.colors.Normalize(vmin=minCount, vmax=maxCount)

europeFlights = frequentFlight[frequentFlight['distance'] <= 3000]

# One line per route; only the routes of at most 2000 km are drawn.
# NOTE(review): the frame is filtered at 3000 km but the loop draws up to
# 2000 km only, while the tick range below uses the 3000 km subset — confirm
# which threshold is intended.
for index, row in europeFlights.iterrows():
    if row['distance'] <= 2000:
        mapAxes.plot([row['fromLong'],row['toLong']], [row['fromLat'],row['toLat']],
                     marker='o',markersize=3,
                     color=cmap(nColor(row['count'])),
                     linewidth=(row['count']/7.5),
                     alpha=0.33)


# Background image stretched over the whole longitude/latitude range
mapAxes.imshow(world, extent=[-180.0,180.0,-90.0,90.0], alpha=1)

mapAxes.set_xlabel("Longitude", fontsize=14)
mapAxes.set_ylabel("Latitude", fontsize=14)

# Colour bar for the count scale. It is built from the same norm and colormap
# as the lines: a bare plt.colorbar() attaches to the current image, i.e. the
# background picture, and would not show the colours used for the routes.
nbVol = europeFlights["count"]
tick_values = np.linspace(nbVol.min(), nbVol.max(), num=6)
countScale = mpl.cm.ScalarMappable(norm=nColor, cmap=cmap)
countScale.set_array([])  # required by older matplotlib releases
cbar = fig.colorbar(countScale, ax=mapAxes, ticks=tick_values)
cbar.ax.set_yticklabels([f'{int(v)!s} vol(s)' for v in tick_values], fontsize=14)
cbar.set_label('nombre de vols', fontsize=16)

mapAxes.set_title("Trajet", fontsize=14)

# Restrict the view to longitudes -15..30 and latitudes 35..70.
mapAxes.set_xlim(-15,30, auto=True)
mapAxes.set_ylim(35,70, auto=True)

fig.tight_layout()

fig.savefig('EU-travelMap.png', dpi=150)
fig.savefig('EU-travelMap.svg')