# Traitement de données avec Polars

In [1]:
import polars as pl 


In [2]:
# Load the raw flights CSV into a Polars DataFrame.
df = pl.read_csv(
    source="data/flights.csv",
    low_memory=False,
)
# Preview the first five rows, then show the (rows, columns) dimensions.
print(df.head(5), df.shape, sep="\n")


shape: (5, 31)
┌──────┬───────┬─────┬─────────────┬───┬──────────────┬──────────────┬──────────────┬──────────────┐
│ YEAR ┆ MONTH ┆ DAY ┆ DAY_OF_WEEK ┆ … ┆ SECURITY_DEL ┆ AIRLINE_DELA ┆ LATE_AIRCRAF ┆ WEATHER_DELA │
│ ---  ┆ ---   ┆ --- ┆ ---         ┆   ┆ AY           ┆ Y            ┆ T_DELAY      ┆ Y            │
│ i64  ┆ i64   ┆ i64 ┆ i64         ┆   ┆ ---          ┆ ---          ┆ ---          ┆ ---          │
│      ┆       ┆     ┆             ┆   ┆ i64          ┆ i64          ┆ i64          ┆ i64          │
╞══════╪═══════╪═════╪═════════════╪═══╪══════════════╪══════════════╪══════════════╪══════════════╡
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null         ┆ null 

# Infos globales (lignes, colonnes, types)

* Ici on voit le type de chaque colonne

In [3]:
# First line: the dtype of every column; second line: the column names only.
print(df.schema, df.columns, sep="\n")


Schema({'YEAR': Int64, 'MONTH': Int64, 'DAY': Int64, 'DAY_OF_WEEK': Int64, 'AIRLINE': String, 'FLIGHT_NUMBER': Int64, 'TAIL_NUMBER': String, 'ORIGIN_AIRPORT': String, 'DESTINATION_AIRPORT': String, 'SCHEDULED_DEPARTURE': Int64, 'DEPARTURE_TIME': Int64, 'DEPARTURE_DELAY': Int64, 'TAXI_OUT': Int64, 'WHEELS_OFF': Int64, 'SCHEDULED_TIME': Int64, 'ELAPSED_TIME': Int64, 'AIR_TIME': Int64, 'DISTANCE': Int64, 'WHEELS_ON': Int64, 'TAXI_IN': Int64, 'SCHEDULED_ARRIVAL': Int64, 'ARRIVAL_TIME': Int64, 'ARRIVAL_DELAY': Int64, 'DIVERTED': Int64, 'CANCELLED': Int64, 'CANCELLATION_REASON': String, 'AIR_SYSTEM_DELAY': Int64, 'SECURITY_DELAY': Int64, 'AIRLINE_DELAY': Int64, 'LATE_AIRCRAFT_DELAY': Int64, 'WEATHER_DELAY': Int64})
['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'WHEELS_ON', 

# Traitement de données

- On commence par écarter les vols annulés (`CANCELLED == 0`) ; la sélection des colonnes utiles pour répondre à notre problématique se fait à l'étape suivante.

In [4]:
# Keep only the flights that were not cancelled (CANCELLED flag equal to 0).
df = df.filter(pl.col("CANCELLED").eq(0))

# Report how many rows remain, then preview the filtered frame.
print("Vols non annulés :", df.height)
print(df.head(5))


Vols non annulés : 5729195
shape: (5, 31)
┌──────┬───────┬─────┬─────────────┬───┬──────────────┬──────────────┬──────────────┬──────────────┐
│ YEAR ┆ MONTH ┆ DAY ┆ DAY_OF_WEEK ┆ … ┆ SECURITY_DEL ┆ AIRLINE_DELA ┆ LATE_AIRCRAF ┆ WEATHER_DELA │
│ ---  ┆ ---   ┆ --- ┆ ---         ┆   ┆ AY           ┆ Y            ┆ T_DELAY      ┆ Y            │
│ i64  ┆ i64   ┆ i64 ┆ i64         ┆   ┆ ---          ┆ ---          ┆ ---          ┆ ---          │
│      ┆       ┆     ┆             ┆   ┆ i64          ┆ i64          ┆ i64          ┆ i64          │
╞══════╪═══════╪═════╪═════════════╪═══╪══════════════╪══════════════╪══════════════╪══════════════╡
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null    

# Sélection des colonnes utiles pour les analyses

In [5]:
# Columns kept for the analyses, listed in the order they will appear in df.
col_utiles = [
    # calendar fields
    "YEAR",
    "MONTH",
    "DAY",
    "DAY_OF_WEEK",
    # carrier and route
    "AIRLINE",
    "ORIGIN_AIRPORT",
    "DESTINATION_AIRPORT",
    # overall delays
    "DEPARTURE_DELAY",
    "ARRIVAL_DELAY",
    # cancellation information
    "CANCELLED",
    "CANCELLATION_REASON",
    # delay breakdown by cause
    "AIR_SYSTEM_DELAY",
    "SECURITY_DELAY",
    "AIRLINE_DELAY",
    "LATE_AIRCRAFT_DELAY",
    "WEATHER_DELAY",
    # route length
    "DISTANCE",
]

# Restrict the frame to those columns only.
df = df.select(col_utiles)

# Sanity check: dimensions, dtypes and a preview of the reduced frame.
print(df.shape, df.schema, df.head(5), sep="\n")

(5729195, 17)
Schema({'YEAR': Int64, 'MONTH': Int64, 'DAY': Int64, 'DAY_OF_WEEK': Int64, 'AIRLINE': String, 'ORIGIN_AIRPORT': String, 'DESTINATION_AIRPORT': String, 'DEPARTURE_DELAY': Int64, 'ARRIVAL_DELAY': Int64, 'CANCELLED': Int64, 'CANCELLATION_REASON': String, 'AIR_SYSTEM_DELAY': Int64, 'SECURITY_DELAY': Int64, 'AIRLINE_DELAY': Int64, 'LATE_AIRCRAFT_DELAY': Int64, 'WEATHER_DELAY': Int64, 'DISTANCE': Int64})
shape: (5, 17)
┌──────┬───────┬─────┬─────────────┬───┬───────────────┬────────────────┬───────────────┬──────────┐
│ YEAR ┆ MONTH ┆ DAY ┆ DAY_OF_WEEK ┆ … ┆ AIRLINE_DELAY ┆ LATE_AIRCRAFT_ ┆ WEATHER_DELAY ┆ DISTANCE │
│ ---  ┆ ---   ┆ --- ┆ ---         ┆   ┆ ---           ┆ DELAY          ┆ ---           ┆ ---      │
│ i64  ┆ i64   ┆ i64 ┆ i64         ┆   ┆ i64           ┆ ---            ┆ i64           ┆ i64      │
│      ┆       ┆     ┆             ┆   ┆               ┆ i64            ┆               ┆          │
╞══════╪═══════╪═════╪═════════════╪═══╪═══════════════╪════════

## Voir les valeurs nulles par colonne 

In [6]:
# null_count() yields a one-row frame holding the number of nulls per column;
# walk that single row alongside the column names.
nulls = df.null_count()
for col, missing in zip(nulls.columns, nulls.row(0)):
    print(f"{col:20} ➜ {missing} nulls")


YEAR                 ➜ 0 nulls
MONTH                ➜ 0 nulls
DAY                  ➜ 0 nulls
DAY_OF_WEEK          ➜ 0 nulls
AIRLINE              ➜ 0 nulls
ORIGIN_AIRPORT       ➜ 0 nulls
DESTINATION_AIRPORT  ➜ 0 nulls
DEPARTURE_DELAY      ➜ 0 nulls
ARRIVAL_DELAY        ➜ 15187 nulls
CANCELLED            ➜ 0 nulls
CANCELLATION_REASON  ➜ 5729195 nulls
AIR_SYSTEM_DELAY     ➜ 4665756 nulls
SECURITY_DELAY       ➜ 4665756 nulls
AIRLINE_DELAY        ➜ 4665756 nulls
LATE_AIRCRAFT_DELAY  ➜ 4665756 nulls
WEATHER_DELAY        ➜ 4665756 nulls
DISTANCE             ➜ 0 nulls


# Rassembler YEAR / MONTH / DAY en une seule colonne DATE

In [7]:
# Build a single DATE column from YEAR / MONTH / DAY, sort the rows by that
# date, then drop the three source columns that DATE replaces.
df = (
    df.with_columns(
        pl.date(
            year=pl.col("YEAR"),
            month=pl.col("MONTH"),
            day=pl.col("DAY"),
        ).alias("DATE")
    )
    .sort("DATE")
    .drop(["YEAR", "MONTH", "DAY"])
)

# Show the resulting schema (DATE is appended as the last column).
print(df.schema)




Schema({'DAY_OF_WEEK': Int64, 'AIRLINE': String, 'ORIGIN_AIRPORT': String, 'DESTINATION_AIRPORT': String, 'DEPARTURE_DELAY': Int64, 'ARRIVAL_DELAY': Int64, 'CANCELLED': Int64, 'CANCELLATION_REASON': String, 'AIR_SYSTEM_DELAY': Int64, 'SECURITY_DELAY': Int64, 'AIRLINE_DELAY': Int64, 'LATE_AIRCRAFT_DELAY': Int64, 'WEATHER_DELAY': Int64, 'DISTANCE': Int64, 'DATE': Date})


# Exporter le CSV nettoyé

In [8]:
from pathlib import Path

# Destination of the cleaned dataset.
output_path = Path("data/clean/flights_cleaned_polars.csv")
# Make sure the target folder exists so the export does not fail on a fresh
# checkout where data/clean/ has not been created yet.
output_path.parent.mkdir(parents=True, exist_ok=True)

df.write_csv(
    output_path,
    separator=",",        # standard CSV delimiter
    include_header=True,  # keep the column names as the first row
    null_value="",        # nulls are written as empty fields
)
print("Fichier sauvegardé.", df.shape)

Fichier sauvegardé. (5729195, 15)
