In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import tabulate
from tabulate import tabulate

In [11]:
file_paths = [
    r"C:\Users\amena\OneDrive - ESPRIT\Bureau\data\Istanbul_Tunis_100_Vols.csv",
    r"C:\Users\amena\OneDrive - ESPRIT\Bureau\data\Paris_Tunis_100_Vols.csv",
   r"C:\Users\amena\OneDrive - ESPRIT\Bureau\data\Rome_Tunis_100_Vols.csv",
     r"C:\Users\amena\OneDrive - ESPRIT\Bureau\data\Barcelone_Tunis_100_Vols.csv"
]


In [12]:
for file_path in file_paths:
    try:
        # Load dataset with correct separator
        df = pd.read_csv(file_path, encoding="utf-8", sep=";")

        print(f"\n📂 Processing file: {file_path}")
        print("\n✅ Data Preview:")
        print(df.head())  # Show the first few rows to verify column separation

        print("\n📌 Column Names:")
        print(df.columns)  # Ensure columns are correctly separated

    except Exception as e:
        print(f"❌ Error reading {file_path}: {e}")


📂 Processing file: C:\Users\amena\OneDrive - ESPRIT\Bureau\data\Istanbul_Tunis_100_Vols.csv

✅ Data Preview:
          Compagnie Pays de départ Pays de destination Date de départ  \
0  Turkish Airlines        Turquie             Tunisie     10/02/2025   
1          Tunisair        Turquie             Tunisie     11/02/2025   
2  Turkish Airlines        Turquie             Tunisie     12/02/2025   
3         Nouvelair        Turquie             Tunisie     13/02/2025   
4          Tunisair        Turquie             Tunisie     14/02/2025   

  Heure de départ Date de retour Heure d'arrivée       Durée Prix (€)  
0           21:30     17/02/2025           22:30         3 h    € 765  
1           02:55     18/02/2025           03:50  2 h 55 min    € 263  
2           21:30     19/02/2025           22:30         3 h    € 438  
3           12:50     20/02/2025           13:50         3 h    € 361  
4           12:35     21/02/2025           13:20  2 h 45 min    € 261  

📌 Column Names:
In

In [None]:
# Loop through each file in file_paths and display unique values
for file_path in file_paths:
    try:
        # Load dataset
        df = pd.read_csv(file_path, encoding="utf-8", sep=";")
        
        print(f"\n📂 Processing file: {file_path}")
        print("\n🔍 Valeurs uniques par colonne :")

        # Display unique values for categorical columns
        for col in df.select_dtypes(include="object").columns:
            print(f"\n📌 {col} ({df[col].nunique()} valeurs uniques) :")
            print(df[col].unique())

    except Exception as e:
        print(f"❌ Error reading {file_path}: {e}")



📂 Processing file: C:\Users\amena\OneDrive - ESPRIT\Bureau\data\Istanbul_Tunis_100_Vols.csv

🔍 Valeurs uniques par colonne :

📌 Compagnie (3 valeurs uniques) :
['Turkish Airlines' 'Tunisair' 'Nouvelair']

📌 Pays de départ (1 valeurs uniques) :
['Turquie']

📌 Pays de destination (1 valeurs uniques) :
['Tunisie']

📌 Date de départ (100 valeurs uniques) :
['10/02/2025' '11/02/2025' '12/02/2025' '13/02/2025' '14/02/2025'
 '15/02/2025' '16/02/2025' '17/02/2025' '18/02/2025' '19/02/2025'
 '20/02/2025' '21/02/2025' '22/02/2025' '23/02/2025' '24/02/2025'
 '25/02/2025' '26/02/2025' '27/02/2025' '28/02/2025' '01/03/2025'
 '02/03/2025' '03/03/2025' '04/03/2025' '05/03/2025' '06/03/2025'
 '07/03/2025' '08/03/2025' '09/03/2025' '10/03/2025' '11/03/2025'
 '12/03/2025' '13/03/2025' '14/03/2025' '15/03/2025' '16/03/2025'
 '17/03/2025' '18/03/2025' '19/03/2025' '20/03/2025' '21/03/2025'
 '22/03/2025' '23/03/2025' '24/03/2025' '25/03/2025' '26/03/2025'
 '27/03/2025' '28/03/2025' '29/03/2025' '30/03/202

In [6]:
def detect_outliers(series):

    try:

        if series.dtype in [np.int64, np.float64]:

            Q1 = series.quantile(0.25)

            Q3 = series.quantile(0.75)

            IQR = Q3 - Q1

            lower_bound = Q1 - 1.5 * IQR

            upper_bound = Q3 + 1.5 * IQR

            outliers = series[(series < lower_bound) | (series > upper_bound)]

            return len(outliers)

    except:

        return "N/A"

    return "N/A"


In [None]:
def analyze_csv(file_path):

    summary = []

    for col in df.columns:

        col_type = df[col].dtype

        missing_values = df[col].isnull().mean() * 100

        unique_values = df[col].nunique()

        outliers = detect_outliers(df[col])


        description = "General data column"

        if "Prix (€)" in col.lower():

            description = "Price value in €"

        elif "Compagnie" in col.lower():

            description = "Name of the company"


        summary.append([col, col_type, f"{missing_values:.2f}%", unique_values, outliers, description])


    summary_df = pd.DataFrame(summary, columns=["Column Name", "Type", "Missing Values (%)", "Unique Values", "Outliers", "Description"])

    return os.path.basename(file_path), summary_df


In [None]:

# Process each CSV file safely and format the output better
reports = []

for file in file_paths:

    file_name, report = analyze_csv(file)

    if report is not None:

        reports.append((file_name, report))


# Display results in a structured table format
for file_name, report in reports:

    print(f"\n📌 Data Discovery Report for: {file_name}\n")

    print(tabulate(report, headers="keys", tablefmt="grid"))

    print("\n" + "="*80 + "\n")



📌 Data Discovery Report for: Istanbul_Tunis_100_Vols.csv

+----+---------------------+---------+----------------------+-----------------+------------+---------------------+
|    | Column Name         | Type    | Missing Values (%)   |   Unique Values | Outliers   | Description         |
|  0 | Compagnie           | object  | 0.00%                |               7 | N/A        | General data column |
+----+---------------------+---------+----------------------+-----------------+------------+---------------------+
|  1 | Pays de départ      | object  | 0.00%                |               1 | N/A        | General data column |
+----+---------------------+---------+----------------------+-----------------+------------+---------------------+
|  2 | Pays de destination | object  | 0.00%                |               1 | N/A        | General data column |
+----+---------------------+---------+----------------------+-----------------+------------+---------------------+
|  3 | Date de départ