In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def load_data(file_path):
    """Loads the dataset from a CSV file."""
    return pd.read_csv(file_path)

def clean_data(df):
    """Cleans the dataset by handling missing values and duplicates."""
    df.drop_duplicates(inplace=True)
    df.dropna(inplace=True)
    return df

def preprocess_data(df):
    """Preprocesses data by handling categorical variables and converting 'Duration'."""
    
    # Convert 'Duration' into total minutes
    def convert_duration(duration):
        h, m = 0, 0
        if 'h' in duration:
            h = int(duration.split('h')[0])
            duration = duration.split('h')[1]
        if 'm' in duration:
            m = int(duration.split('m')[0])
        return h * 60 + m
    
    df['Duration'] = df['Duration'].apply(convert_duration)
    
    return df


def preprocess_data(df):
    """Performs feature engineering and encodes categorical variables."""
    
    # Convert date columns to datetime format
    df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'], format='%d/%m/%Y')
    df['Journey_Day'] = df['Date_of_Journey'].dt.day
    df['Journey_Month'] = df['Date_of_Journey'].dt.month
    df.drop(['Date_of_Journey'], axis=1, inplace=True)
    
    # Convert Departure and Arrival Time
    df['Dep_Hour'] = pd.to_datetime(df['Dep_Time']).dt.hour
    df['Dep_Minute'] = pd.to_datetime(df['Dep_Time']).dt.minute
    df.drop(['Dep_Time'], axis=1, inplace=True)
    
    df['Arrival_Hour'] = pd.to_datetime(df['Arrival_Time']).dt.hour
    df['Arrival_Minute'] = pd.to_datetime(df['Arrival_Time']).dt.minute
    df.drop(['Arrival_Time'], axis=1, inplace=True)
    
    # Encoding categorical features
    label_encoder = LabelEncoder()
    categorical_features = ['Airline', 'Source', 'Destination', 'Route', 'Additional_Info']
    for feature in categorical_features:
        df[feature] = label_encoder.fit_transform(df[feature])
    
    # Convert Total_Stops to numeric
    df['Total_Stops'] = df['Total_Stops'].replace({'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4})
    
    return df

if __name__ == "__main__":
    file_path = r"C:\Users\Public\GUVI\code\flight price prediction\Flight_Price.csv"  # Update with your actual file path
    df = load_data(file_path)
    df = preprocess_data(df)
    df = clean_data(df)
    df.to_csv("cleaned_flight_data.csv", index=False)
    print("Data preprocessing complete. Saved as 'cleaned_flight_data.csv'")


  df['Dep_Hour'] = pd.to_datetime(df['Dep_Time']).dt.hour
  df['Dep_Minute'] = pd.to_datetime(df['Dep_Time']).dt.minute
  df['Arrival_Hour'] = pd.to_datetime(df['Arrival_Time']).dt.hour
  df['Arrival_Minute'] = pd.to_datetime(df['Arrival_Time']).dt.minute
  df['Total_Stops'] = df['Total_Stops'].replace({'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4})


Data preprocessing complete. Saved as 'cleaned_flight_data.csv'


In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def load_data(file_path):
    """Loads the cleaned dataset from a CSV file."""
    return pd.read_csv(file_path)

def perform_eda(df):
    """Performs exploratory data analysis (EDA) and visualizations."""
    
    print("Dataset Overview:")
    print(df.info())
    print("\nStatistical Summary:")
    print(df.describe())
    
    # Correlation heatmap
    plt.figure(figsize=(12,6))
    sns.heatmap(df.corr(), annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Feature Correlation Heatmap")
    plt.show()
    
    # Distribution of Flight Prices
    plt.figure(figsize=(8,5))
    sns.histplot(df['Price'], bins=30, kde=True)
    plt.title("Flight Price Distribution")
    plt.xlabel("Price")
    plt.ylabel("Frequency")
    plt.show()
    
    # Boxplot of prices by airline
    plt.figure(figsize=(12,6))
    sns.boxplot(x=df['Airline'], y=df['Price'])
    plt.xticks(rotation=90)
    plt.title("Flight Prices by Airline")
    plt.show()
    
    # Count of flights by total stops
    plt.figure(figsize=(8,5))
    sns.countplot(x=df['Total_Stops'])
    plt.title("Count of Flights by Number of Stops")
    plt.xlabel("Total Stops")
    plt.ylabel("Count")
    plt.show()
    
if __name__ == "__main__":
    file_path = r"C:\Users\Public\GUVI\code\flight price prediction\cleaned_flight_data.csv"  # Update with your actual file path
    df = load_data(file_path)
    perform_eda(df)
    print("Exploratory Data Analysis complete.")


Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10460 entries, 0 to 10459
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          10460 non-null  int64  
 1   Source           10460 non-null  int64  
 2   Destination      10460 non-null  int64  
 3   Route            10460 non-null  int64  
 4   Duration         10460 non-null  object 
 5   Total_Stops      10460 non-null  float64
 6   Additional_Info  10460 non-null  int64  
 7   Price            10460 non-null  int64  
 8   Journey_Day      10460 non-null  int64  
 9   Journey_Month    10460 non-null  int64  
 10  Dep_Hour         10460 non-null  int64  
 11  Dep_Minute       10460 non-null  int64  
 12  Arrival_Hour     10460 non-null  int64  
 13  Arrival_Minute   10460 non-null  int64  
dtypes: float64(1), int64(12), object(1)
memory usage: 1.1+ MB
None

Statistical Summary:
           Airline        Source   Destination 

ValueError: could not convert string to float: '2h 50m'

<Figure size 1200x600 with 0 Axes>