# 1. Importing Libraries

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# 2. Reading Data

In [2]:
# Root folder of the project; get_data() and export_data() build every file path from it.
# NOTE(review): this is an absolute, user-specific Windows path, so the notebook only
# runs unchanged on this machine — consider a path relative to the notebook instead.
PROJECT_DIR = r"C:\Users\Sher Mohammed Khan\OneDrive\Desktop\Coding\Python\Projects\ML Project\Flight_Price_Prediction"
# Sub-folder of PROJECT_DIR that holds the CSV files.
DATA_DIR = "data"

In [3]:
os.path.join(PROJECT_DIR, DATA_DIR)

'C:\\Users\\Sher Mohammed Khan\\OneDrive\\Desktop\\Coding\\Python\\Projects\\ML Project\\Flight_Price_Prediction\\data'

In [4]:
def get_data(name):
    """Read ``<name>.csv`` from the project's data folder.

    Parameters
    ----------
    name : str
        File name without the ``.csv`` extension.

    Returns
    -------
    pd.DataFrame
        The parsed CSV, read with the default ``pd.read_csv`` settings.
    """
    csv_path = os.path.join(PROJECT_DIR, DATA_DIR, "{}.csv".format(name))
    frame = pd.read_csv(csv_path)
    return frame

In [5]:
flights = get_data("flight_price")
flights.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [6]:
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


- The dataset contains 10,683 rows and 11 features
- Columns `Route` and `Total_Stops` each have one missing value
- The data types of some features aren't appropriate

# 3. Preliminary Analysis

## 3.1 Check Data Types

In [7]:
flights.dtypes

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
dtype: object

In [8]:
flights.Date_of_Journey.iloc[6]

'12/03/2019'

In [9]:
flights.Arrival_Time.iloc[6]

'10:25 13 Mar'

## 3.2 Check for Duplicates

In [10]:
flights.duplicated().sum()

220

In [13]:
flights.loc[flights.duplicated(keep=False)]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
33,Jet Airways,15/06/2019,Delhi,Cochin,DEL → NAG → BOM → COK,14:35,12:35 16 Jun,22h,2 stops,In-flight meal not included,10919
49,Jet Airways,27/06/2019,Delhi,Cochin,DEL → AMD → BOM → COK,23:05,19:00 28 Jun,19h 55m,2 stops,In-flight meal not included,11150
73,Jet Airways,24/06/2019,Delhi,Cochin,DEL → AMD → BOM → COK,23:05,12:35 25 Jun,13h 30m,2 stops,No info,12819
81,Jet Airways,24/03/2019,Banglore,New Delhi,BLR → DEL,19:55,22:35,2h 40m,non-stop,No info,7229
87,Jet Airways,6/06/2019,Delhi,Cochin,DEL → JAI → BOM → COK,09:40,04:25 07 Jun,18h 45m,2 stops,No info,13014
...,...,...,...,...,...,...,...,...,...,...,...
10594,Jet Airways,27/06/2019,Delhi,Cochin,DEL → AMD → BOM → COK,23:05,12:35 28 Jun,13h 30m,2 stops,No info,12819
10616,Jet Airways,1/06/2019,Delhi,Cochin,DEL → JAI → BOM → COK,09:40,12:35 02 Jun,26h 55m,2 stops,No info,13014
10634,Jet Airways,6/06/2019,Delhi,Cochin,DEL → JAI → BOM → COK,09:40,12:35 07 Jun,26h 55m,2 stops,In-flight meal not included,11733
10672,Jet Airways,27/06/2019,Delhi,Cochin,DEL → AMD → BOM → COK,23:05,19:00 28 Jun,19h 55m,2 stops,In-flight meal not included,11150


In [14]:
(
    flights
    .loc[flights.duplicated(keep=False)]
    
    # Sort the resulting DataFrame by multiple columns to organize the duplicated entries.
    .sort_values(["Airline", "Date_of_Journey", "Source", "Destination"])
    
)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
6321,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
9848,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
572,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
8168,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
1495,Air India,1/04/2019,Kolkata,Banglore,CCU → DEL → COK → BLR,10:00,01:20 02 Apr,15h 20m,2 stops,No info,10408
...,...,...,...,...,...,...,...,...,...,...,...
2692,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,05:45,08:35,2h 50m,non-stop,No check-in baggage included,4273
2870,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,05:45,08:35,2h 50m,non-stop,No check-in baggage included,4273
3711,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,20:30,23:20,2h 50m,non-stop,No check-in baggage included,3873
2634,Vistara,24/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,5403


## 3.3 Observations

- The types of `Date_of_Journey`, `Dep_Time` & `Arrival_Time` should be changed to datetime
- `Duration` & `Total_Stops` are stored as text (e.g. `2h 50m`, `2 stops`). They should be numeric
- There are 220 duplicates. These should be removed

# 4. Detailed Analysis

### Airline

In [18]:
flights.Airline

0             IndiGo
1          Air India
2        Jet Airways
3             IndiGo
4             IndiGo
            ...     
10678       Air Asia
10679      Air India
10680    Jet Airways
10681        Vistara
10682      Air India
Name: Airline, Length: 10683, dtype: object

In [20]:
flights["Airline"].unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

- **Some of the entries have inaccurate values**

In [28]:
# For practice: preview how the airline names look after normalisation
(
    # Start from the 'Airline' column of the 'flights' DataFrame
    flights.Airline
    # Fold the premium-economy variants into their base airline
    .str.replace(" Premium economy", "")
    # Remove the word "Business"; note that the space before it is kept, so
    # "Jet Airways Business" becomes "Jet Airways " (trailing space, see the output)
    .str.replace("Business", "")
    # Convert the resulting strings to title case (first letter of each word capitalized)
    .str.title()
    # List the distinct values that remain
    .unique()
)

array(['Indigo', 'Air India', 'Jet Airways', 'Spicejet',
       'Multiple Carriers', 'Goair', 'Vistara', 'Air Asia',
       'Jet Airways ', 'Trujet'], dtype=object)

### Date_of_Journey

In [35]:
flights.Date_of_Journey

0        24/03/2019
1         1/05/2019
2         9/06/2019
3        12/05/2019
4        01/03/2019
            ...    
10678     9/04/2019
10679    27/04/2019
10680    27/04/2019
10681    01/03/2019
10682     9/05/2019
Name: Date_of_Journey, Length: 10683, dtype: object

- **`Date_of_Journey` has dtype `object`; it should be a datetime**

- **dayfirst bool, default False**

Specify a date parse order if arg is str or is list-like. 
If True, parses dates with the day first, e.g. "10/11/12" is parsed as 2012-11-10.

In [37]:
pd.to_datetime(flights["Date_of_Journey"], dayfirst=True)

0       2019-03-24
1       2019-05-01
2       2019-06-09
3       2019-05-12
4       2019-03-01
           ...    
10678   2019-04-09
10679   2019-04-27
10680   2019-04-27
10681   2019-03-01
10682   2019-05-09
Name: Date_of_Journey, Length: 10683, dtype: datetime64[ns]

### Source

In [47]:
flights["Source"].unique()

array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)

- **Source looks fine**

### Destination	

In [49]:
flights['Destination'].unique()

array(['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad'],
      dtype=object)

- **Destination looks fine**

### Route

In [51]:
flights.Route

0                    BLR → DEL
1        CCU → IXR → BBI → BLR
2        DEL → LKO → BOM → COK
3              CCU → NAG → BLR
4              BLR → NAG → DEL
                 ...          
10678                CCU → BLR
10679                CCU → BLR
10680                BLR → DEL
10681                BLR → DEL
10682    DEL → GOI → BOM → COK
Name: Route, Length: 10683, dtype: object

- **Dropping Route because it is not adding any value**

### Dep_Time

In [55]:
flights.Dep_Time

0        22:20
1        05:50
2        09:25
3        18:05
4        16:50
         ...  
10678    19:55
10679    20:45
10680    08:20
10681    11:30
10682    10:55
Name: Dep_Time, Length: 10683, dtype: object

In [56]:
(
    flights
    .Dep_Time
    .loc[lambda ser: ser.str.contains("[^0-9:]")]
)

Series([], Name: Dep_Time, dtype: object)

- **Values are OK in Dep_Time but have to change the dtype to DateTime**

### Breakdown:

1. **`flights.Dep_Time`**: 
   - This selects the `Dep_Time` column from the `flights` DataFrame, which  contains the departure times of flights.

2. **`.loc[lambda ser: ser.str.contains("[^0-9:]")]`**:
   - **`lambda ser:`**: A lambda function is used to define a quick, inline function that operates on the series (`ser`), which in this case is the `Dep_Time` column.
   - **`ser.str.contains("[^0-9:]")`**: 
     - `str.contains("[^0-9:]")` checks each entry in the `Dep_Time` column for any characters that are not digits (`0-9`) or colons (`:`). 
     - The `[^0-9:]` is a regular expression (regex) where `^` inside the brackets indicates "not", so it matches any character that isn't a digit or a colon.
   - **`.loc[...]`**: The `.loc` method is used to filter and return only those rows where the condition inside the lambda function is `True`.
   - The code identifies and returns all rows in the `Dep_Time` column that contain invalid characters (anything other than numbers and colons). This could be useful for data cleaning or validation, especially if the `Dep_Time` column is supposed to only contain time values like "12:30" or "07:45".

In [60]:
pd.to_datetime(flights.Dep_Time).dt.time #For now it is ok as object we will handle this in EDA

0        22:20:00
1        05:50:00
2        09:25:00
3        18:05:00
4        16:50:00
           ...   
10678    19:55:00
10679    20:45:00
10680    08:20:00
10681    11:30:00
10682    10:55:00
Name: Dep_Time, Length: 10683, dtype: object

### Arrival_Time

In [64]:
flights['Arrival_Time']

0        01:10 22 Mar
1               13:15
2        04:25 10 Jun
3               23:30
4               21:35
             ...     
10678           22:25
10679           23:20
10680           11:20
10681           14:10
10682           19:15
Name: Arrival_Time, Length: 10683, dtype: object

In [69]:
(
    flights.Arrival_Time
    .loc[lambda ser: ser.str.contains("[^0-9:]")]
    .str.split(" ", n=1)
    .str.get(1)
    .unique()
)

array(['22 Mar', '10 Jun', '13 Mar', '02 Mar', '10 May', '04 Mar',
       '13 Jun', '28 May', '19 Mar', '07 May', '02 Jun', '16 Jun',
       '19 May', '16 May', '28 Jun', '02 May', '28 Mar', '19 Jun',
       '04 Apr', '25 Mar', '07 Mar', '25 Jun', '07 Jun', '25 May',
       '13 May', '16 Mar', '22 May', '10 Apr', '04 Jun', '20 May',
       '28 Apr', '25 Apr', '10 Mar', '19 Apr', '13 Apr', '02 Apr',
       '23 Mar', '22 Apr', '11 May', '07 Apr', '03 May', '08 Mar',
       '03 Mar', '05 Mar', '22 Jun', '04 May', '26 May', '16 Apr',
       '26 Jun', '29 May', '29 Jun', '29 Mar', '23 May', '17 Jun'],
      dtype=object)

### Duration

In [73]:
flights.Duration

0        2h 50m
1        7h 25m
2           19h
3        5h 25m
4        4h 45m
          ...  
10678    2h 30m
10679    2h 35m
10680        3h
10681    2h 40m
10682    8h 20m
Name: Duration, Length: 10683, dtype: object

- **Convert to Minutes**

In [79]:
(
    flights.Duration
    .loc[lambda ser: ~ser.str.contains("m")]
    .unique()
)

array(['19h', '23h', '22h', '12h', '3h', '5h', '10h', '18h', '24h', '15h',
       '16h', '8h', '14h', '20h', '13h', '11h', '9h', '27h', '26h', '4h',
       '7h', '30h', '21h', '28h', '47h', '6h', '25h', '38h', '34h'],
      dtype=object)

In [81]:
(
    flights.Duration
    .loc[lambda ser: ~ser.str.contains("h")]
)

6474    5m
Name: Duration, dtype: object

In [83]:
flights.iloc[[6474]]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
6474,Air India,6/03/2019,Mumbai,Hyderabad,BOM → GOI → PNQ → HYD,16:50,16:55,5m,2 stops,No info,17327


- **A Mumbai to Hyderabad flight with a duration of 5 minutes is not valid. Deleting this observation**

In [120]:
(
    flights
    .Duration
    .drop(index=[6474]) # drop the invalid "5m" row found above (it has no hour part)
    .str.split(" ", expand=True) # expand=True returns a DataFrame, one column per piece
    .set_axis(["hour", "minute"], axis =1) # name the two resulting columns
    .assign(
        hour = lambda df_:(
            df_
            .hour
            .str.replace("h", "") # strip the "h" suffix
            .astype(int) # convert the hour column from object to int
            .mul(60) # hours -> minutes
        ),
        minute = lambda df_: (
            df_
            .minute
            .str.replace("m", "") # strip the "m" suffix
            .fillna("0") # hour-only durations (e.g. "19h") have no minute piece
            .astype(int) # convert the minute column from object to int
        )
    )
    .sum(axis=1) # total duration in minutes = hour part + minute part
    #.dtypes # check the dtypes of hour and minute
    #.isna().sum()  # hour has 0 missing values, minute has 1031 missing values
    
)

0         170
1         445
2        1140
3         325
4         285
         ... 
10678     150
10679     155
10680     180
10681     160
10682     500
Length: 10682, dtype: int64

### Total_Stops

In [127]:
flights.Total_Stops.unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', nan, '4 stops'],
      dtype=object)

- `nan` will be handled in eda

In [132]:
(
    flights
    .Total_Stops
    .str.replace("non-stop", "0")
    .str.replace(" stops?", "", regex=True)
    .pipe(lambda ser: pd.to_numeric(ser)) # astype(int) would fail here: the column contains a nan, which is a float, so the result is float64
)

0        0.0
1        2.0
2        2.0
3        1.0
4        1.0
        ... 
10678    0.0
10679    0.0
10680    0.0
10681    0.0
10682    2.0
Name: Total_Stops, Length: 10683, dtype: float64

#### Breakdown
`str.replace(" stops?", "", regex=True)`
- **Removes any occurrence of " stop" or " stops"** from the strings. 
- **Explanation**: The `" stops?"` is a regular expression (regex) where `?` means "zero or one occurrence of the preceding character," so it matches both "stop" and "stops". The `replace` method then removes this matched text from the strings.
- This code cleans up strings by removing both "stop" and "stops" (singular and plural forms).

### Additional_Info

In [137]:
flights.Additional_Info.unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover', 'No Info',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

- `No info` and `No Info` mean the same thing and should be merged into a single value

In [139]:
(
    flights
    .Additional_Info
    .str.replace("No info", "No Info")
    .unique()
)

array(['No Info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

# 5. Cleaning Operations

In [15]:
flights.select_dtypes(include = "O").columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info'],
      dtype='object')

### Breakdown:
**`flights.select_dtypes(include = "O").columns`**

- **`flights.select_dtypes(include="O")`**: Selects all columns in the `flights` DataFrame that have the data type "object" (denoted by `"O"`).
- **`.columns`**: Returns the names of these columns as an Index object.

In [121]:
# As code is long, that's why creating a function for converting an hour to a minute

def convert_minutes(ser):
    """Convert duration strings such as "2h 50m", "19h" or "45m" to total minutes.

    The previous split-based version required at least one value with both an
    hour and a minute piece (otherwise naming the two columns failed) and could
    not parse minute-only values; a single regular expression handles every form.

    Parameters
    ----------
    ser : pd.Series
        Strings made of an optional hour part (``<n>h``) followed by an optional
        minute part (``<n>m``). At least one of the two parts must be present.

    Returns
    -------
    pd.Series
        Integer number of minutes, on the same index as ``ser``.

    Raises
    ------
    ValueError
        If an entry is missing or contains neither an hour nor a minute part.
    """
    parts = ser.str.extract(
        r"^\s*(?:(?P<hour>\d+)\s*h)?\s*(?:(?P<minute>\d+)\s*m)?\s*$"
    )

    # A row with neither group captured is missing or malformed; fail loudly
    # instead of silently reporting a duration of 0 minutes.
    unparsed = parts.isna().all(axis=1)
    if unparsed.any():
        examples = ser[unparsed].unique().tolist()[:5]
        raise ValueError(f"Cannot parse duration value(s): {examples}")

    return (
        parts
        .fillna("0")  # the part that is absent contributes 0
        .astype(int)
        .assign(hour=lambda df_: df_.hour.mul(60))  # hours -> minutes
        .sum(axis=1)  # hour part + minute part
    )

In [146]:
def clean_data(df):
    """Return a cleaned copy of the raw flights DataFrame.

    Steps, in order:
    1. Drop rows whose ``Duration`` has no hour part. In the raw data this is
       only the invalid "5m" Mumbai -> Hyderabad row; selecting it by condition
       rather than by the label 6474 keeps the function usable on other frames.
    2. Drop exact duplicate rows.
    3. Strip surrounding whitespace from every text column.
    4. Lower-case the column names.
    5. Normalise ``airline`` (fold premium-economy / business variants into the
       base airline, title-case), parse ``date_of_journey`` as a day-first date,
       reduce ``dep_time`` / ``arrival_time`` to ``datetime.time`` values,
       convert ``duration`` to minutes, convert ``total_stops`` to a number and
       unify the "No info" / "No Info" spelling in ``additional_info``.
    6. Drop the ``route`` column.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data with the original capitalised column names
        (``Airline``, ``Date_of_Journey``, ..., ``Price``).

    Returns
    -------
    pd.DataFrame
        The cleaned frame; ``df`` itself is not modified.
    """
    return(
        df
        # Rows with a missing Duration are kept here (na=True) so that
        # convert_minutes fails loudly on them instead of them vanishing.
        .loc[lambda df_: df_["Duration"].str.contains("h", na=True)]
        .drop_duplicates()
        # The stripped columns come from the full input frame; assign aligns
        # them on the index, so only the rows that survived above are kept.
        .assign(**{
            col:df[col].str.strip()
            for col in df.select_dtypes(include = "O").columns
        })

        .rename(columns = str.lower) # Convert all column names to lowercase

        .assign(
            airline = lambda df_:(
                df_
                .airline
                .str.replace(" Premium economy", "", regex=False)
                .str.replace("Business", "", regex=False)
                # Removing "Business" leaves "Jet Airways " with a trailing
                # space, which would be a separate category from "Jet Airways".
                .str.strip()
                .str.title()
            ),
            date_of_journey = lambda df_ : pd.to_datetime(df_.date_of_journey, dayfirst=True),
            # An explicit format keeps the parsing independent of pandas'
            # format inference.
            dep_time = lambda df_: pd.to_datetime(df_.dep_time, format="%H:%M").dt.time,
            # Arrival times are either "13:15" or "01:10 22 Mar"; only the clock
            # part is kept, so cut it out before parsing instead of relying on
            # mixed-format inference.
            arrival_time = lambda df_: pd.to_datetime(
                df_.arrival_time.str.split(" ", n=1).str.get(0),
                format="%H:%M",
            ).dt.time,
            duration = lambda df_ : df_.duration.pipe(convert_minutes), # convert_minutes(df_.duration) would work as well
            total_stops = lambda df_:(
                df_
                .total_stops
                .str.replace("non-stop", "0")
                .str.replace(" stops?", "", regex=True)
                # to_numeric rather than astype(int): the column contains a nan
                .pipe(lambda ser: pd.to_numeric(ser))
            ),
            additional_info = lambda df_:df_.additional_info.replace("No info", "No Info")
        )
        .drop(columns="route")
    )

### Breakdown:

        df
        .assign(**{
            col:df[col].str.strip()
            for col in df.select_dtypes(include = "O").columns
        })

- **`.assign(**{...})`**: This is used to update or create new columns in a DataFrame.
  
- **`**{...}`**: This syntax lets you pass a dictionary where the keys are column names and the values are the new data for those columns.

- The `.assign(**{...})` is a way to change or add multiple columns in one step. The `**{...}` part lets you provide a dictionary that maps each column name to the new values for that column, all at once.

- **`{col: df[col].str.strip() for col in df.select_dtypes(include = "O").columns}`**: 
  - This is a dictionary comprehension that iterates over each column (`col`) in the DataFrame that has a data type of "object" (text or categorical).
  - `df.select_dtypes(include="O").columns` retrieves all the column names with "object" data type.
  - `df[col].str.strip()` removes any leading or trailing whitespace from the strings in each of these columns.
  - The code removes any leading or trailing whitespace from all text or categorical columns in the DataFrame. This is useful for cleaning data where such whitespace might cause issues with analysis or comparisons.

In [147]:
clean_data(flights)

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-03-24,Banglore,New Delhi,22:20:00,01:10:00,170,0.0,No Info,3897
1,Air India,2019-05-01,Kolkata,Banglore,05:50:00,13:15:00,445,2.0,No Info,7662
2,Jet Airways,2019-06-09,Delhi,Cochin,09:25:00,04:25:00,1140,2.0,No Info,13882
3,Indigo,2019-05-12,Kolkata,Banglore,18:05:00,23:30:00,325,1.0,No Info,6218
4,Indigo,2019-03-01,Banglore,New Delhi,16:50:00,21:35:00,285,1.0,No Info,13302
...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,2019-04-09,Kolkata,Banglore,19:55:00,22:25:00,150,0.0,No Info,4107
10679,Air India,2019-04-27,Kolkata,Banglore,20:45:00,23:20:00,155,0.0,No Info,4145
10680,Jet Airways,2019-04-27,Banglore,Delhi,08:20:00,11:20:00,180,0.0,No Info,7229
10681,Vistara,2019-03-01,Banglore,New Delhi,11:30:00,14:10:00,160,0.0,No Info,12648


In [145]:
flights_cleaned = clean_data(flights)
flights_cleaned.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-03-24,Banglore,New Delhi,22:20:00,01:10:00,170,0.0,No Info,3897
1,Air India,2019-05-01,Kolkata,Banglore,05:50:00,13:15:00,445,2.0,No Info,7662
2,Jet Airways,2019-06-09,Delhi,Cochin,09:25:00,04:25:00,1140,2.0,No Info,13882
3,Indigo,2019-05-12,Kolkata,Banglore,18:05:00,23:30:00,325,1.0,No Info,6218
4,Indigo,2019-03-01,Banglore,New Delhi,16:50:00,21:35:00,285,1.0,No Info,13302


# 6. Split the Data

In [149]:
X = flights_cleaned.drop(columns="price")
y = flights_cleaned["price"].copy()

In [152]:
# Two-stage split: first hold out 20% of all rows as the test set ...
X_, X_test, y_, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# ... then hold out 20% of the remaining rows as the validation set
# (roughly 64% train / 16% validation / 20% test overall).
X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.2, random_state=42)

# Sanity check: rows/columns of each feature subset next to its target subset.
# NOTE(review): the recorded shapes add up to 10,682 rows, i.e. the raw 10,683 minus one,
# although clean_data also calls drop_duplicates() and 220 duplicates were counted earlier.
# flights_cleaned was presumably built by an older clean_data (In [145] ran before In [146]);
# restart the kernel and run all cells before trusting these numbers or the exported files.
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(6836, 9) (6836,)
(1709, 9) (1709,)
(2137, 9) (2137,)


# 7. Export the Subsets

In [153]:
def export_data(X, y, name):
    """Save features and target together as ``<name>.csv`` and preview the file.

    Parameters
    ----------
    X : pd.DataFrame
        Feature columns.
    y : pd.Series
        Target column; joined to ``X`` on the index.
    name : str
        Output file name without the ``.csv`` extension.

    Returns
    -------
    pd.DataFrame
        The first rows of the file as read back from disk.
    """
    target_path = os.path.join(PROJECT_DIR, DATA_DIR, "{}.csv".format(name))
    combined = X.join(y)
    # The index is not written to the file.
    combined.to_csv(target_path, index=False)
    written = pd.read_csv(target_path)
    return written.head()

In [154]:
export_data(X_train, y_train, "train")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-03,Delhi,Cochin,02:15:00,04:25:00,1570,1.0,No Info,17024
1,Vistara,2019-03-24,Kolkata,Banglore,07:10:00,18:45:00,695,1.0,No Info,16932
2,Spicejet,2019-04-09,Banglore,Delhi,09:30:00,12:20:00,170,0.0,No Info,4423
3,Indigo,2019-04-27,Banglore,Delhi,21:15:00,00:15:00,180,0.0,No Info,3943
4,Air India,2019-06-12,Delhi,Cochin,09:45:00,09:25:00,1420,1.0,No Info,7480


In [155]:
export_data(X_val, y_val, "val")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-12,Kolkata,Banglore,16:30:00,12:00:00,1170,1.0,In-flight meal not included,8529
1,Jet Airways,2019-05-18,Banglore,Delhi,11:10:00,14:05:00,175,0.0,In-flight meal not included,5198
2,Multiple Carriers,2019-06-12,Delhi,Cochin,08:45:00,19:00:00,615,1.0,No Info,11789
3,Multiple Carriers,2019-04-24,Delhi,Cochin,07:10:00,16:10:00,540,1.0,In-flight meal not included,6093
4,Jet Airways,2019-03-27,Delhi,Cochin,15:05:00,04:25:00,800,1.0,No Info,12242


In [156]:
export_data(X_test, y_test, "test")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Multiple Carriers,2019-05-21,Delhi,Cochin,15:05:00,01:30:00,625,2.0,No Info,16655
1,Goair,2019-06-03,Delhi,Cochin,10:35:00,19:35:00,540,1.0,No Info,4959
2,Vistara,2019-05-09,Kolkata,Banglore,20:20:00,09:05:00,765,1.0,No Info,9187
3,Indigo,2019-05-24,Chennai,Kolkata,14:45:00,17:05:00,140,0.0,No Info,3858
4,Jet Airways,2019-05-21,Delhi,Cochin,22:50:00,04:25:00,335,1.0,In-flight meal not included,12898


In [158]:
get_data("train")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-03,Delhi,Cochin,02:15:00,04:25:00,1570,1.0,No Info,17024
1,Vistara,2019-03-24,Kolkata,Banglore,07:10:00,18:45:00,695,1.0,No Info,16932
2,Spicejet,2019-04-09,Banglore,Delhi,09:30:00,12:20:00,170,0.0,No Info,4423
3,Indigo,2019-04-27,Banglore,Delhi,21:15:00,00:15:00,180,0.0,No Info,3943
4,Air India,2019-06-12,Delhi,Cochin,09:45:00,09:25:00,1420,1.0,No Info,7480
...,...,...,...,...,...,...,...,...,...,...
6831,Indigo,2019-06-03,Banglore,Delhi,04:00:00,06:50:00,170,0.0,No Info,3943
6832,Indigo,2019-06-24,Delhi,Cochin,05:05:00,16:10:00,665,1.0,No Info,6442
6833,Air India,2019-05-09,Banglore,Delhi,10:00:00,12:45:00,165,0.0,No Info,5228
6834,Jet Airways,2019-05-27,Delhi,Cochin,07:05:00,12:35:00,330,1.0,In-flight meal not included,12898
