<a href="https://colab.research.google.com/github/abinayasvam1/Tkinter-/blob/main/flight_ticket_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# for data reading and data manipulation
import numpy as np
import pandas as pd
import statistics as st

# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# for model creation and model evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [4]:
# Show every column when a wide frame is rendered (no "..." truncation).
pd.set_option('display.max_columns', None)

# Read the training CSV.
# NOTE(review): assumes Google Drive is already mounted at /content/drive;
# no mount call is visible in this part of the notebook -- confirm.
# NOTE(review): later outputs show a literal "None" category in
# holiday_season and promotion_type. pandas versions that treat the string
# "None" as a missing value would load those entries as NaN instead --
# confirm against the pandas version in use.
path = "/content/drive/MyDrive/train-2.csv"
df = pd.read_csv(path)

# Preview the first five rows.
df.head()

Unnamed: 0,Flight_ID,Airline,Departure_City,Arrival_City,Distance,Departure_Time,Arrival_Time,Duration,Aircraft_Type,Number_of_Stops,Day_of_Week,Month_of_Travel,Holiday_Season,Demand,Weather_Conditions,Passenger_Count,Promotion_Type,Fuel_Price,Flight_Price
0,F1,Airline B,,Greenshire,8286.0,8:23,20:19,11.94,Boeing 787,0,Wednesday,December,Summer,Low,Rain,240,Special Offer,0.91,643.93
1,F2,Airline C,Leonardland,New Stephen,2942.0,20:28,1:45,5.29,Airbus A320,0,Wednesday,March,Spring,Low,Rain,107,,1.08,423.13
2,F3,Airline B,South Dylanville,Port Ambermouth,2468.0,11:30,15:54,4.41,Boeing 787,1,Sunday,September,Summer,High,Cloudy,131,,0.52,442.17
3,F4,,Blakefort,Crosbyberg,3145.0,20:24,1:21,4.96,Boeing 787,0,Sunday,February,Fall,Low,Cloudy,170,Discount,0.71,394.42
4,F5,Airline B,Michaelport,Onealborough,5558.0,21:59,6:04,8.09,Boeing 737,1,Thursday,January,,,Clear,181,,1.09,804.35


In [5]:
# Size of the freshly loaded frame as a (rows, columns) tuple.
df.shape

(45000, 19)

As the column names in the DataFrame are capitalized (e.g. `Flight_ID`), I would like to rename all of them to their lower-case form,

so that later it is easier to work with the DataFrame. For this I will use a list comprehension over `df.columns`.

In [7]:
# Replace every column label with its lower-case form so later cells can
# refer to columns with one consistent spelling.
df.columns = list(map(str.lower, df.columns))

In [6]:
# List the column labels.
# NOTE: the stored output below still shows the original capitalized names,
# because this cell (In [6]) was executed before the rename cell (In [7])
# placed above it. Re-run the notebook top to bottom to refresh it.
df.columns

Index(['Flight_ID', 'Airline', 'Departure_City', 'Arrival_City', 'Distance',
       'Departure_Time', 'Arrival_Time', 'Duration', 'Aircraft_Type',
       'Number_of_Stops', 'Day_of_Week', 'Month_of_Travel', 'Holiday_Season',
       'Demand', 'Weather_Conditions', 'Passenger_Count', 'Promotion_Type',
       'Fuel_Price', 'Flight_Price'],
      dtype='object')

In [8]:
# Preview the first rows again to confirm the lower-cased column names.
df.head()

Unnamed: 0,flight_id,airline,departure_city,arrival_city,distance,departure_time,arrival_time,duration,aircraft_type,number_of_stops,day_of_week,month_of_travel,holiday_season,demand,weather_conditions,passenger_count,promotion_type,fuel_price,flight_price
0,F1,Airline B,,Greenshire,8286.0,8:23,20:19,11.94,Boeing 787,0,Wednesday,December,Summer,Low,Rain,240,Special Offer,0.91,643.93
1,F2,Airline C,Leonardland,New Stephen,2942.0,20:28,1:45,5.29,Airbus A320,0,Wednesday,March,Spring,Low,Rain,107,,1.08,423.13
2,F3,Airline B,South Dylanville,Port Ambermouth,2468.0,11:30,15:54,4.41,Boeing 787,1,Sunday,September,Summer,High,Cloudy,131,,0.52,442.17
3,F4,,Blakefort,Crosbyberg,3145.0,20:24,1:21,4.96,Boeing 787,0,Sunday,February,Fall,Low,Cloudy,170,Discount,0.71,394.42
4,F5,Airline B,Michaelport,Onealborough,5558.0,21:59,6:04,8.09,Boeing 737,1,Thursday,January,,,Clear,181,,1.09,804.35


In [9]:
# Report the frame's dimensions as a readable sentence.
n_rows, n_cols = df.shape
print(f'➤ The DataFrame(df) contains {n_rows} rows and {n_cols} columns.')

➤ The DataFrame(df) contains 45000 rows and 19 columns.


Here I can clearly see that there is a column that cannot contribute in any way to predicting the flight ticket price: 1.) flight_id

Hence it is better to drop that column before further analysis. (The cell below also drops departure_time and arrival_time; the duration column is kept.)

In [10]:
# Remove the per-row identifier and the two clock-time columns before
# further analysis.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: running it a second time,
# when the columns are already gone, is a no-op rather than a KeyError.
columns_to_drop = ['flight_id', 'departure_time', 'arrival_time']
df = df.drop(columns=columns_to_drop, errors='ignore')

In [11]:
# Confirm the new dimensions after the column drop.
n_rows, n_cols = df.shape
print(f'➤ Now the shape of the Dataframe is {n_rows} rows and {n_cols} columns.')

➤ Now the shape of the Dataframe is 45000 rows and 16 columns.


In [12]:
# Preview two rows to verify that the dropped columns are gone.
df.head(2)

Unnamed: 0,airline,departure_city,arrival_city,distance,duration,aircraft_type,number_of_stops,day_of_week,month_of_travel,holiday_season,demand,weather_conditions,passenger_count,promotion_type,fuel_price,flight_price
0,Airline B,,Greenshire,8286.0,11.94,Boeing 787,0,Wednesday,December,Summer,Low,Rain,240,Special Offer,0.91,643.93
1,Airline C,Leonardland,New Stephen,2942.0,5.29,Airbus A320,0,Wednesday,March,Spring,Low,Rain,107,,1.08,423.13


# Checking and Handling Missing Values
Checking and Handling Missing Values

In [13]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

airline               3573
departure_city         340
arrival_city           186
distance                91
duration                 0
aircraft_type           43
number_of_stops          0
day_of_week            225
month_of_travel        267
holiday_season           0
demand                 317
weather_conditions     302
passenger_count          0
promotion_type         403
fuel_price              90
flight_price             0
dtype: int64

In [14]:
# Share of missing entries per column, expressed as a percentage and shown
# as a single-row table for easy side-by-side reading.
null_pct = df.isna().mean() * 100
pd.DataFrame(null_pct).T

Unnamed: 0,airline,departure_city,arrival_city,distance,duration,aircraft_type,number_of_stops,day_of_week,month_of_travel,holiday_season,demand,weather_conditions,passenger_count,promotion_type,fuel_price,flight_price
0,7.94,0.755556,0.413333,0.202222,0.0,0.095556,0.0,0.5,0.593333,0.0,0.704444,0.671111,0.0,0.895556,0.2,0.0


Handling missing values

In [15]:
# Impute the categorical columns: every missing entry is replaced by the
# most frequent value (mode) of its own column.
categorical_cols_to_fill = [
    'airline',
    'departure_city',
    'arrival_city',
    'aircraft_type',
    'day_of_week',
    'month_of_travel',
    'demand',
    'weather_conditions',
    'promotion_type',
]

# mode() returns a Series of the most frequent value(s); [0] takes the first.
mode_fill_values = {col: df[col].mode()[0] for col in categorical_cols_to_fill}
df = df.fillna(mode_fill_values)

In [16]:
# Continuous columns: replace each missing entry with the column mean.
mean_fill_values = {col: df[col].mean() for col in ('distance', 'fuel_price')}
df = df.fillna(mean_fill_values)

In [17]:
# Re-count missing entries per column to verify the imputation above.
df.isna().sum()

airline               0
departure_city        0
arrival_city          0
distance              0
duration              0
aircraft_type         0
number_of_stops       0
day_of_week           0
month_of_travel       0
holiday_season        0
demand                0
weather_conditions    0
passenger_count       0
promotion_type        0
fuel_price            0
flight_price          0
dtype: int64

**Data Type Correction**

In [18]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

airline                object
departure_city         object
arrival_city           object
distance              float64
duration              float64
aircraft_type          object
number_of_stops         int64
day_of_week            object
month_of_travel        object
holiday_season         object
demand                 object
weather_conditions     object
passenger_count         int64
promotion_type         object
fuel_price            float64
flight_price          float64
dtype: object

In [19]:
# Preview the first rows after imputation (e.g. the previously missing
# departure_city in row 0 and airline in row 3 are now filled).
df.head()

Unnamed: 0,airline,departure_city,arrival_city,distance,duration,aircraft_type,number_of_stops,day_of_week,month_of_travel,holiday_season,demand,weather_conditions,passenger_count,promotion_type,fuel_price,flight_price
0,Airline B,Port Michael,Greenshire,8286.0,11.94,Boeing 787,0,Wednesday,December,Summer,Low,Rain,240,Special Offer,0.91,643.93
1,Airline C,Leonardland,New Stephen,2942.0,5.29,Airbus A320,0,Wednesday,March,Spring,Low,Rain,107,,1.08,423.13
2,Airline B,South Dylanville,Port Ambermouth,2468.0,4.41,Boeing 787,1,Sunday,September,Summer,High,Cloudy,131,Special Offer,0.52,442.17
3,Airline A,Blakefort,Crosbyberg,3145.0,4.96,Boeing 787,0,Sunday,February,Fall,Low,Cloudy,170,Discount,0.71,394.42
4,Airline B,Michaelport,Onealborough,5558.0,8.09,Boeing 737,1,Thursday,January,,Low,Clear,181,,1.09,804.35


In [20]:
# Dimensions after imputation; filling values does not remove any rows.
df.shape

(45000, 16)

In [21]:
# Tally how many columns share each dtype (object / float64 / int64) and
# show the tally as a single-row table.
dtype_counts = df.dtypes.value_counts()
pd.DataFrame(dtype_counts).T

Unnamed: 0,object,float64,int64
0,10,4,2


In [23]:
# Names of all columns stored as int64.
integer_data_cols = [
    name for name, dtype in df.dtypes.items() if dtype == 'int64'
]
integer_data_cols

['number_of_stops', 'passenger_count']

In [24]:
# Names of all columns stored as float64 (this includes the target,
# flight_price).
float_data_cols = [
    name for name, dtype in df.dtypes.items() if dtype == 'float64'
]
float_data_cols

['distance', 'duration', 'fuel_price', 'flight_price']

In [25]:
# Names of all columns stored as object dtype (the categorical/text columns).
object_data_cols = [
    name for name, dtype in df.dtypes.items() if dtype == 'object'
]
object_data_cols

['airline',
 'departure_city',
 'arrival_city',
 'aircraft_type',
 'day_of_week',
 'month_of_travel',
 'holiday_season',
 'demand',
 'weather_conditions',
 'promotion_type']

Since most of the columns are of object data type, it is important to know how each of them is divided into categories. Let's explore all the categorical columns and find out how many categories each one has.

In [26]:
# Print the number of distinct (non-missing) values in each object column.
for col in object_data_cols:
  n_categories = df[col].nunique()
  print(f'Column "{col}" is divided into "{n_categories}" categories.')

Column "airline" is divided into "3" categories.
Column "departure_city" is divided into "23187" categories.
Column "arrival_city" is divided into "23479" categories.
Column "aircraft_type" is divided into "5" categories.
Column "day_of_week" is divided into "7" categories.
Column "month_of_travel" is divided into "12" categories.
Column "holiday_season" is divided into "5" categories.
Column "demand" is divided into "3" categories.
Column "weather_conditions" is divided into "4" categories.
Column "promotion_type" is divided into "3" categories.


*Some of the categorical columns (notably departure_city and arrival_city) have a very large number of categories, so it is worth analyzing all of them one by one.*

In [28]:
# Frequency of each category in column "airline".
# Note: the counts include the rows whose missing airline was filled with
# the column mode in the imputation step above.
df['airline'].value_counts()

Airline A    17436
Airline C    13809
Airline B    13755
Name: airline, dtype: int64

In [29]:
# Frequency of each category in column "departure_city".
# Note: the most frequent city's count includes the rows whose missing
# departure_city was filled with the column mode in the imputation step.
df['departure_city'].value_counts()

Port Michael         389
West Michael          47
North Michael         42
South Michael         40
North David           39
                    ... 
New Jordanchester      1
North Morgan           1
North Mindy            1
Mcmillanberg           1
East Patriciafurt      1
Name: departure_city, Length: 23187, dtype: int64

In [30]:
# Frequency of each category in column "arrival_city".
# NOTE(review): the top city's count presumably includes the mode-imputed
# rows from the imputation step -- confirm.
df['arrival_city'].value_counts()

North Michael       235
Port Michael         47
Lake Michael         42
New Michael          35
Michaelmouth         35
                   ... 
Summerland            1
North Lindafurt       1
West Waynetown        1
North Keithmouth      1
Gabrielville          1
Name: arrival_city, Length: 23479, dtype: int64

In [31]:
# Frequency of each category in column "aircraft_type".
df['aircraft_type'].value_counts()

Airbus A320    9154
Boeing 777     9059
Boeing 787     8952
Airbus A380    8943
Boeing 737     8892
Name: aircraft_type, dtype: int64

In [32]:
# Frequency of each category in column "day_of_week".
df['day_of_week'].value_counts()

Sunday       6697
Saturday     6418
Friday       6414
Monday       6407
Wednesday    6406
Tuesday      6336
Thursday     6322
Name: day_of_week, dtype: int64

In [33]:
# Frequency of each category in column "month_of_travel".
df['month_of_travel'].value_counts()

January      4110
October      3812
April        3776
July         3755
November     3749
September    3731
March        3727
August       3712
December     3700
June         3672
May          3658
February     3598
Name: month_of_travel, dtype: int64

In [34]:
# Frequency of each category in column "holiday_season".
# Note: "None" appears in the output as a regular category value (this
# column reported zero missing entries), not as a missing-value marker.
df['holiday_season'].value_counts()

Spring    9036
None      9020
Fall      9010
Summer    8970
Winter    8964
Name: holiday_season, dtype: int64

In [35]:
# Frequency of each category in column "demand".
df['demand'].value_counts()

Low       29263
Medium     8954
High       6783
Name: demand, dtype: int64

In [36]:
# Frequency of each category in column "weather_conditions".
df['weather_conditions'].value_counts()

Cloudy    11711
Snow      11148
Rain      11093
Clear     11048
Name: weather_conditions, dtype: int64

In [37]:
# Frequency of each category in column "promotion_type".
# Note: "None" appears in the output as a regular category value alongside
# "Special Offer" and "Discount".
df['promotion_type'].value_counts()

Special Offer    15299
Discount         14889
None             14812
Name: promotion_type, dtype: int64

In [38]:
# Remove rows that are exact duplicates across all remaining columns.
# This runs after flight_id was dropped and after imputation, so rows are
# compared on the cleaned feature and target values only.
df = df.drop_duplicates()

In [39]:
# Dimensions after de-duplication; compare with the earlier shape to see
# whether any duplicate rows were removed.
df.shape

(45000, 16)

**Data Visualization.**