In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [34]:
df = pd.read_csv("DataSet/Clean_Dataset.csv")
df.sample(5)

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
278313,278313,Air_India,AI-507,Hyderabad,Morning,one,Evening,Mumbai,Business,8.75,19,37622
200280,200280,Air_India,AI-569,Chennai,Early_Morning,one,Night,Kolkata,Economy,17.33,26,4629
11849,11849,Vistara,UK-829,Delhi,Early_Morning,one,Evening,Bangalore,Economy,10.25,11,13314
148553,148553,Indigo,6E-563,Kolkata,Morning,zero,Morning,Chennai,Economy,2.42,8,6909
8291,8291,GO_FIRST,G8-2403,Delhi,Morning,one,Evening,Mumbai,Economy,8.0,41,5958


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        300153 non-null  int64  
 1   airline           300153 non-null  object 
 2   flight            300153 non-null  object 
 3   source_city       300153 non-null  object 
 4   departure_time    300153 non-null  object 
 5   stops             300153 non-null  object 
 6   arrival_time      300153 non-null  object 
 7   destination_city  300153 non-null  object 
 8   class             300153 non-null  object 
 9   duration          300153 non-null  float64
 10  days_left         300153 non-null  int64  
 11  price             300153 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 27.5+ MB


In [5]:
df.drop(columns=['Unnamed: 0','flight'], axis=1, inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   source_city       300153 non-null  object 
 2   departure_time    300153 non-null  object 
 3   stops             300153 non-null  object 
 4   arrival_time      300153 non-null  object 
 5   destination_city  300153 non-null  object 
 6   class             300153 non-null  object 
 7   duration          300153 non-null  float64
 8   days_left         300153 non-null  int64  
 9   price             300153 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 22.9+ MB


In [7]:
df.corr()

  df.corr()


Unnamed: 0,duration,days_left,price
duration,1.0,-0.039157,0.204222
days_left,-0.039157,1.0,-0.091949
price,0.204222,-0.091949,1.0


In [8]:
X = df.drop(columns=["price"])
y = df["price"]


In [9]:
X.sample(5)

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
246210,Vistara,Bangalore,Evening,one,Morning,Delhi,Business,13.58,19
132171,Air_India,Kolkata,Early_Morning,one,Evening,Mumbai,Economy,13.0,30
290211,Vistara,Chennai,Early_Morning,one,Afternoon,Delhi,Business,7.08,47
205212,Indigo,Chennai,Night,zero,Night,Hyderabad,Economy,1.17,33
130400,GO_FIRST,Kolkata,Early_Morning,one,Afternoon,Mumbai,Economy,6.83,20


In [10]:
y.sample(5)

111004     4337
97719      6798
241688    57453
198963    11069
275976    51875
Name: price, dtype: int64

In [11]:
categorical_cols = X.select_dtypes(include="object").columns.tolist()
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print(categorical_cols)
print("")
print(numerical_cols)

['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']

['duration', 'days_left']


In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols)
    ],
    remainder="passthrough"
)

In [13]:
# Apply transformation
processed_array = preprocessor.fit_transform(X)

# Get column names for encoded features
encoded_col_names = preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_cols)

# Combine with numerical column names
all_col_names = list(encoded_col_names) + numerical_cols

# Convert to DataFrame
processed_df = pd.DataFrame(processed_array, columns=all_col_names)

In [14]:
pd.set_option("display.max_columns", None)

In [15]:
# View first few rows
processed_df.head()

Unnamed: 0,airline_AirAsia,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,source_city_Bangalore,source_city_Chennai,source_city_Delhi,source_city_Hyderabad,source_city_Kolkata,source_city_Mumbai,departure_time_Afternoon,departure_time_Early_Morning,departure_time_Evening,departure_time_Late_Night,departure_time_Morning,departure_time_Night,stops_one,stops_two_or_more,stops_zero,arrival_time_Afternoon,arrival_time_Early_Morning,arrival_time_Evening,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,destination_city_Bangalore,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai,class_Business,class_Economy,duration,days_left
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.17,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.33,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.17,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.25,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.33,1.0


In [16]:
processed_df.columns

Index(['airline_AirAsia', 'airline_Air_India', 'airline_GO_FIRST',
       'airline_Indigo', 'airline_SpiceJet', 'airline_Vistara',
       'source_city_Bangalore', 'source_city_Chennai', 'source_city_Delhi',
       'source_city_Hyderabad', 'source_city_Kolkata', 'source_city_Mumbai',
       'departure_time_Afternoon', 'departure_time_Early_Morning',
       'departure_time_Evening', 'departure_time_Late_Night',
       'departure_time_Morning', 'departure_time_Night', 'stops_one',
       'stops_two_or_more', 'stops_zero', 'arrival_time_Afternoon',
       'arrival_time_Early_Morning', 'arrival_time_Evening',
       'arrival_time_Late_Night', 'arrival_time_Morning', 'arrival_time_Night',
       'destination_city_Bangalore', 'destination_city_Chennai',
       'destination_city_Delhi', 'destination_city_Hyderabad',
       'destination_city_Kolkata', 'destination_city_Mumbai', 'class_Business',
       'class_Economy', 'duration', 'days_left'],
      dtype='object')

In [17]:
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [19]:
model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:
import joblib

# Load the model
model = joblib.load('flight_price_model.pkl')  # or .joblib

In [27]:
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("R^2 Score:", r2)


MAE: 1077.4069235792988
RMSE: 2787.061612860242
R^2 Score: 0.9849311744900549


In [28]:
# Step 1: Get importances and transformed feature names
importances = model.named_steps["regressor"].feature_importances_
feature_names = model.named_steps["preprocessor"].get_feature_names_out()

# Step 2: Manually define mapping rules for each original feature
manual_map = {
    'airline': [col for col in feature_names if 'cat__airline_' in col],
    'source_city': [col for col in feature_names if 'cat__source_city_' in col],
    'departure_time': [col for col in feature_names if 'cat__departure_time_' in col],
    'stops': [col for col in feature_names if 'cat__stops_' in col],
    'arrival_time': [col for col in feature_names if 'cat__arrival_time_' in col],
    'destination_city': [col for col in feature_names if 'cat__destination_city_' in col],
    'class': [col for col in feature_names if 'cat__class_' in col],
    'duration': ['remainder__duration'],
    'days_left': ['remainder__days_left']
}

# Step 3: Reverse the mapping to get transformed column → original feature
column_to_feature = {}
for original, columns in manual_map.items():
    for col in columns:
        column_to_feature[col] = original

# Step 4: Map each transformed column to its original feature
original_features = [column_to_feature.get(name, name) for name in feature_names]

# Step 5: Create DataFrame of importances
importance_df = pd.DataFrame({
    "OriginalFeature": original_features,
    "Importance": importances
})

# Step 6: Aggregate importances and sort
aggregated_df = importance_df.groupby("OriginalFeature", as_index=False).sum()
top_features = aggregated_df.sort_values(by="Importance", ascending=False).reset_index(drop=True)

# Step 7: Show top features
top_features

Unnamed: 0,OriginalFeature,Importance
0,class,0.879856
1,duration,0.057787
2,days_left,0.018507
3,source_city,0.010984
4,airline,0.010832
5,destination_city,0.010735
6,arrival_time,0.005006
7,departure_time,0.004379
8,stops,0.001914


# SAVED THE MODEL

In [29]:
import joblib
joblib.dump(model, "flight_price_model.pkl")


['flight_price_model.pkl']

## User Input Based Prediction

In [30]:
df

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955
...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,Chennai,Morning,one,Evening,Hyderabad,Business,10.08,49,69265
300149,Vistara,Chennai,Afternoon,one,Night,Hyderabad,Business,10.42,49,77105
300150,Vistara,Chennai,Early_Morning,one,Night,Hyderabad,Business,13.83,49,79099
300151,Vistara,Chennai,Early_Morning,one,Evening,Hyderabad,Business,10.00,49,81585


In [33]:
for col in df.columns.tolist():
    print(df[col].unique().tolist())


['SpiceJet', 'AirAsia', 'Vistara', 'GO_FIRST', 'Indigo', 'Air_India']
['Delhi', 'Mumbai', 'Bangalore', 'Kolkata', 'Hyderabad', 'Chennai']
['Evening', 'Early_Morning', 'Morning', 'Afternoon', 'Night', 'Late_Night']
['zero', 'one', 'two_or_more']
['Night', 'Morning', 'Early_Morning', 'Afternoon', 'Evening', 'Late_Night']
['Mumbai', 'Bangalore', 'Kolkata', 'Hyderabad', 'Chennai', 'Delhi']
['Economy', 'Business']
[2.17, 2.33, 2.25, 2.08, 12.25, 16.33, 11.75, 14.5, 15.67, 3.75, 2.5, 5.83, 8.0, 6.0, 14.67, 16.17, 18.0, 23.17, 24.17, 8.83, 4.5, 15.25, 11.0, 19.08, 22.83, 26.42, 17.75, 19.58, 26.67, 15.17, 20.83, 11.42, 22.25, 26.0, 21.75, 3.83, 4.42, 7.67, 8.33, 10.42, 23.75, 19.5, 6.5, 12.42, 21.08, 28.17, 28.25, 9.25, 17.92, 7.08, 13.83, 7.58, 15.83, 24.42, 4.17, 4.25, 5.08, 29.33, 17.0, 27.17, 24.75, 5.75, 12.75, 13.75, 17.83, 5.5, 23.83, 5.0, 26.5, 12.83, 8.92, 11.17, 12.17, 15.58, 15.75, 7.92, 13.25, 16.0, 22.75, 6.33, 7.25, 30.08, 18.25, 6.08, 2.0, 12.33, 3.5, 10.25, 14.17, 25.58, 4.08,