In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# sklearn imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder

In [2]:
# Source CSV holding the flight records.
# NOTE(review): this is an absolute, machine-specific path — adjust it (or
# make it relative to the project) before running on another machine.
DATA_PATH = r"E:\LLM\01Projects\airline_ai_assistant\datasets\Indian Airlines.csv"

df = pd.read_csv(DATA_PATH)
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [3]:
# Remove the 'Unnamed: 0' column (in the preview above it just mirrors the
# row number, so it looks like a saved index — it carries no feature signal).
# Re-assigning instead of inplace=True, combined with errors='ignore', keeps
# this cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Pipeline

In [4]:
# Unordered categorical columns that are expanded into one-hot indicators.
# NOTE(review): 'flight' has 1561 distinct values, so with sparse_output=False
# the encoded matrix is dense and ~1.6k columns wide — consider dropping the
# column or switching to sparse output if memory becomes a problem.
onehot_cols = ['airline', 'flight', 'source_city', 'departure_time', 'arrival_time', 'destination_city']

# 'class' and 'stops' have a natural order, so they are mapped to integers
# following the explicit category lists below.
# The columns are selected by NAME rather than by integer position: positional
# indices ([7], [4]) silently bind to the wrong column whenever the frame's
# column order differs (e.g. the 'Unnamed: 0' column was not dropped first).
# Every column not listed here is passed through unchanged.
Ordinal_encoding = ColumnTransformer([
    ('ordinal_class', OrdinalEncoder(categories=[['Economy', 'Business']]), ['class']),
    ('ordinal_stops', OrdinalEncoder(categories=[['zero', 'one', 'two_or_more']]), ['stops']),
    ('ohe_encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), onehot_cols)
], remainder='passthrough')


In [5]:
# Print column names, non-null counts and dtypes to confirm the frame's
# layout after the cleanup step.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   flight            300153 non-null  object 
 2   source_city       300153 non-null  object 
 3   departure_time    300153 non-null  object 
 4   stops             300153 non-null  object 
 5   arrival_time      300153 non-null  object 
 6   destination_city  300153 non-null  object 
 7   class             300153 non-null  object 
 8   duration          300153 non-null  float64
 9   days_left         300153 non-null  int64  
 10  price             300153 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 25.2+ MB


In [6]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,duration,days_left,price
count,300153.0,300153.0,300153.0
mean,12.221021,26.004751,20889.660523
std,7.191997,13.561004,22697.767366
min,0.83,1.0,1105.0
25%,6.83,15.0,4783.0
50%,11.25,26.0,7425.0
75%,16.17,38.0,42521.0
max,49.83,49.0,123071.0


In [7]:
# Frequency of each category in the 'class' column.
df.loc[:, 'class'].value_counts()

class
Economy     206666
Business     93487
Name: count, dtype: int64

In [8]:
# Frequency of each category in the 'stops' column; the distinct labels seen
# here are the ones listed for the ordinal encoder.
df['stops'].value_counts()

stops
one            250863
zero            36004
two_or_more     13286
Name: count, dtype: int64

In [9]:
# Single-step preprocessing pipeline: the column transformer defined earlier,
# registered under the step name "OrdinalEncoder".
pipe = Pipeline(steps=[("OrdinalEncoder", Ordinal_encoding)])

In [10]:
# pipe.fit(df)

In [11]:
# pipe.transform(df)[1]

In [12]:
# Check which column sits at positional index 4 and how its values are
# distributed.
df[df.columns[4]].value_counts()

stops
one            250863
zero            36004
two_or_more     13286
Name: count, dtype: int64

In [13]:
# (rows, columns) of the cleaned frame.
df.shape

(300153, 11)

In [14]:
# Count missing values per column.
df.isnull().sum()

airline             0
flight              0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

In [15]:
# Summarise the string-typed columns (count / unique / top / freq), with one
# row per column for easier reading.
df.describe(include='object').transpose()

Unnamed: 0,count,unique,top,freq
airline,300153,6,Vistara,127859
flight,300153,1561,UK-706,3235
source_city,300153,6,Delhi,61343
departure_time,300153,6,Morning,71146
stops,300153,3,one,250863
arrival_time,300153,6,Night,91538
destination_city,300153,6,Mumbai,59097
class,300153,2,Economy,206666


In [16]:
# Distribution summary of the 'duration' column.
df.loc[:, 'duration'].describe()

count    300153.000000
mean         12.221021
std           7.191997
min           0.830000
25%           6.830000
50%          11.250000
75%          16.170000
max          49.830000
Name: duration, dtype: float64

In [17]:
# Distribution summary of the 'days_left' column.
df.loc[:, 'days_left'].describe()

count    300153.000000
mean         26.004751
std          13.561004
min           1.000000
25%          15.000000
50%          26.000000
75%          38.000000
max          49.000000
Name: days_left, dtype: float64

In [18]:
# Label vector: the 'price' column.
y = df['price']

# Feature matrix: every other column, run through the preprocessing pipeline.
# NOTE(review): the encoders are fitted on ALL rows here, i.e. before the
# train/test split below — consider fitting on the training split only.
X = pipe.fit_transform(df.drop(columns=['price']))

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the shape of each split with the same three-line layout.
for header, split_features, split_labels in (
    ("Train dataset shape: ", X_train, y_train),
    ("Test dataset shape: ", X_test, y_test),
):
    print(header)
    print(f"Features: {split_features.shape}")
    print(f"Label: {split_labels.shape}")

Train dataset shape: 
Features: (240122, 1595)
Label: (240122,)
Test dataset shape: 
Features: (60031, 1595)
Label: (60031,)


In [21]:
# X_train.info()

In [None]:
# Hyper-parameters kept in one place: a 10-tree forest with a fixed seed so
# repeated runs build the same model.
rf_params = {"n_estimators": 10, "random_state": 42}

rf_model = RandomForestRegressor(**rf_params)
# Fit on the training split; as the cell's last expression this also
# displays the fitted estimator.
rf_model.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out test features; compared against y_test in
# the evaluation cell below.
y_pred = rf_model.predict(X_test)

In [None]:
# Only the metric functions are imported here; pandas (pd) and numpy (np) are
# already imported in the first cell, so the duplicate re-imports were dropped.
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Regression metrics of the predictions against the held-out labels.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of MSE, i.e. back in the units of the label

# One-row table so all four metrics render together.
score_matrix = pd.DataFrame({
    'R2 Score': [r2],
    'MAE': [mae],
    'MSE': [mse],
    'RMSE': [rmse]
})

display(score_matrix)