In [4]:
!pip install lightgbm



In [5]:
# Importing the required packages
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [6]:
# Firstly, Prepare training data

train_df = pd.read_csv("./train_data.csv")
train_df

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
1,2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
2,3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
3,4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
4,5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.00,4,10712
...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19996,Indigo,6E-6178,Bangalore,Night,one,Early_Morning,Mumbai,Economy,7.92,45,3153
19996,19997,AirAsia,I5-582,Kolkata,Morning,one,Afternoon,Delhi,Economy,5.83,24,3911
19997,19998,Vistara,UK-832,Chennai,Early_Morning,two_or_more,Evening,Bangalore,Economy,35.33,17,14822
19998,19999,Vistara,UK-996,Mumbai,Evening,one,Morning,Bangalore,Economy,16.33,21,6450


In [7]:
# Shape of the training dataframe
train_df.shape

(20000, 12)

In [8]:
# Info about training dataframe
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                20000 non-null  int64  
 1   airline           20000 non-null  object 
 2   flight            20000 non-null  object 
 3   source_city       20000 non-null  object 
 4   departure_time    20000 non-null  object 
 5   stops             20000 non-null  object 
 6   arrival_time      20000 non-null  object 
 7   destination_city  20000 non-null  object 
 8   class             20000 non-null  object 
 9   duration          20000 non-null  float64
 10  days_left         20000 non-null  int64  
 11  price             20000 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 1.8+ MB


In [9]:
# Describtion on the training dataframe
train_df.describe()

Unnamed: 0,id,duration,days_left,price
count,20000.0,20000.0,20000.0,20000.0
mean,10000.5,12.177627,25.92415,20960.2817
std,5773.647028,7.157944,13.624874,22775.459535
min,1.0,0.83,1.0,1105.0
25%,5000.75,6.83,14.0,4783.0
50%,10000.5,11.25,26.0,7425.0
75%,15000.25,16.08,38.0,42521.0
max,20000.0,38.58,49.0,114523.0


In [10]:
# Is there some NaN values in trainset. Let's fix them.
train_df.isnull().sum()

Unnamed: 0,0
id,0
airline,0
flight,0
source_city,0
departure_time,0
stops,0
arrival_time,0
destination_city,0
class,0
duration,0


In [11]:
# I found little number of NaN values, thats why I am using bfill instead of better methods
train_df.fillna(method='bfill')

  train_df.fillna(method='bfill')


Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
1,2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
2,3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
3,4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
4,5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.00,4,10712
...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19996,Indigo,6E-6178,Bangalore,Night,one,Early_Morning,Mumbai,Economy,7.92,45,3153
19996,19997,AirAsia,I5-582,Kolkata,Morning,one,Afternoon,Delhi,Economy,5.83,24,3911
19997,19998,Vistara,UK-832,Chennai,Early_Morning,two_or_more,Evening,Bangalore,Economy,35.33,17,14822
19998,19999,Vistara,UK-996,Mumbai,Evening,one,Morning,Bangalore,Economy,16.33,21,6450


In [12]:
str_columns =  ["airline", 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']

# LabelEncoder
label_encoder = LabelEncoder()


for column in str_columns:
    train_df[column] = label_encoder.fit_transform(train_df[column])

train_df

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,1,5,1214,0,1,0,5,5,1,14.25,21,7212
1,2,4,1126,3,2,2,5,4,1,1.75,7,5292
2,3,5,1245,0,4,0,2,2,0,9.58,5,60553
3,4,5,1191,4,4,0,2,3,1,6.75,28,5760
4,5,3,275,1,1,2,4,5,1,2.00,4,10712
...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19996,3,329,0,5,0,1,5,1,7.92,45,3153
19996,19997,0,999,4,4,0,0,2,1,5.83,24,3911
19997,19998,5,1235,1,1,1,2,0,1,35.33,17,14822
19998,19999,5,1309,5,2,0,4,0,1,16.33,21,6450


In [13]:
# Preparing the test data
test_df = pd.read_csv('./test_data.csv')
test_df

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
0,1,Air_India,AI-765,Kolkata,Evening,one,Night,Delhi,Business,28.25,2
1,2,Vistara,UK-747,Delhi,Early_Morning,one,Night,Mumbai,Business,13.83,34
2,3,Air_India,AI-570,Mumbai,Early_Morning,zero,Early_Morning,Chennai,Business,2.00,30
3,4,AirAsia,I5-974,Hyderabad,Night,one,Late_Night,Delhi,Economy,5.17,26
4,5,Air_India,AI-770,Kolkata,Night,one,Afternoon,Mumbai,Economy,16.33,35
...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,Air_India,AI-768,Kolkata,Afternoon,one,Morning,Bangalore,Business,17.42,15
4996,4997,Indigo,6E-6214,Kolkata,Morning,zero,Afternoon,Mumbai,Economy,3.00,40
4997,4998,Air_India,AI-402,Kolkata,Morning,one,Night,Mumbai,Business,11.17,37
4998,4999,Air_India,AI-673,Mumbai,Early_Morning,one,Night,Hyderabad,Business,13.33,38


In [14]:
# Shape of the test dataframe
test_df.shape

(5000, 11)

In [15]:
# Info about test dataframe
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                5000 non-null   int64  
 1   airline           5000 non-null   object 
 2   flight            5000 non-null   object 
 3   source_city       5000 non-null   object 
 4   departure_time    5000 non-null   object 
 5   stops             5000 non-null   object 
 6   arrival_time      5000 non-null   object 
 7   destination_city  5000 non-null   object 
 8   class             5000 non-null   object 
 9   duration          5000 non-null   float64
 10  days_left         5000 non-null   int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 429.8+ KB


In [16]:
# Describtion on the test dataframe
test_df.describe()

Unnamed: 0,id,duration,days_left
count,5000.0,5000.0,5000.0
mean,2500.5,12.328838,26.0156
std,1443.520003,7.306348,13.692409
min,1.0,0.83,1.0
25%,1250.75,6.83,14.0
50%,2500.5,11.25,26.0
75%,3750.25,16.25,38.0
max,5000.0,40.67,49.0


In [17]:
# Testset is clear out of NaN values
test_df.isnull().sum()

Unnamed: 0,0
id,0
airline,0
flight,0
source_city,0
departure_time,0
stops,0
arrival_time,0
destination_city,0
class,0
duration,0


In [18]:
# LabelEncoder
label_encoder = LabelEncoder()


for column in str_columns:
    test_df[column] = label_encoder.fit_transform(test_df[column])

test_df

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
0,1,1,452,4,2,0,5,2,0,28.25,2
1,2,5,834,2,1,0,5,5,0,13.83,34
2,3,1,401,5,1,2,1,1,0,2.00,30
3,4,0,723,3,5,0,3,2,1,5.17,26
4,5,1,456,4,5,0,0,5,1,16.33,35
...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,1,455,4,0,0,4,0,0,17.42,15
4996,4997,3,221,4,4,2,0,5,1,3.00,40
4997,4998,1,344,4,4,0,5,5,0,11.17,37
4998,4999,1,432,5,1,0,5,3,0,13.33,38


In [19]:
# Splitting the data
X_train = train_df.iloc[:, :-1]  # All rows, all columns except the last
y_train = train_df.iloc[:, -1]   # All rows, last column

# Separate features and labels for test data
X_test = test_df

In [20]:
# Training models
lgbm = LGBMClassifier()

gbm = GradientBoostingClassifier()

# Hyperparameter grid for LightBGM
lgbm_param_grid = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'num_leaves': [31, 50, 100, 131],
    'boosting_type': ['gbdt', 'dart']
}

# Hyperparameter grid for GradientBoosting
gbm_param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.8, 0.9, 1.0, 1.1]
}

In [21]:
# Initialize GridSearchCV for GradientBoosting
gbm_grid_search = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV
gbm_grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best Parameters for Gradient Boosting:", gbm_grid_search.best_params_)
print("Best Score for Gradient Boosting:", gbm_grid_search.best_score_)



KeyboardInterrupt: 

In [None]:
# Initialize GridSearchCV for LightGradientBoosting
lgbm_grid_search = GridSearchCV(estimator=lgbm, param_grid=lgbm_param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV
lgbm_grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best Parameters for LightGBM:", lgbm_grid_search.best_params_)
print("Best Score for LightGBM:", lgbm_grid_search.best_score_)

In [None]:
# Get the best models from grid search
best_lgbm = lgbm_grid_search.best_estimator_
best_gbm = gbm_grid_search.best_estimator_

# Initialize Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('lgbm', best_lgbm),
    ('gbm', best_gbm)
], voting='soft')  # 'soft' uses predicted probabilities for voting

# Train the Voting Classifier
voting_clf.fit(X_train, y_train)


In [None]:
# Make predictions with the best LightBGM model
lgbm_predictions = best_lgbm.predict(X_test)

# Make predictions with the best Gradient Boosting model
gbm_predictions = best_gbm.predict(X_test)

# Make predictions with the Voting Classifier
voting_predictions = voting_clf.predict(X_test)

In [None]:
# Evaluation metrics for LightGBM
print("\nLightGBM Evaluation:")
print("Accuracy Score:", accuracy_score(X_test, lgbm_predictions))
print("Classification Report:\n", classification_report(X_test, lgbm_predictions))
print("Cross-Val Score:", cross_val_score(best_lgbm, X_train, y_train, cv=5).mean())
print("RMSE:", mean_squared_error(X_test, lgbm_predictions, squared=False))
print("MAE:", mean_absolute_error(X_test, lgbm_predictions))

In [None]:
# Evaluation metrics for Gradient Boosting
print("\nGradient Boosting Evaluation:")
print("Accuracy Score:", accuracy_score(X_test, gbm_predictions))
print("Classification Report:\n", classification_report(X_test, gbm_predictions))
print("Cross-Val Score:", cross_val_score(best_gbm, X_train, y_train, cv=5).mean())
print("RMSE:", mean_squared_error(X_test, gbm_predictions, squared=False))
print("MAE:", mean_absolute_error(X_test, gbm_predictions))

In [None]:
# Evaluation metrics for Voting Classifier
print("\nVoting Classifier Evaluation:")
print("Accuracy Score:", accuracy_score(X_test, voting_predictions))
print("Classification Report:\n", classification_report(X_test, voting_predictions))
print("Cross-Val Score:", cross_val_score(voting_clf, X_train, y_train, cv=5).mean())
print("RMSE:", mean_squared_error(X_test, voting_predictions, squared=False))
print("MAE:", mean_absolute_error(X_test, voting_predictions))