In [1]:
!pip install xgboost

Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl (190.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.3.1+cu121 requires nvidia-cublas-cu12==12.1.3.1; platform_system == "Linux" and platform_machine == "x86_64", which is not installed.
torch 2.3.1+cu121 requires nvidia-cuda-cupti-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", which is not installed.
torch 2.3.1+cu121 requires nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", which is not installed

In [2]:
# Importing the required packages
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
# Load the training data
train_df = pd.read_csv("./train_data.csv")

# Preview the first rows only instead of rendering the whole frame
train_df.head()

FileNotFoundError: [Errno 2] No such file or directory: './train_data.csv'

In [None]:
# (rows, columns) of the training dataframe
train_df.shape

In [None]:
# Column names, non-null counts and dtypes of the training dataframe
train_df.info()

In [None]:
# Descriptive statistics of the training dataframe
train_df.describe()

In [None]:
# Categorical (string) columns that have to be turned into integer codes
str_columns = ["airline", 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']

# Keep one fitted encoder per column. Reusing a single encoder would overwrite
# its mapping on every iteration, leaving no way to apply the same codes to
# other data or to decode them back to the original labels.
label_encoders = {}

for column in str_columns:
    label_encoder = LabelEncoder()
    train_df[column] = label_encoder.fit_transform(train_df[column])
    label_encoders[column] = label_encoder

# Preview the encoded frame
train_df.head()

In [None]:
# Load the test data
test_df = pd.read_csv('./test_data.csv')

# Preview the first rows only instead of rendering the whole frame
test_df.head()

In [None]:
# (rows, columns) of the test dataframe
test_df.shape

In [None]:
# Column names, non-null counts and dtypes of the test dataframe
test_df.info()

In [None]:
# Descriptive statistics of the test dataframe
test_df.describe()

In [None]:
# Encode the test columns with the vocabulary of the TRAINING data.
# Fitting a fresh encoder on the test set can assign a different integer to the
# same category whenever the two files do not contain identical value sets,
# which would silently corrupt every prediction.
#
# train_df already holds integer codes at this point, so the original string
# values are re-read from the raw training file (categorical columns only).
train_categories = pd.read_csv("./train_data.csv", usecols=str_columns)

# Run once per fresh load of test_df: the lookup expects the raw string values.
for column in str_columns:
    label_encoder = LabelEncoder().fit(train_categories[column])
    # A value's position in classes_ is the code LabelEncoder assigns to it;
    # values that never occur in the training data are encoded as -1.
    test_df[column] = pd.Categorical(test_df[column], categories=label_encoder.classes_).codes

# Preview the encoded frame
test_df.head()

In [None]:
# Splitting the training data: the last column is the target
X_train = train_df.iloc[:, :-1]  # All rows, all columns except the last
y_train = train_df.iloc[:, -1]   # All rows, last column

# Separate features and labels for test data
target_column = train_df.columns[-1]
if target_column in test_df.columns:
    y_test = test_df[target_column]
    X_test = test_df.drop(columns=target_column)
else:
    # Unlabelled test file: predictions are possible, evaluation is not
    y_test = None
    X_test = test_df

# Same feature columns, in the same order, as the models are trained on
X_test = X_test[X_train.columns]

In [None]:
# Base estimators to be tuned
knn = KNeighborsClassifier()

# Fixed seed so that repeated runs give the same boosting results; the grid
# below includes subsample values < 1.0, which sample the training rows randomly.
RANDOM_STATE = 42
gbm = GradientBoostingClassifier(random_state=RANDOM_STATE)

# Hyperparameter grid for KNN
knn_param_grid = {
    'n_neighbors': range(1, 25),        # k = 1 .. 24
    'weights': ['uniform', 'distance'],
    'p': [1, 2]                         # Minkowski power: 1 = Manhattan, 2 = Euclidean
}

# Hyperparameter grid for GradientBoosting
gbm_param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0]
}

In [None]:
# Initialize GridSearchCV for KNN (5-fold CV, scored by accuracy).
# n_jobs=-1 evaluates the candidates in parallel on all available CPU cores.
knn_grid_search = GridSearchCV(
    estimator=knn,
    param_grid=knn_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Fit GridSearchCV
knn_grid_search.fit(X_train, y_train)

# Best parameters and best mean cross-validated score
print("Best Parameters for KNN:", knn_grid_search.best_params_)
print("Best Score for KNN:", knn_grid_search.best_score_)

In [None]:
# Initialize GridSearchCV for GradientBoosting (5-fold CV, scored by accuracy).
# n_jobs=-1 evaluates the candidates in parallel on all available CPU cores.
gbm_grid_search = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Fit GridSearchCV
gbm_grid_search.fit(X_train, y_train)

# Best parameters and best mean cross-validated score
print("Best Parameters for Gradient Boosting:", gbm_grid_search.best_params_)
print("Best Score for Gradient Boosting:", gbm_grid_search.best_score_)

In [None]:
# Tuned base models selected by the two grid searches
best_knn = knn_grid_search.best_estimator_
best_gbm = gbm_grid_search.best_estimator_

# Members of the ensemble as (name, estimator) pairs
ensemble_members = [('knn', best_knn), ('gbm', best_gbm)]

# 'soft' voting combines the members' predicted class probabilities
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

# Fit the ensemble on the training data
voting_clf.fit(X_train, y_train)


In [None]:
# Predict the test set with, in order: the tuned KNN model, the tuned
# Gradient Boosting model and the Voting Classifier
knn_predictions, gbm_predictions, voting_predictions = (
    model.predict(X_test) for model in (best_knn, best_gbm, voting_clf)
)

In [None]:
# Evaluation metrics for KNN
print("KNN Evaluation:")
# Predictions are scored against the true test labels (y_test); the feature
# matrix X_test is not a valid y_true.
print("Accuracy Score:", accuracy_score(y_test, knn_predictions))
print("Classification Report:\n", classification_report(y_test, knn_predictions))
print("Cross-Val Score:", cross_val_score(best_knn, X_train, y_train, cv=5).mean())
# RMSE computed as sqrt(MSE): the `squared=False` keyword is not accepted by
# recent scikit-learn releases.
print("RMSE:", np.sqrt(mean_squared_error(y_test, knn_predictions)))
print("MAE:", mean_absolute_error(y_test, knn_predictions))

In [None]:
# Evaluation metrics for Gradient Boosting
print("\nGradient Boosting Evaluation:")
print("Accuracy Score:", accuracy_score(y_test, gbm_predictions))
print("Classification Report:\n", classification_report(y_test, gbm_predictions))
print("Cross-Val Score:", cross_val_score(best_gbm, X_train, y_train, cv=5).mean())
# RMSE computed as sqrt(MSE): the `squared=False` keyword is not accepted by
# recent scikit-learn releases.
print("RMSE:", np.sqrt(mean_squared_error(y_test, gbm_predictions)))
print("MAE:", mean_absolute_error(y_test, gbm_predictions))

In [None]:
# Evaluation metrics for Voting Classifier
print("\nVoting Classifier Evaluation:")
# Predictions are scored against the true test labels (y_test); the feature
# matrix X_test is not a valid y_true.
print("Accuracy Score:", accuracy_score(y_test, voting_predictions))
print("Classification Report:\n", classification_report(y_test, voting_predictions))
print("Cross-Val Score:", cross_val_score(voting_clf, X_train, y_train, cv=5).mean())
# RMSE computed as sqrt(MSE): the `squared=False` keyword is not accepted by
# recent scikit-learn releases.
print("RMSE:", np.sqrt(mean_squared_error(y_test, voting_predictions)))
print("MAE:", mean_absolute_error(y_test, voting_predictions))